In [2]:
# 1. Import libraries
import pandas as pd
import re
import string
import joblib


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# Step 2: Load dataset (NO HEADER in CSV)
df = pd.read_csv('twitter_training.csv', header=None)

# Assign proper column names manually
df.columns = ['tweet_id', 'entity', 'sentiment', 'text']

# Keep only the needed columns
df = df[['text', 'sentiment']]
df.dropna(inplace=True)  # Remove any rows with missing values

# Optional: See a preview
df.head()




Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [4]:
# Step 3: Clean the tweet text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)         # Remove mentions
    text = re.sub(r'#', '', text)                      # Remove hashtags
    text = re.sub(r'RT[\s]+', '', text)                # Remove retweet
    text = re.sub(r'https?:\/\/\S+', '', text)         # Remove URLs
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()           # Normalize whitespace
    return text

df['cleaned_text'] = df['text'].apply(clean_text)


In [5]:
# Step 4: Prepare the dataset
X = df['cleaned_text']
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [6]:
# Step 5: Train Logistic Regression with higher max_iter
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)



In [7]:
# Step 6: Evaluate the model
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6761486486486487

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.68      0.49      0.57      2696
    Negative       0.70      0.78      0.74      4380
     Neutral       0.64      0.62      0.63      3605
    Positive       0.67      0.74      0.70      4119

    accuracy                           0.68     14800
   macro avg       0.67      0.66      0.66     14800
weighted avg       0.68      0.68      0.67     14800



In [9]:
# Step 7: Save the model and vectorizer
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [15]:
# Load model and vectorizer
model = joblib.load('sentiment_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Predict function
def predict_sentiment(tweet):
    cleaned = clean_text(tweet)
    vec = vectorizer.transform([cleaned])
    prediction = model.predict(vec)
    return prediction[0]

# Example
print(predict_sentiment(" this fucking game!"))
print(predict_sentiment("This app is fucking terrible."))


Negative
Negative
