In [None]:
#Importin libraries

In [5]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import re
import emoji

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression  
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [46]:
from sklearn.svm import LinearSVC  # <-- Added SVM
from sklearn.decomposition import PCA

In [8]:
train_df = pd.read_csv("twitter_training.csv", header=None)
val_df = pd.read_csv("twitter_validation.csv", header=None)

In [9]:
print("Training shape:", train_df.shape)
print("Validation shape:", val_df.shape)

Training shape: (74682, 4)
Validation shape: (1000, 4)


In [10]:
train_df.columns = ['tweet_id', 'entity', 'sentiment', 'content']
val_df.columns = ['tweet_id', 'entity', 'sentiment', 'content']

In [11]:
print("\nColumn Names:")
print(train_df.columns)


Column Names:
Index(['tweet_id', 'entity', 'sentiment', 'content'], dtype='object')


In [12]:
print("\nTraining Data Head:")
print(train_df.head())


Training Data Head:
   tweet_id       entity sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                             content  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [13]:
#Pre-process data

In [14]:
print(f"\nDuplicates in training data: {train_df.duplicated().sum()}")
train_df = train_df.drop_duplicates().reset_index(drop=True)

print(f"\nNaN values in training data:\n{train_df.isna().sum()}")
train_df = train_df.dropna().reset_index(drop=True)


Duplicates in training data: 2700

NaN values in training data:
tweet_id       0
entity         0
sentiment      0
content      326
dtype: int64


In [15]:
def preprocess_text(text):
    text = str(text).lower()                                # lowercase
    text = re.sub(r"http\\S+|www\\S+|https\\S+", '', text)     # remove urls
    text = re.sub(r"@\\w+|#\\w+", '', text)                   # remove mentions & hashtags
    text = re.sub(r"[0-9]+", '', text)                      # remove numbers
    text = re.sub(r"[^\\w\\s]", '', text)                     # remove punctuation
    text = emoji.replace_emoji(text, replace='')            # remove emojis
    text = re.sub(r"\\s+", ' ', text).strip()                # remove extra spaces
    return text

In [16]:
print("\nPreprocessing text...")
train_df['content'] = train_df['content'].apply(preprocess_text)
val_df['content']   = val_df['content'].apply(preprocess_text)

print("Preprocessing complete.")
print(train_df.head())


Preprocessing text...
Preprocessing complete.
   tweet_id       entity sentiment content
0      2401  Borderlands  Positive      sw
1      2401  Borderlands  Positive      sw
2      2401  Borderlands  Positive      sw
3      2401  Borderlands  Positive      sw
4      2401  Borderlands  Positive      sw


In [17]:
#Label encoding

In [18]:
label_encoder = LabelEncoder()

In [19]:
train_df['sentiment'] = label_encoder.fit_transform(train_df['sentiment'])
val_df['sentiment'] = label_encoder.transform(val_df['sentiment'])

In [38]:
train_df.head()

Unnamed: 0,tweet_id,entity,sentiment,content
0,2401,Borderlands,3,sw
1,2401,Borderlands,3,sw
2,2401,Borderlands,3,sw
3,2401,Borderlands,3,sw
4,2401,Borderlands,3,sw


In [20]:
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print(f"\nLabel classes: {label_encoder.classes_}")


Label classes: ['Irrelevant' 'Negative' 'Neutral' 'Positive']


In [21]:
tfidf = TfidfVectorizer(max_features=5000)

In [40]:
print("\nFitting TF-IDF vectorizer...")
x_train_tfidf = tfidf.fit_transform(train_df['content']) # .toarray() is not needed for SVM/LogReg
x_val_tfidf = tfidf.transform(val_df['content'])

print(f"TF-IDF training data shape: {x_train_tfidf.shape}")


Fitting TF-IDF vectorizer...
TF-IDF training data shape: (71656, 5000)


In [41]:
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# --- Prepare labels for sklearn model ---
y_train = train_df['sentiment'].values
y_val = val_df['sentiment'].values

In [42]:
#Model Training - Logistic Regression

In [43]:
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

In [44]:
# Train the model
log_reg_model.fit(x_train_tfidf, y_train)

print("Logistic Regression training complete.")

# Get predictions
y_pred_log_reg = log_reg_model.predict(x_val_tfidf)

Logistic Regression training complete.


In [45]:
#Model Training - SVM

In [47]:
svm_model = LinearSVC(random_state=42, dual=True)

In [48]:
# Train the model
svm_model.fit(x_train_tfidf, y_train)

print("SVM training complete.")

# Get predictions
y_pred_svm = svm_model.predict(x_val_tfidf)

SVM training complete.


In [49]:
#Evalutaion

In [50]:
print("\n--- Classification Report (Logistic Regression) ---")
print(classification_report(y_val, y_pred_log_reg, target_names=label_encoder.classes_))


--- Classification Report (Logistic Regression) ---
              precision    recall  f1-score   support

  Irrelevant       0.54      0.08      0.14       172
    Negative       0.33      0.67      0.44       266
     Neutral       0.49      0.28      0.36       285
    Positive       0.39      0.38      0.38       277

    accuracy                           0.38      1000
   macro avg       0.44      0.35      0.33      1000
weighted avg       0.43      0.38      0.35      1000



In [51]:
print("\n--- Classification Report (SVM / LinearSVC) ---")
print(classification_report(y_val, y_pred_svm, target_names=label_encoder.classes_))


--- Classification Report (SVM / LinearSVC) ---
              precision    recall  f1-score   support

  Irrelevant       0.54      0.08      0.14       172
    Negative       0.33      0.67      0.44       266
     Neutral       0.49      0.28      0.36       285
    Positive       0.39      0.38      0.38       277

    accuracy                           0.38      1000
   macro avg       0.44      0.35      0.33      1000
weighted avg       0.43      0.38      0.35      1000



In [52]:
# --- Comparison ---
acc_log_reg = accuracy_score(y_val, y_pred_log_reg)
acc_svm = accuracy_score(y_val, y_pred_svm)

print("\n--- Accuracy Comparison ---")
print(f"Logistic Regression Accuracy: {acc_log_reg:.4f}")
print(f"SVM (LinearSVC) Accuracy:   {acc_svm:.4f}")


--- Accuracy Comparison ---
Logistic Regression Accuracy: 0.3780
SVM (LinearSVC) Accuracy:   0.3780


In [58]:
if acc_svm > acc_log_reg:
    print("\nSVM (LinearSVC) is the more accurate model.")
    # Save the better model
    best_model = svm_model
    with open("sklearn_svm_model.pkl", "wb") as f:
        pickle.dump(best_model, f)
else:
    print("\nLogistic Regression is the more accurate model.")
    best_model = log_reg_model
    with open("sklearn_logistic_model.pkl", "wb") as f:
        pickle.dump(best_model, f)


Logistic Regression is the more accurate model.


In [57]:
print("\n--- Testing the winning model on new text... ---")
with open("sklearn_svm_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

texts = [
    "I love this product!",
    "This is terrible.",
    "The game is okay, nothing special.",
    "This has nothing to do with anything."
]


--- Testing the winning model on new text... ---


FileNotFoundError: [Errno 2] No such file or directory: 'sklearn_svm_model.pkl'

In [56]:
preprocessed_texts = [preprocess_text(t) for t in texts]
x_input = tfidf.transform(preprocessed_texts)
y_pred = loaded_model.predict(x_input)
predicted_labels = label_encoder.inverse_transform(y_pred)

print("\n--- Test Results ---")
for text, label in zip(texts, predicted_labels):
    print(f"Text: '{text}'\nPredicted: {label}\n")

NameError: name 'loaded_model' is not defined