In [1]:
# 1. IMPORTS & INSTALLATIONS

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Install necessary libraries (safe if already installed)
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------- ------------------------------ 3.1/12.8 MB 17.9 MB/s eta 0:00:01
     ------------------------- -------------- 8.1/12.8 MB 21.2 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 21.4 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 20.4 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# 2. LOAD TRAINING DATASET (NO HEADERS IN YOUR CSV)

columns = ["id", "entity", "sentiment", "tweet"]

df = pd.read_csv("data/twitter_training.csv", names=columns)

print("Shape:", df.shape)
df.head()
df.info()

Shape: (74682, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  int64 
 1   entity     74682 non-null  object
 2   sentiment  74682 non-null  object
 3   tweet      73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [3]:

# 3. FIX MISSING VALUES (AVOID spaCy ERRORS)

df['tweet'] = df['tweet'].fillna("").astype(str)


In [4]:
# 4. LABEL ENCODING (sentiment → numbers)

le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

df[['sentiment', 'sentiment_encoded']].head()

Unnamed: 0,sentiment,sentiment_encoded
0,Positive,3
1,Positive,3
2,Positive,3
3,Positive,3
4,Positive,3


In [5]:
# 5. PREPROCESSING USING SPACY

import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Clean text, remove stopwords & punctuation, apply lemmatization."""
    if not isinstance(text, str):
        text = str(text)

    doc = nlp(text)
    filtered_tokens = []
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens)

# Apply preprocessing (this step takes 3–5 minutes)
df['clean_text'] = df['tweet'].apply(preprocess)
df.head()


Unnamed: 0,id,entity,sentiment,tweet,sentiment_encoded,clean_text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,3,m get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,3,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,3,m get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,3,m come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,3,m get borderland 2 murder


In [6]:
# 6. TRAIN–TEST SPLIT

X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['sentiment_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment_encoded']
)

print("Training samples:", X_train.shape)
print("Testing samples:", X_test.shape)

Training samples: (59745,)
Testing samples: (14937,)


In [7]:
# 7. MODEL 1: NAIVE BAYES

clf_nb = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

clf_nb.fit(X_train, y_train)
y_pred_nb = clf_nb.predict(X_test)

print("\n===== Naive Bayes Results =====")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


===== Naive Bayes Results =====
Accuracy: 0.7256477204257883
              precision    recall  f1-score   support

           0       0.94      0.46      0.61      2598
           1       0.65      0.89      0.75      4509
           2       0.82      0.63      0.71      3664
           3       0.71      0.80      0.75      4166

    accuracy                           0.73     14937
   macro avg       0.78      0.69      0.71     14937
weighted avg       0.76      0.73      0.72     14937



In [8]:
# 8. MODEL 2: RANDOM FOREST

clf_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

print("\n===== Random Forest Results =====")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


===== Random Forest Results =====
Accuracy: 0.9064738568655017
              precision    recall  f1-score   support

           0       0.97      0.84      0.90      2598
           1       0.93      0.92      0.92      4509
           2       0.85      0.93      0.89      3664
           3       0.90      0.91      0.91      4166

    accuracy                           0.91     14937
   macro avg       0.91      0.90      0.90     14937
weighted avg       0.91      0.91      0.91     14937



In [9]:
# 9. LOAD VALIDATION DATASET

test_df = pd.read_csv("data/twitter_validation.csv", names=columns)
test_df['tweet'] = test_df['tweet'].fillna("").astype(str)
test_df['clean_text'] = test_df['tweet'].apply(preprocess)

test_df.head()

Unnamed: 0,id,entity,sentiment,tweet,clean_text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...,mention Facebook struggle motivation run day t...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,BBC News Amazon boss Jeff Bezos reject claim c...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,@Microsoft pay WORD function poorly @samsungu ...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking closet hacking truly awful game
4,4433,Google,Neutral,Now the President is slapping Americans in the...,President slap Americans face commit unlawful ...


In [10]:

# 10. SAMPLE PREDICTION USING RANDOM FOREST

sample_text = test_df['tweet'][10]
print("\nTweet:", sample_text)
print("Original Sentiment:", test_df['sentiment'][10])

processed = [preprocess(sample_text)]
pred = clf_rf.predict(processed)

print("Predicted Sentiment:", le.inverse_transform(pred)[0])


Tweet: The professional dota 2 scene is fucking exploding and I completely welcome it.

Get the garbage out.
Original Sentiment: Positive
Predicted Sentiment: Positive
