# Reading the data

In [1]:
import pandas as pd

# Read the tweet dataset; the path is relative to the notebook's working directory.
DATA_PATH = "Twitter_Data.csv"
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [2]:
# (rows, columns) of the frame exactly as loaded from the CSV
df.shape

(162980, 2)

# Removing null values and balancing the dataset

In [3]:
# Count the missing values in each column
df.isna().sum()

text        4
category    7
dtype: int64

In [4]:
# Drop every row that has a missing value in any column
df = df.dropna()

In [5]:
# Number of rows for each distinct `category` value, to check the class balance
df.category.value_counts()

category
 1.0    72249
 0.0    55211
-1.0    35509
Name: count, dtype: int64

In [6]:
# Handle class imbalance by undersampling: every class is cut down to the
# size of the smallest one so all three classes end up equally represented.

# Derive the target size from the data instead of hard-coding a number copied
# from an earlier output, so the cell stays correct (and `.sample` cannot ask
# for more rows than a class has) if the CSV or the cleaning steps change.
min_samples = int(df.category.value_counts().min())

# A fixed random_state keeps each sampled subset reproducible across runs.
df_positive = df[df.category == 1].sample(min_samples, random_state=2024)
df_negative = df[df.category == -1].sample(min_samples, random_state=2024)
df_neutral = df[df.category == 0].sample(min_samples, random_state=2024)

In [7]:
# Stack the three equally sized class samples back into a single frame.
balanced_parts = [df_positive, df_negative, df_neutral]
df = pd.concat(balanced_parts)

# Confirm that every class now has the same number of rows.
df["category"].value_counts()

category
 1.0    35509
-1.0    35509
 0.0    35509
Name: count, dtype: int64

In [8]:
# Preview the balanced frame; rows keep the index labels they had before sampling
df.head()

Unnamed: 0,text,category
144778,his only strategy seems blame modikcrjagan sho...,1.0
160100,mckinsey report digitizing economy the world u...,1.0
33077,the nation saw one socalled ghatiyabandhan 199...,1.0
84728,these magic healers are supported and promoted...,1.0
124219,lol congress jittery after norah arrest denied...,1.0


# Preprocessing 

In [9]:
import spacy

# Load the "en_core_web_lg" spaCy pipeline; `nlp` is used further down both to
# clean the text and to produce the document vectors.
nlp = spacy.load("en_core_web_lg")

In [10]:
def preprocess(text):
    """Return `text` reduced to the space-joined lemmas of its content tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Stop words,
    punctuation and whitespace-only tokens are dropped; every remaining token
    is replaced by its lemma.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The kept lemmas joined by single spaces; an empty string when no
        token survives the filter.
    """
    doc = nlp(text)

    # Whitespace tokens (spaCy emits them for runs of extra spaces, tabs and
    # newlines) carry no meaning; keeping them would inject blank "words"
    # into the cleaned text.
    filtered_tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return " ".join(filtered_tokens)

In [11]:
# Clean every row's text; `preprocess` calls the spaCy pipeline once per row
df['clean_text'] = df['text'].apply(preprocess)

In [12]:
# Compare the raw `text` with the new `clean_text` column
df.head()

Unnamed: 0,text,category,clean_text
144778,his only strategy seems blame modikcrjagan sho...,1.0,strategy blame modikcrjagan positive vision fu...
160100,mckinsey report digitizing economy the world u...,1.0,mckinsey report digitize economy world stand s...
33077,the nation saw one socalled ghatiyabandhan 199...,1.0,nation see socalle ghatiyabandhan 1996 agenda ...
84728,these magic healers are supported and promoted...,1.0,magic healer support promote call atheist part...
124219,lol congress jittery after norah arrest denied...,1.0,lol congress jittery norah arrest deny bail pr...


# Training the model

In [13]:
# Embed each cleaned text as its spaCy document vector.
# `nlp.make_doc` only tokenizes, skipping the trained pipeline components
# that `nlp(x)` would run on every row. `Doc.vector` is the average of the
# tokens' static vectors from the model's vocabulary, so the embedding is the
# same while the per-row cost is far lower.
df['vector'] = df['clean_text'].apply(lambda x: nlp.make_doc(x).vector)

In [14]:
# Preview the frame with the new `vector` column
df.head()

Unnamed: 0,text,category,clean_text,vector
144778,his only strategy seems blame modikcrjagan sho...,1.0,strategy blame modikcrjagan positive vision fu...,"[-0.28563753, 1.0427216, -0.7843717, 0.6185192..."
160100,mckinsey report digitizing economy the world u...,1.0,mckinsey report digitize economy world stand s...,"[0.35584846, -0.5865021, -0.96440154, 1.425723..."
33077,the nation saw one socalled ghatiyabandhan 199...,1.0,nation see socalle ghatiyabandhan 1996 agenda ...,"[-0.46112245, 1.043733, -0.979719, 1.2943965, ..."
84728,these magic healers are supported and promoted...,1.0,magic healer support promote call atheist part...,"[-0.4699736, 0.6226154, -2.1400151, 0.41911468..."
124219,lol congress jittery after norah arrest denied...,1.0,lol congress jittery norah arrest deny bail pr...,"[-0.7336413, 0.66533625, -1.9357251, 0.4713708..."


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
features = df["vector"].values
labels = df["category"]

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=2024
)

In [16]:
# X_train comes from `df.vector.values`: a 1-D array with one vector object per row
X_train.shape

(85221,)

In [17]:
# X_train and X_test are 1-D arrays whose elements are themselves vectors,
# so stack those vectors into 2-D matrices with one row per sample.

import numpy as np

X_train_2d, X_test_2d = (np.stack(split) for split in (X_train, X_test))

In [18]:
# After stacking: one row per training sample, one column per vector component
X_train_2d.shape

(85221, 300)

In [19]:
# Rescale every embedding dimension to [0, 1] so the training matrix contains
# no negative values. The scaler learns its min/max from the training split
# only and reuses them for the test split, so test values can fall slightly
# outside [0, 1].

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train_2d)
scaled_train_embed = scaler.transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the scaled training embeddings
clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train) 

# Evaluating the model

In [21]:
from sklearn.metrics import classification_report

# Predict on the held-out split and print per-class precision / recall / F1.
y_pred = clf.predict(scaled_test_embed)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(report)

              precision    recall  f1-score   support

        -1.0       0.52      0.49      0.50      7108
         0.0       0.49      0.59      0.54      7078
         1.0       0.52      0.44      0.48      7120

    accuracy                           0.51     21306
   macro avg       0.51      0.51      0.50     21306
weighted avg       0.51      0.51      0.50     21306



# Saving the model

In [22]:
from joblib import dump

# Persist the fitted scaler alongside the classifier: the model was trained on
# MinMax-scaled embeddings, so anything loading 'model.joblib' for inference
# needs the same fitted scaler to prepare its inputs.
dump(scaler, 'scaler.joblib')

# Keep the classifier dump as the last expression so the cell still displays
# ['model.joblib']; the contents of that file are unchanged.
dump(clf, 'model.joblib')

['model.joblib']