# Reading the data

In [27]:
import pandas as pd

# Read the tweet dataset from a path relative to the working directory.
df = pd.read_csv("Twitter_Data.csv")
# Show the first rows as this cell's output.
df.head()

Unnamed: 0,text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [28]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(162980, 2)

In [29]:
# Number of rows per label value, to see how balanced the classes are.
df["category"].value_counts()

category
 1.0    72250
 0.0    55213
-1.0    35510
Name: count, dtype: int64

# Removing null values

In [30]:
# Count the missing entries in each column.
df.isnull().sum()

text        4
category    7
dtype: int64

In [31]:
# Discard every row that has a missing value in any column.
df = df.dropna(how="any")

# Preprocessing 

In [32]:
import spacy

# Load the "en_core_web_lg" pipeline once; later cells reuse `nlp` both for
# cleaning the text and for computing document vectors.
nlp = spacy.load("en_core_web_lg")

In [33]:
def preprocess(text):
    """Return `text` reduced to its lemmas, joined by single spaces.

    The text is parsed with the notebook-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded; every remaining
    token contributes its `lemma_`.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [34]:
# Run preprocess on every tweet; each call goes through the spaCy pipeline.
df['clean_text'] = df['text'].map(preprocess)

In [None]:
# Preview the frame again to inspect the new clean_text column.
df.head()

# Training the model

In [None]:
# Store one spaCy document vector per row, computed from the cleaned text.
df['vector'] = df['clean_text'].apply(lambda cleaned: nlp(cleaned).vector)

In [None]:
# Preview the frame again to inspect the new vector column.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable. Features are the per-row vectors, labels the category column.
X_train, X_test, y_train, y_test = train_test_split(
    df["vector"].values,
    df["category"],
    random_state=2024,
    test_size=0.2,
)

In [None]:
# NOTE(review): X_train comes from df.vector.values, which holds one vector
# object per row, so this presumably reports (n_rows,) rather than
# (n_rows, n_dims) -- confirm when the cell is run.
X_train.shape

In [None]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

# MultinomialNB cannot be fitted on the raw features directly:
#   * X_train is a 1-D object array holding one spaCy vector per row, whereas
#     scikit-learn estimators expect a 2-D (n_samples, n_features) matrix;
#   * spaCy embeddings contain negative components, which MultinomialNB.fit
#     rejects.
# Both fixes live inside the pipeline, so `clf.fit` and `clf.predict` accept
# the arrays exactly as train_test_split returned them.
clf = make_pipeline(
    FunctionTransformer(np.stack),  # array of per-row vectors -> 2-D matrix
    MinMaxScaler(clip=True),        # map each feature to [0, 1]; clip unseen data
    MultinomialNB(),
)
clf.fit(X_train, y_train)

# Evaluating the model

In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out rows, then print per-class precision,
# recall and F1 against the true labels.
y_pred = clf.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)