In [46]:

import re
import pandas as pd


In [150]:
# Load the labelled comments and keep only the rows from position 12000 onward.
df = pd.read_csv("cleaned_labeled_data.csv").iloc[12000:]

# Leave the frame as the last expression so the notebook renders a preview.
df

Unnamed: 0,comments,labels
12000,Anyone 9 may 2025😂😂❤,POSITIVE
12001,The way you explained this is SO clear. Thank you,POSITIVE
12002,Anyone 9 may 2025,POSITIVE
12003,Listen for the last time😂😂. All Pakistani chan...,NEGATIVE
12004,May 2025?😂,POSITIVE
...,...,...
33907,I have no nostalgia for this and I still hate it.,NEGATIVE
33908,"Ngl, ninja being the bus driver kinda slaps",NEGATIVE
33909,2025 here? 🎉,POSITIVE
33910,20m dislikes,NEGATIVE


In [151]:
# Count how many rows carry each label value.
df['labels'].value_counts()

labels
POSITIVE    12771
NEGATIVE     9141
Name: count, dtype: int64

In [154]:
# Descriptive summary of df's columns (count / unique / top / freq in the recorded output).
df.describe()

Unnamed: 0,comments,labels
count,21912,21912
unique,21106,2
top,LFG,POSITIVE
freq,38,12771


In [155]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21912 entries, 12000 to 33911
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  21912 non-null  object
 1   labels    21912 non-null  object
dtypes: object(2)
memory usage: 342.5+ KB


In [156]:
# Number of missing values in each column.
df.isna().sum()

comments    0
labels      0
dtype: int64

In [157]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary first, then replace the text labels with their integer codes.
# NOTE(review): df['labels'] is overwritten in place, so re-running this cell would refit
# the encoder on the already-encoded integers instead of the original strings.
encoder = LabelEncoder().fit(df['labels'])
df['labels'] = encoder.transform(df['labels'])

In [158]:
# Label strings known to the encoder; the position of each entry is its integer code.
encoder.classes_

array(['NEGATIVE', 'POSITIVE'], dtype=object)

In [159]:

def clean_comment(comment):
    """Normalise one raw comment into a lowercase, space-separated string.

    Steps, in order: replace every non-word character with a space, drop
    isolated single ASCII letters (in the middle and at the start of the
    text), collapse whitespace runs into one space, strip a leading
    standalone ``b`` and lowercase the result.

    Args:
        comment: The raw comment; it is coerced with ``str()`` first, so
            non-string values are processed via their string form.

    Returns:
        The cleaned, lowercased string. Leading or trailing single spaces
        may remain; no final strip is applied.
    """
    # Remove all the special characters
    text = re.sub(r'\W', ' ', str(comment))

    # Remove all single characters surrounded by whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove a single character from the start. The anchor must be a bare `^`:
    # the earlier `\^` matched a literal caret, which can never be present here
    # because the `\W` pass above has already turned every caret into a space.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Substituting multiple spaces with single space
    # (no flags: case-insensitivity has no meaning for a whitespace pattern)
    text = re.sub(r'\s+', ' ', text)

    # Removing prefixed 'b' (kept for parity with the original pipeline)
    text = re.sub(r'^b\s+', '', text)

    # Converting to Lowercase
    return text.lower()


# Cleaned text for every comment, in the same order as df['comments'].
processed_comments = [clean_comment(comment) for comment in df['comments']]
     

In [160]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure the stop-word corpus is available locally before it is first read.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shrisha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [161]:
# TF-IDF features over the cleaned comments: at most 5000 terms, English stop words removed.
# NOTE(review): the vectorizer is fitted on every comment before the train/test split that
# happens in a later cell, so the vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer(max_features=5000, stop_words=stopwords.words('english'))

# Keep the SciPy sparse matrix that fit_transform returns instead of calling .toarray():
# a dense float64 copy of ~21912 rows x up to 5000 columns needs roughly 0.9 GB, while
# train_test_split, RandomForestClassifier and SVC all accept sparse input directly.
# NOTE(review): this rebinds processed_comments from a list of strings to the feature
# matrix, so the cell cannot be re-run without re-running the cleaning cell first.
processed_comments = vectorizer.fit_transform(processed_comments)

In [162]:
# Feature matrix and target labels used by the train/test split.
x, y = processed_comments, df['labels']

In [163]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=25,
    test_size=0.2,
)

In [164]:

from sklearn.ensemble import RandomForestClassifier

# Random forest of 120 trees with a fixed seed, trained on the training split.
rf_classifier = RandomForestClassifier(
    random_state=25,
    n_estimators=120,
)
# Left as the last expression so the notebook shows the fitted estimator.
rf_classifier.fit(x_train, y_train)

In [166]:
# Predict labels for the held-out test features with the fitted random forest.
rf_predictions = rf_classifier.predict(x_test)

In [167]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print the confusion matrix, the per-class report and the overall accuracy, in that
# order, for the random forest's test-set predictions.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, rf_predictions))

[[1161  698]
 [ 413 2111]]
              precision    recall  f1-score   support

           0       0.74      0.62      0.68      1859
           1       0.75      0.84      0.79      2524

    accuracy                           0.75      4383
   macro avg       0.74      0.73      0.73      4383
weighted avg       0.75      0.75      0.74      4383

0.7465206479580196


In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier; max_iter=-1 places no cap on solver iterations.
svm_settings = dict(C=9, kernel='rbf', gamma='scale', max_iter=-1)
sv_classifier = SVC(**svm_settings)
# Left as the last expression so the notebook shows the fitted estimator.
sv_classifier.fit(x_train, y_train)

In [169]:
# Predict labels for the held-out test features with the fitted SVM.
svc_predictions = sv_classifier.predict(x_test)

In [170]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print the confusion matrix, the per-class report and the overall accuracy, in that
# order, for the SVM's test-set predictions.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, svc_predictions))

[[1229  630]
 [ 362 2162]]
              precision    recall  f1-score   support

           0       0.77      0.66      0.71      1859
           1       0.77      0.86      0.81      2524

    accuracy                           0.77      4383
   macro avg       0.77      0.76      0.76      4383
weighted avg       0.77      0.77      0.77      4383

0.7736710015970796


In [None]:
# One record per trained classifier, each paired with the shared fitted TF-IDF vectorizer.
models = [
    {'name': label, 'model': estimator, 'vectorizer': vectorizer}
    for label, estimator in (
        ('Random Forest', rf_classifier),
        ('SVM', sv_classifier),
    )
]


In [None]:
import pickle

# Serialise the list of model records to models.pkl in the working directory.
# NOTE(review): pickle files execute code when loaded; only load this file from a trusted source.
with open("models.pkl", mode="wb") as file:
    pickle.dump(obj=models, file=file)
