In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_excel('/content/labeled_comments.xlsx')

In [3]:
df.head()

Unnamed: 0,text,cleaned_comments,label
0,"You may not get a heart attack, crows.",you may not get a heart attack crows,hate
1,Today these newspaper exit polls are in favor ...,today these newspaper exit polls are in favor ...,not hate
2,"🤣🤣🤣🤣🤣 Abbey, you go first and sit here in the ...",abbey you go first and sit here in the room t...,not hate
3,"In Namaskar, Modi's blind opponent Ravish Kuma...",in namaskar modis blind opponent ravish kumar ...,not hate
4,Now why will the exit poll give 400 seats to C...,now why will the exit poll give 400 seats to c...,not hate


In [4]:
df.drop('cleaned_comments', axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,text,label
0,"You may not get a heart attack, crows.",hate
1,Today these newspaper exit polls are in favor ...,not hate
2,"🤣🤣🤣🤣🤣 Abbey, you go first and sit here in the ...",not hate
3,"In Namaskar, Modi's blind opponent Ravish Kuma...",not hate
4,Now why will the exit poll give 400 seats to C...,not hate


In [6]:
df.value_counts('label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
not hate,17437
hate,1043


In [7]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df['label'] == 'not hate']
df_minority = df[df['label'] == 'hate']

# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,      # sample with replacement
                                 n_samples=len(df_majority),
                                 random_state=42)

# Combine majority class with upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Save the balanced DataFrame to an Excel file


In [8]:
df_balanced.value_counts('label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
hate,17437
not hate,17437


In [9]:
df_balanced.head()

Unnamed: 0,text,label
1,Today these newspaper exit polls are in favor ...,not hate
2,"🤣🤣🤣🤣🤣 Abbey, you go first and sit here in the ...",not hate
3,"In Namaskar, Modi's blind opponent Ravish Kuma...",not hate
4,Now why will the exit poll give 400 seats to C...,not hate
5,Only ✋🙌🙌🙌🖐️🙌🖐️🖐️🙌🙌🖐️,not hate


In [16]:
pip install emoji



In [19]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=7624b46ba5453e4c14328d06e8cd4805f21278cfab3a6b0831952f94626c7023
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [20]:
import re
import emoji
import requests
from langdetect import detect, LangDetectException


In [21]:
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'http\S+', '', text)  # Remove links
        text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove mentions
        text = re.sub(r'#[A-Za-z0-9]+', '', text)  # Remove hashtags
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = emoji.demojize(text)  # Convert emojis to text
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    else:
        text = ''
    return text

In [23]:
from sklearn.model_selection import train_test_split
m = df_balanced['text']
X = df_balanced['text'].apply(clean_text)
y = df_balanced['label']


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_vector = vectorizer.fit_transform(X)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, random_state=42)

In [28]:


from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier



# List of models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Support Vector Machine': SVC(),
    'Naive Bayes': MultinomialNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(max_iter=1000)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name}: {accuracy:.4f}")

# Print results
print("\nClassification Model Performance:")
for model, accuracy in results.items():
    print(f"{model}: {accuracy:.4f}")


Logistic Regression: 0.9690
Decision Tree: 0.9653
Random Forest: 0.9994
Gradient Boosting: 0.8126




AdaBoost: 0.8103
Support Vector Machine: 0.9991
Naive Bayes: 0.9193
K-Nearest Neighbors: 0.9528
Neural Network: 0.9890

Classification Model Performance:
Logistic Regression: 0.9690
Decision Tree: 0.9653
Random Forest: 0.9994
Gradient Boosting: 0.8126
AdaBoost: 0.8103
Support Vector Machine: 0.9991
Naive Bayes: 0.9193
K-Nearest Neighbors: 0.9528
Neural Network: 0.9890


In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'criterion': ['gini', 'entropy']
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Use the best model to predict
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Fitting 5 folds for each of 72 candidates, totalling 360 fits


120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
52 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skl

Best parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'n_estimators': 300}
Accuracy: 0.9996


In [30]:
best_rf = RandomForestClassifier(
    criterion='gini',
    max_depth=None,
    max_features='log2',
    n_estimators=300,
    random_state=42
)

# Train the model
best_rf.fit(X_train, y_train)

# Make predictions
y_pred = best_rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)


<function print>

In [31]:
print('Accuracy: ',accuracy)

Accuracy:  0.9995698924731182


In [32]:
#pickle the model
import pickle
pickle.dump(best_rf, open('model.pkl', 'wb'))