In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_excel('/content/exact_label_rows.xlsx')

In [None]:
df.head()

Unnamed: 0,text,label
0,Hi broker,Not hate
1,22:20 Ravish ji already knows this thing.,Not hate
2,Sidhu disappeared,Not hate
3,exit poll at 2:42,Not hate
4,"Ravish ji, say something about trains Sir ji M...",Not hate


In [None]:
df.value_counts('label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Not hate,5468
Hate,406


In [None]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df['label'] == 'Not hate']
df_minority = df[df['label'] == 'Hate']

# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,      # sample with replacement
                                 n_samples=len(df_majority),
                                 random_state=42)

# Combine majority class with upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])



In [None]:
df_balanced.value_counts('label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Hate,5468
Not hate,5468


In [None]:
df_balanced.head()

Unnamed: 0,text,label
0,Hi broker,Not hate
1,22:20 Ravish ji already knows this thing.,Not hate
2,Sidhu disappeared,Not hate
3,exit poll at 2:42,Not hate
4,"Ravish ji, say something about trains Sir ji M...",Not hate


In [None]:
pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/586.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [None]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m42.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=f9907c342e22ceeb28f61c3760f7c416a0507f9e84a1c78a4bfde5896c7db666
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import re
import emoji
import requests
from langdetect import detect, LangDetectException


In [None]:
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'http\S+', '', text)  # Remove links
        text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove mentions
        text = re.sub(r'#[A-Za-z0-9]+', '', text)  # Remove hashtags
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = emoji.demojize(text)  # Convert emojis to text
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    else:
        text = ''
    return text

In [None]:
from sklearn.model_selection import train_test_split
m = df_balanced['text']
X = df_balanced['text'].apply(clean_text)
y = df_balanced['label']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_vector = vectorizer.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, random_state=42)

In [None]:


from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier



# List of models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Support Vector Machine': SVC(),
    'Naive Bayes': MultinomialNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(max_iter=1000)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name}: {accuracy:.4f}")

# Print results
print("\nClassification Model Performance:")
for model, accuracy in results.items():
    print(f"{model}: {accuracy:.4f}")


Logistic Regression: 0.9506
Decision Tree: 0.9401
Random Forest: 0.9954
Gradient Boosting: 0.8373




AdaBoost: 0.7674
Support Vector Machine: 0.9959
Naive Bayes: 0.8985
K-Nearest Neighbors: 0.8574
Neural Network: 0.9452

Classification Model Performance:
Logistic Regression: 0.9506
Decision Tree: 0.9401
Random Forest: 0.9954
Gradient Boosting: 0.8373
AdaBoost: 0.7674
Support Vector Machine: 0.9959
Naive Bayes: 0.8985
K-Nearest Neighbors: 0.8574
Neural Network: 0.9452


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'criterion': ['gini', 'entropy']
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Use the best model to predict
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Fitting 5 folds for each of 72 candidates, totalling 360 fits


120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sk

Best parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'n_estimators': 200}
Accuracy: 0.9950


In [None]:
best_rf = RandomForestClassifier(
    criterion='gini',
    max_depth=None,
    max_features='log2',
    n_estimators=200,
    random_state=42
)

# Train the model
best_rf.fit(X_train, y_train)

# Make predictions
y_pred = best_rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)


In [None]:
print('Accuracy: ',accuracy)

Accuracy:  0.9949725776965265


In [None]:
#pickle the model
import pickle
pickle.dump(best_rf, open('model_open.pkl', 'wb'))