In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import faiss
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.utils import resample
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack, csr_matrix
from scipy.stats import uniform, randint


In [18]:
data = pd.read_csv('../../Datasets/balanced_data.csv')

In [19]:
data.columns

Index(['Label', 'Preprocessed_Content', 'sentiment', 'hate_speech_count',
       'pos_tags', 'word2vec', 'sbert_embedding', 'lexical_diversity',
       'sentence_complexity'],
      dtype='object')

In [20]:
len(data)

60000

In [21]:
data['Label'].value_counts()

Label
0    30000
1    30000
Name: count, dtype: int64

In [22]:
from sklearn.utils import resample

class_0 = data[data['Label'] == 0]
class_1 = data[data['Label'] == 1]

class_0_sampled = resample(class_0, n_samples=10000, random_state=42)
class_1_sampled = resample(class_1, n_samples=10000, random_state=42)

balanced_data = pd.concat([class_0_sampled, class_1_sampled])

balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_data['Label'].value_counts())


Label
1    10000
0    10000
Name: count, dtype: int64


In [23]:
train_data = balanced_data

In [24]:
def convert_embedding(embedding):
    if isinstance(embedding, np.ndarray):
        return embedding
    elif isinstance(embedding, list):
        return np.array(embedding, dtype=np.float32)
    elif isinstance(embedding, str):
        embedding = embedding.strip("[]")
        embedding = np.array([float(x) for x in embedding.split()], dtype=np.float32)
        return embedding
    else:
        return np.zeros(768, dtype=np.float32)

train_data['sbert_embedding'] = train_data['sbert_embedding'].apply(convert_embedding)

embeddings = np.stack(train_data['sbert_embedding'].values)

print("Embeddings Shape:", embeddings.shape)


Embeddings Shape: (20000, 384)


In [25]:
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
faiss_index.add(embeddings)

In [26]:
tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english')
X_tfidf = tfidf.fit_transform(train_data['Preprocessed_Content'])

In [27]:
X = hstack([
    X_tfidf,
    embeddings,
    train_data[['sentiment', 'hate_speech_count', 'lexical_diversity', 'sentence_complexity']].values
])

Y = train_data['Label']

In [28]:
smote = SMOTE(random_state=42)
X_res, Y_res = smote.fit_resample(X, Y)

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X_res, Y_res, test_size=0.2, random_state=42)

In [30]:
class_weights = compute_class_weight('balanced', classes=np.unique(Y_res), y=Y_res)
class_weight_dict = dict(zip(np.unique(Y_res), class_weights))

In [31]:
# === LOGISTIC REGRESSION ===
logistic_model = LogisticRegression(max_iter=500, C=1.0)
logistic_model.fit(X_train, Y_train)
print('Logistic Regression Trained')

logistic_pred = logistic_model.predict(X_test)
print("Logistic Regression Performance:")
print(classification_report(Y_test, logistic_pred))

Logistic Regression Trained
Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      2055
           1       0.79      0.83      0.81      1945

    accuracy                           0.81      4000
   macro avg       0.81      0.81      0.81      4000
weighted avg       0.81      0.81      0.81      4000



In [33]:
# === RFC using RandomizedSearchCV ===
rf_params = {
    'n_estimators': [100],
    'max_depth': [10, 20, None],
    'max_features': ['sqrt'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_model = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=42), 
    rf_params, 
    n_iter=3, 
    cv=2, 
    scoring='accuracy', 
    n_jobs=-1, 
    verbose=1, 
    random_state=42
)

rf_model.fit(X_train, Y_train)

rf_pred = rf_model.best_estimator_.predict(X_test)
print("Random Forest Performance:")
print(classification_report(Y_test, rf_pred))


Fitting 2 folds for each of 3 candidates, totalling 6 fits
Random Forest Performance:
              precision    recall  f1-score   support

           0       0.85      0.80      0.83      2055
           1       0.80      0.85      0.83      1945

    accuracy                           0.83      4000
   macro avg       0.83      0.83      0.83      4000
weighted avg       0.83      0.83      0.83      4000



In [None]:
# === SVM using RandomizedSearchCV ===
svm_params = {'C': np.logspace(-2, 2, 5), 'kernel': ['rbf']}
svm_model = RandomizedSearchCV(SVC(cache_size=1000), svm_params, n_iter=3, cv=2, scoring='accuracy', n_jobs=-1, random_state=42)
svm_model.fit(X_train, Y_train)
print('SVM Trained')

svm_pred = svm_model.best_estimator_.predict(X_test)
print("SVM Performance:")
print(classification_report(Y_test, svm_pred))