In [1]:
import re

import faiss
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from scipy import sparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

In [2]:
# Load the pre-computed feature table; the path is relative to the
# notebook's working directory.
data = pd.read_csv('Datasets/features.csv')

In [3]:
# Inspect which columns the loaded feature table provides.
data.columns

Index(['Content', 'Label', 'Content_int', 'Cleaned_Content',
       'Preprocessed_Content', 'sentiment', 'hate_speech_count', 'pos_tags',
       'word2vec', 'sbert_embedding', 'lexical_diversity',
       'sentence_complexity'],
      dtype='object')

In [4]:
# Number of rows in the loaded feature table.
len(data)

440215

In [5]:
# Work on the first 10,000 rows only (positional slice).
# `.copy()` makes `train_data` an independent frame, so assigning columns to
# it later does not write into a slice of `data` (which is what triggers
# pandas' SettingWithCopyWarning).
train_data = data.iloc[:10000].copy()

In [6]:
def convert_embedding(embedding, dim=384):
    """Coerce one stored embedding value into a 1-D float32 NumPy vector.

    Parameters
    ----------
    embedding : np.ndarray, list, tuple, str, or anything else
        * ndarray      -> cast to float32 (returned as-is if already float32).
        * list / tuple -> converted to a float32 array.
        * str          -> parsed from a bracketed text form such as
                          "[0.1 0.2 0.3]" or "[0.1, 0.2, 0.3]"; whitespace
                          (including newlines) and commas both act as
                          separators.
        * other values (e.g. None or a float NaN from a missing CSV cell)
                       -> replaced by a zero vector.
    dim : int, default 384
        Length of the zero vector used as the fallback. It must match the
        width of the real embeddings, otherwise stacking the converted values
        into a single matrix fails on mismatched shapes.

    Returns
    -------
    np.ndarray
        1-D array of dtype float32.

    Raises
    ------
    ValueError
        If a string contains a token that cannot be parsed as a float.
    """
    if isinstance(embedding, np.ndarray):
        # copy=False avoids a needless copy when the dtype is already float32.
        return embedding.astype(np.float32, copy=False)

    if isinstance(embedding, (list, tuple)):
        return np.array(embedding, dtype=np.float32)

    if isinstance(embedding, str):
        # Treat brackets and commas as separators so both the NumPy repr
        # ("[a b c]") and the list repr ("[a, b, c]") are accepted.
        cleaned = embedding.replace('[', ' ').replace(']', ' ').replace(',', ' ')
        tokens = cleaned.split()
        if tokens:
            return np.array([float(tok) for tok in tokens], dtype=np.float32)
        # An empty string / "[]" carries no values: fall through to zeros.

    return np.zeros(dim, dtype=np.float32)

# Parse every stored embedding value into a NumPy vector and write the
# parsed values back into the same column.
parsed_embeddings = train_data['sbert_embedding'].apply(convert_embedding)
train_data['sbert_embedding'] = parsed_embeddings

# Stack the per-row vectors into a single array (one row per sample).
embeddings = np.stack(parsed_embeddings.to_numpy())

print("Embeddings Shape:", embeddings.shape)


Embeddings Shape: (10000, 384)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['sbert_embedding'] = train_data['sbert_embedding'].apply(convert_embedding)


In [7]:
# Create a flat L2 FAISS index whose dimensionality matches the width of the
# embedding matrix, then add every embedding vector to it.
faiss_index = faiss.IndexFlatL2(embeddings.shape[-1])
faiss_index.add(embeddings)

In [8]:
# TF-IDF features over bigrams and trigrams, with English stop words removed.
# NOTE(review): the vectorizer is fitted on all of `train_data`, i.e. before
# the train/test split further down -- confirm this is intended.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(2, 3),
)
X_tfidf = tfidf.fit_transform(train_data['Preprocessed_Content'])

In [9]:
# Bag-of-words counts over the same bigram/trigram range, English stop words
# removed.
vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(2, 3),
)
X_bow = vectorizer.fit_transform(train_data['Preprocessed_Content'])

In [12]:
# Combine all features.
# The TF-IDF and BoW matrices are sparse; calling .toarray() on them and
# stacking densely materialises a (rows x vocabulary) array of mostly zeros,
# which is wasteful in memory and slows model fitting. Stack everything as a
# sparse matrix instead. CSR format supports the row indexing that
# train_test_split and the scikit-learn estimators below rely on.
meta_features = train_data[
    ['sentiment', 'hate_speech_count', 'lexical_diversity', 'sentence_complexity']
].to_numpy(dtype=np.float64)  # assumes all four columns are numeric -- TODO confirm

X = sparse.hstack(
    [
        X_tfidf,
        X_bow,
        sparse.csr_matrix(embeddings),
        sparse.csr_matrix(meta_features),
    ],
    format='csr',
)

Y = train_data['Label']

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# === LOGISTIC REGRESSION ===
# Fit on the training split, then evaluate on the held-out split.
logistic_model = LogisticRegression(C=1.0, max_iter=500)
logistic_model.fit(X_train, Y_train)

logistic_pred = logistic_model.predict(X_test)

print("Logistic Regression Performance:")
logistic_report = classification_report(Y_test, logistic_pred)
print(logistic_report)

KeyboardInterrupt: 

In [None]:
# === SVM WITH HYPERPARAMETER TUNING ===
# Search over regularisation strength and kernel type with 3-fold
# cross-validation, selecting by accuracy and using all available cores.
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}
svm_model = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_params,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
svm_model.fit(X_train, Y_train)

# Predict with the best estimator found by the search.
svm_pred = svm_model.best_estimator_.predict(X_test)

print("SVM Performance:")
svm_report = classification_report(Y_test, svm_pred)
print(svm_report)

In [None]:

# === RANDOM FOREST WITH HYPERPARAMETER TUNING ===
# Search over forest size and tree depth with 3-fold cross-validation,
# selecting by accuracy and using all available cores.
rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20, None]}
rf_model = GridSearchCV(
    # Fixed seed so the forest (and therefore the search result) is
    # reproducible, matching the seed used for the train/test split.
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
# The split defines Y_train / Y_test (capital Y); the lowercase names used
# here previously do not exist and raised NameError.
rf_model.fit(X_train, Y_train)
rf_pred = rf_model.best_estimator_.predict(X_test)
print("Random Forest Performance:")
print(classification_report(Y_test, rf_pred))

In [None]:
# # === DEEP LEARNING MODEL ===
# input_dim = X_train.shape[1]
# deep_model = Sequential([
#     Dense(512, activation='relu', input_shape=(input_dim,)),
#     Dropout(0.3),
#     Dense(256, activation='relu'),
#     Dropout(0.3),
#     Dense(128, activation='relu'),
#     Dropout(0.2),
#     Dense(1, activation='sigmoid')  # Adjust for multi-class classification if needed
# ])

# deep_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# deep_model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_data=(X_test, Y_test))
# deep_pred = (deep_model.predict(X_test) > 0.5).astype("int32")
# print("Deep Learning Model Performance:")
# print(classification_report(Y_test, deep_pred))