In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [13]:
df = pd.read_csv('New_Merge_Data_Comment.csv')
df.shape

(190065, 3)

In [None]:
from collections import Counter
import math

import numpy as np 

N_occurance = 5

words = [df['Comment'][i].split() for i in range(len(df))]
bag_of_words = sorted(list({word for list_ in words for word in list_}))
word_count = {word:0 for word in bag_of_words}
for word in bag_of_words:
    for list_ in words:
        if word in list_:
            word_count[word]+=1
filtered_words = [k for k,v in word_count.items() if v>N_occurance]
evaluation = np.zeros((len(df),len(filtered_words)))
for i in range(len(words)):
    counter = Counter(words[i])
    for k, v in counter.items():
        if k not in filtered_words:
            continue
        count = [1 for list_ in words if k in list_]
        evaluation[i][filtered_words.index(k)] = sum(count)/len(words)

# print(filtered_words)
evaluation.shape

In [15]:
df = pd.read_csv('New_Merge_Data_Comment.csv')

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X = tfidf_vectorizer.fit_transform(df['Comment'])

# MinMax scaling
scaler = MaxAbsScaler()
X = scaler.fit_transform(X)

# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Class'])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Applying models

from sklearn.calibration import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier


labels = df['Class']
models = [
    RandomForestClassifier(max_features='log2', n_estimators=1000, criterion = 'entropy', 
                           random_state = 0),
    LinearSVC(C=1.0, random_state=0),
    MultinomialNB(alpha=1, fit_prior=True),
    LogisticRegression(C=1.0, penalty='l2', solver='newton-cg', random_state = 0),
    DecisionTreeClassifier(criterion='gini', 
                           max_features=None,min_samples_leaf=1, 
                           min_samples_split=2, random_state=0),
]

cv_df = pd.DataFrame()
entries = []

for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, X, labels, scoring='accuracy', cv=10)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

from matplotlib import pyplot as plt 
import seaborn as sns
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, size=10, jitter=True, linewidth=1)
plt.show()

In [11]:
# Hyperparameter Tuning for Logistic Regression
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2']
}

model = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params_lr = grid_search.best_params_
best_score_lr = grid_search.best_score_

print("Best parameters for Logistic Regression:", best_params_lr)
print("Best score for Logistic Regression:", best_score_lr)

Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l2'}
Best score for Logistic Regression: 0.9826243610621734


In [14]:
# Define the reduced search space
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4)
}

# Initialize Random Forest model
model = RandomForestClassifier(random_state=42)

# Randomized search with reduced search space and fewer folds
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=3, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_rf = random_search.best_params_
best_score_rf = random_search.best_score_

print("Best parameters for Random Forest:", best_params_rf)
print("Best score for Random Forest:", best_score_rf)

Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 171}
Best score for Random Forest: 0.9815786704548444
