In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from gensim.models import Word2Vec
import numpy as np


# Load the dataset
df = pd.read_csv('/content/draft_50000.csv')

# Text preprocessing: lower-case the text and strip every character that is
# neither a word character nor whitespace (i.e. punctuation and symbols).
# The pattern is a raw string so that \w and \s reach the regex engine as
# written; in a normal string literal they are invalid escape sequences and
# newer Python versions warn about them.
df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

# Drop rows lacking a text or a label. This deliberately runs after the
# cleaning step so that any NaN left in 'text' at this point is removed too.
# Reassigning (instead of inplace=True) keeps the step easy to re-run.
df = df.dropna(subset=['text', 'class'])

# Naive Bayes with TF-IDF
#
# Vectorise the cleaned text with TF-IDF (English stop words removed), hold
# out 20% of the rows for testing, fit a multinomial Naive Bayes classifier
# and print accuracy, confusion matrix and the per-class report.

y = df['class']

tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(df['text'])

# NOTE(review): the vectorizer is fitted on the full corpus before the split,
# so the vocabulary/IDF weights are computed with the test documents included.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train, y_train)
y_pred_tfidf = model_tfidf.predict(X_test)

# Compute every metric first, then report them in one place.
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
conf_matrix_tfidf = confusion_matrix(y_test, y_pred_tfidf)
class_report_tfidf = classification_report(y_test, y_pred_tfidf)

print('\n')
print(f'Accuracy (TF-IDF): {accuracy_tfidf:.2f}')
print('Confusion Matrix (TF-IDF):', conf_matrix_tfidf, sep='\n')
print('Classification Report (TF-IDF):', class_report_tfidf, sep='\n')



# Naive Bayes with Word2Vec
#
# Tokenise each document on whitespace, train a 100-dimensional Word2Vec
# model on those tokens, and represent every document by the mean of its
# word vectors (a zero vector when none of its tokens has a vector).

sentences = [document.split() for document in df['text']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

keyed_vectors = word2vec_model.wv
document_vectors = []
for tokens in sentences:
    known_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    document_vectors.append(
        np.mean(known_vectors, axis=0)
        if known_vectors
        else np.zeros(word2vec_model.vector_size)
    )
X_w2v = np.array(document_vectors)

# Shift the whole matrix by its global minimum so that no entry is negative
# (presumably because MultinomialNB expects non-negative features - confirm).
X_w2v -= X_w2v.min()

X_train, X_test, y_train, y_test = train_test_split(
    X_w2v,
    y,
    test_size=0.2,
    random_state=42,
)

model_w2v = MultinomialNB()
model_w2v.fit(X_train, y_train)
y_pred_w2v = model_w2v.predict(X_test)

# Compute every metric first, then report them in one place.
accuracy_w2v = accuracy_score(y_test, y_pred_w2v)
conf_matrix_w2v = confusion_matrix(y_test, y_pred_w2v)
class_report_w2v = classification_report(y_test, y_pred_w2v)

print('\n')
print(f'Accuracy (Word2Vec): {accuracy_w2v:.2f}')
print('Confusion Matrix (Word2Vec):', conf_matrix_w2v, sep='\n')
print('Classification Report (Word2Vec):', class_report_w2v, sep='\n')


# Naive Bayes with CountVectorizer
#
# Same pipeline as the TF-IDF experiment, but with raw term counts as
# features (English stop words removed). Reuses the label series `y`
# defined earlier in the notebook.

count_vectorizer = CountVectorizer(stop_words='english')
X_count = count_vectorizer.fit_transform(df['text'])

# NOTE(review): the vocabulary is built from the full corpus before the
# split, so it also contains terms that occur only in the test documents.
X_train, X_test, y_train, y_test = train_test_split(
    X_count,
    y,
    test_size=0.2,
    random_state=42,
)

model_count = MultinomialNB()
model_count.fit(X_train, y_train)
y_pred_count = model_count.predict(X_test)

# Compute every metric first, then report them in one place.
accuracy_count = accuracy_score(y_test, y_pred_count)
conf_matrix_count = confusion_matrix(y_test, y_pred_count)
class_report_count = classification_report(y_test, y_pred_count)

print('\n')
print(f'Accuracy (CountVectorizer): {accuracy_count:.2f}')
print('Confusion Matrix (CountVectorizer):', conf_matrix_count, sep='\n')
print('Classification Report (CountVectorizer):', class_report_count, sep='\n')



# Naive Bayes with GloVE
#
# Represent every document by the mean of the pre-trained GloVe vectors of
# its tokens, then train/evaluate a multinomial Naive Bayes classifier on
# those features. Relies on `sentences` (tokenised documents) and `y`
# (labels) defined earlier in the notebook.

# FIX: this path used to be '/content/draft_50000.csv', i.e. the review
# dataset itself, which is not an embeddings file ("<word> <v1> <v2> ..."
# per line) and cannot be parsed as one.
# NOTE(review): the file name below is an assumption chosen to match
# max_length = 100 - point it at the GloVe text file actually uploaded.
glove_file = '/content/glove.6B.100d.txt'
glove_embeddings = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Skip blank lines and lines with a token but no vector; they would
        # otherwise raise IndexError or store an empty vector.
        if len(values) < 2:
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

# Width of the document feature vectors.
max_length = 100

# One row per document. Rows stay all-zero when no token of the document
# has a GloVe vector.
X_glove = np.zeros((len(sentences), max_length))
for row, sentence in enumerate(sentences):
    sentence_vectors = [glove_embeddings[word] for word in sentence if word in glove_embeddings]
    if not sentence_vectors:
        continue
    sentence_vector = np.mean(sentence_vectors, axis=0)
    # Zero-pad or truncate to max_length in case the embedding width of the
    # file differs from it.
    width = min(len(sentence_vector), max_length)
    X_glove[row, :width] = sentence_vector[:width]

# Shift the whole matrix once, by its global minimum, so that no entry is
# negative (MultinomialNB rejects negative feature values).
X_glove -= X_glove.min()

X_train, X_test, y_train, y_test = train_test_split(X_glove, y, test_size=0.2, random_state=42)

# FIX: the former per-split shifts (X_train -= X_train.min(),
# X_test -= X_test.min()) were removed. The data is already non-negative,
# and shifting train and test by different offsets puts the two sets on
# different scales.

model_glove = MultinomialNB()
model_glove.fit(X_train, y_train)
print('\n')

y_pred_glove = model_glove.predict(X_test)
accuracy_glove = accuracy_score(y_test, y_pred_glove)
print(f'Accuracy (GloVE): {accuracy_glove:.2f}')
conf_matrix_glove = confusion_matrix(y_test, y_pred_glove)
print('Confusion Matrix (GloVE):')
print(conf_matrix_glove)
class_report_glove = classification_report(y_test, y_pred_glove)
print('Classification Report (GloVE):')
print(class_report_glove)





Accuracy (TF-IDF): 0.86
Confusion Matrix (TF-IDF):
[[1992  653]
 [  91 2530]]
Classification Report (TF-IDF):
              precision    recall  f1-score   support

         0.0       0.96      0.75      0.84      2645
         1.0       0.79      0.97      0.87      2621

    accuracy                           0.86      5266
   macro avg       0.88      0.86      0.86      5266
weighted avg       0.88      0.86      0.86      5266



Accuracy (Word2Vec): 0.84
Confusion Matrix (Word2Vec):
[[2102  543]
 [ 316 2305]]
Classification Report (Word2Vec):
              precision    recall  f1-score   support

         0.0       0.87      0.79      0.83      2645
         1.0       0.81      0.88      0.84      2621

    accuracy                           0.84      5266
   macro avg       0.84      0.84      0.84      5266
weighted avg       0.84      0.84      0.84      5266



Accuracy (CountVectorizer): 0.86
Confusion Matrix (CountVectorizer):
[[2018  627]
 [  98 2523]]
Classification Rep