In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")


import spacy
nlp = spacy.load("en_core_web_lg")

### Bag of Words

In [20]:
import time
start = time.time()

df=pd.read_csv("spam.csv")

count_0,count_1 = df["Category"].value_counts()
df_0 = df[df["Category"] == "ham"]
df_1 = df[df["Category"] == "spam"]

df_1_over= df_1.sample(count_0,replace = True)
df = pd.concat([df_0,df_1_over],axis=0)

df["Category"] = df["Category"].apply(lambda x : 0 if x == "ham" else 1)
df.dropna(inplace=True,axis=1)
col=df.select_dtypes(include="object").columns 
cv=CountVectorizer()
for i in col:
    cols=cv.fit_transform(df[i])
    arr=cols.toarray() 
    column_name = list(map(lambda x : f"{i}-{x}",cv.get_feature_names_out()))
    data=pd.DataFrame(arr,columns = column_name)
    df = df.drop(i,axis = 1)
    df.reset_index(inplace = True)
    df=pd.concat([df,data],axis=1)
df.head()

X = df.drop("Category",axis=1)
y = df["Category"]

print("    -----------------------Bernoulli-------------------\n")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf =  BernoulliNB()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

print("    -----------------------Multinomial-------------------\n")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf =  MultinomialNB()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

end = time.time()
print(f"Time Taken {round(end - start)} sec")


    -----------------------Bernoulli-------------------

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       994
           1       1.00      0.97      0.99       936

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930

    -----------------------Multinomial-------------------

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       961
           1       0.99      0.97      0.98       969

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930

Time Taken 10 sec


### TF-IDF

In [21]:
import time
start = time.time()

df=pd.read_csv("spam.csv")

count_0,count_1 = df["Category"].value_counts()
df_0 = df[df["Category"] == "ham"]
df_1 = df[df["Category"] == "spam"]

df_1_over= df_1.sample(count_0,replace = True)
df = pd.concat([df_0,df_1_over],axis=0)

df["Category"] = df["Category"].apply(lambda x : 0 if x == "ham" else 1)
df.dropna(inplace=True,axis=1)
col=df.select_dtypes(include="object").columns 
cv=TfidfVectorizer()
for i in col:
    cols=cv.fit_transform(df[i])
    arr=cols.toarray() 
    column_name = list(map(lambda x : f"{i}-{x}",cv.get_feature_names_out()))
    data=pd.DataFrame(arr,columns = column_name)
    df = df.drop(i,axis = 1)
    df.reset_index(inplace = True)
    df=pd.concat([df,data],axis=1)
df.head()

X = df.drop("Category",axis=1)
y = df["Category"]

print("    -----------------------Bernoulli-------------------\n")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf =  BernoulliNB()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

print("    -----------------------Multinomial-------------------\n")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf =  MultinomialNB()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

end = time.time()
print(f"Time Taken {round(end - start)} sec")

    -----------------------Bernoulli-------------------

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       947
           1       1.00      0.97      0.98       983

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930

    -----------------------Multinomial-------------------

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       939
           1       0.99      0.98      0.99       991

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930

Time Taken 8 sec


### Word2Vec

In [2]:
import time
start = time.time()
import gensim.downloader as api
word = api.load("word2vec-google-news-300")
end = time.time()
print(f"Time Taken {round(end - start)} sec")

Time Taken 91 sec


In [23]:
import time
start = time.time()
def preprocessing(text):
    doc = text.split()
    vector = [word[token] for token in doc if token in word]
    return np.mean(vector,axis=0)

df=pd.read_csv("spam.csv")

count_0,count_1 = df["Category"].value_counts()
df_0 = df[df["Category"] == "ham"]
df_1 = df[df["Category"] == "spam"]

df_1_over= df_1.sample(count_0,replace = True)
df = pd.concat([df_0,df_1_over],axis=0)

sep = list(map(lambda x: (x, df[x].fillna(df[x].mode()[0], inplace = True)), df.columns))

df["Category"] = df["Category"].apply(lambda x : 0 if x == "ham" else 1)

col=df.select_dtypes(include="object").columns 

for i in col:
    df[i] = df[i].apply(preprocessing)
df.dropna(inplace=True)
X = np.stack(df["Message"].values)
y = df["Category"]


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y, 
    test_size=0.33, 
    random_state=42,stratify=y)

scaler = MinMaxScaler()
scaler_train = scaler.fit_transform(X_train)
scaler_test = scaler.fit_transform(X_test)


print("    -----------------------Bernoulli-------------------\n")
clf =  BernoulliNB()
clf.fit(scaler_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

print("    -----------------------Multinomial-------------------\n")
clf =  MultinomialNB()
clf.fit(scaler_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

end = time.time()
print(f"Time Taken {round(end - start)} sec")

    -----------------------Bernoulli-------------------

              precision    recall  f1-score   support

           0       0.50      1.00      0.67      1580
           1       0.00      0.00      0.00      1586

    accuracy                           0.50      3166
   macro avg       0.25      0.50      0.33      3166
weighted avg       0.25      0.50      0.33      3166

    -----------------------Multinomial-------------------

              precision    recall  f1-score   support

           0       0.76      0.96      0.85      1580
           1       0.95      0.70      0.81      1586

    accuracy                           0.83      3166
   macro avg       0.86      0.83      0.83      3166
weighted avg       0.86      0.83      0.83      3166

Time Taken 1 sec


### GLove

In [24]:
import time
start = time.time()
df = pd.read_csv("spam.csv")
count_0,count_1 = df["Category"].value_counts()
df_0 = df[df["Category"] == "ham"]
df_1 = df[df["Category"] == "spam"]

df_1_over= df_1.sample(count_0,replace = True)
df = pd.concat([df_0,df_1_over],axis=0)

df["Category"] = df["Category"].apply(lambda x : 0 if x == "ham" else 1)

df["Message"] = df["Message"].apply(lambda x : nlp(x).vector)

X = np.stack(df["Message"])
y = np.stack(df["Category"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y, 
    test_size=0.33, 
    random_state=42,stratify=y)
scaler = MinMaxScaler()
scaler_train = scaler.fit_transform(X_train)
scaler_test = scaler.fit_transform(X_test)


print("    -----------------------Bernoulli-------------------\n")

clf =  BernoulliNB()
clf.fit(scaler_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

print("    -----------------------Multinomial-------------------\n")
clf = MultinomialNB()
clf.fit(scaler_train,y_train)
y_pred = clf.predict(scaler_test)
print(classification_report(y_test,y_pred))

end = time.time()
print(f"Time Taken {round(end - start)} sec")

    -----------------------Bernoulli-------------------

              precision    recall  f1-score   support

           0       0.50      1.00      0.67      1593
           1       0.00      0.00      0.00      1592

    accuracy                           0.50      3185
   macro avg       0.25      0.50      0.33      3185
weighted avg       0.25      0.50      0.33      3185

    -----------------------Multinomial-------------------

              precision    recall  f1-score   support

           0       0.92      0.79      0.85      1593
           1       0.82      0.93      0.87      1592

    accuracy                           0.86      3185
   macro avg       0.87      0.86      0.86      3185
weighted avg       0.87      0.86      0.86      3185

Time Taken 122 sec


In [2]:
df=pd.read_csv("IMDB Dataset.csv")
df = df.head(10000)

df["sentiment"] = df["sentiment"].apply(lambda x : 0 if x == "negative" else 1)
df.dropna(inplace=True,axis=1)
col=df.select_dtypes(include="object").columns 
cv=TfidfVectorizer()
for i in col:
    cols=cv.fit_transform(df[i])
    arr=cols.toarray() 
    column_name = list(map(lambda x : f"{i}-{x}",cv.get_feature_names_out()))
    data=pd.DataFrame(arr,columns = column_name)
    df = df.drop(i,axis = 1)
    df.reset_index(inplace = True)
    df=pd.concat([df,data],axis=1)

In [5]:
df.to_csv("IMDB Numeric dataset.csv")

In [4]:
df

Unnamed: 0,index,sentiment,review-00,review-000,review-00001,review-0069,review-007,review-00am,review-00s,review-01,...,review-être,review-ís,review-ísnt,review-île,review-ïn,review-óli,review-önsjön,review-über,review-überwoman,review-ünfaithful
0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,9996,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,9997,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,9998,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
