In [156]:
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [36]:
# Load the spam/ham e-mail dataset from a CSV in the working directory.
data = pd.read_csv("spam_ham_dataset.csv")

In [145]:
# Load a second mail dataset from the working directory.
# NOTE(review): `data2` is not referenced by any other cell visible in this
# section -- confirm it is still needed.
data2 = pd.read_csv("mail_data.csv")

In [None]:
# Print a concise summary of the loaded frame before any columns are dropped.
data.info()

In [37]:
# Discard the two columns that later cells never read (only 'text' and
# 'label_num' are used). Reassigning with errors='ignore' instead of
# inplace=True keeps this cell safe to re-run: a second execution finds the
# columns already gone and is a no-op rather than a KeyError.
data = data.drop(columns=['Unnamed: 0', 'label'], errors='ignore')

In [38]:
# Feature column (raw message text) and target column (numeric label).
x, y = data['text'], data['label_num']

In [39]:
conv=CountVectorizer()
x=conv.fit_transform(x)

In [40]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [88]:
def evaluate_model(model, x_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.

    Returns:
        A 6-tuple ``(accuracy, precision, recall, f1, conf_matrix,
        class_report)``. The first four come from the corresponding sklearn
        scorers called with their default settings; ``class_report`` is the
        dictionary form of ``classification_report``.
    """
    predictions = model.predict(x_test)

    # The four scalar scorers share the same (y_true, y_pred) call shape.
    scalar_scores = tuple(
        scorer(y_test, predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    matrix = confusion_matrix(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)

    return (*scalar_scores, matrix, report)

In [126]:
# Multinomial naive Bayes on the token-count features.
model3 = MultinomialNB().fit(x_train, y_train)
(accuracy, precision, recall, f1,
 conf_matrix, class_report) = evaluate_model(model3, x_test, y_test)


In [127]:
def add_stat(stat, classifier_name, accuracy, precision, recall, f1):
    """Append one classifier's headline metrics to the running results list.

    Args:
        stat: List that collects one dict per classifier; modified in place.
        classifier_name: Label stored under the "Classifier" key.
        accuracy: Value stored under "Accuracy".
        precision: Value stored under "Precision".
        recall: Value stored under "Recall".
        f1: Value stored under "F1 Score".

    Returns:
        None.
    """
    columns = ("Classifier", "Accuracy", "Precision", "Recall", "F1 Score")
    values = (classifier_name, accuracy, precision, recall, f1)
    stat.append(dict(zip(columns, values)))

In [128]:
# Start the results table. The metric variables hold whatever the most recent
# evaluate_model call returned (MultinomialNB when the cells run in order).
stat = list()
add_stat(
    stat,
    classifier_name="MultinomialNB",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
)


In [129]:
# Logistic regression with scikit-learn's default settings.
model2 = LogisticRegression().fit(x_train, y_train)
(accuracy, precision, recall, f1,
 conf_matrix, class_report) = evaluate_model(model2, x_test, y_test)


In [130]:
# Record the scores from the most recent evaluation under this label.
add_stat(
    stat,
    classifier_name="Logistic Regression",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
)

In [131]:
# Random forest with default hyper-parameters. The seed fixes the forest's
# internal randomness so the reported scores are identical on every run,
# matching the seeded boosting models trained elsewhere in this notebook.
model1 = RandomForestClassifier(random_state=10)
model1.fit(x_train, y_train)
accuracy, precision, recall, f1, conf_matrix, class_report = evaluate_model(model1, x_test, y_test)

In [132]:
# Record the scores from the most recent evaluation under this label.
add_stat(
    stat,
    classifier_name="Random Forest",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
)

In [133]:
# AdaBoost ensemble of 80 estimators with a fixed seed.
model = AdaBoostClassifier(n_estimators=80, random_state=10).fit(x_train, y_train)
(accuracy, precision, recall, f1,
 conf_matrix, class_report) = evaluate_model(model, x_test, y_test)



In [134]:
# Record the scores from the most recent evaluation under this label.
add_stat(
    stat,
    classifier_name="AdaBoost",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
)

In [135]:
# Gradient-boosted trees: 80 boosting stages with a fixed seed.
model4 = GradientBoostingClassifier(n_estimators=80, random_state=10).fit(x_train, y_train)
(accuracy, precision, recall, f1,
 conf_matrix, class_report) = evaluate_model(model4, x_test, y_test)


In [136]:
# Record the scores from the most recent evaluation under this label.
add_stat(
    stat,
    classifier_name="GradientBoosting",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
)

In [137]:
# Support-vector classifier with scikit-learn's default settings.
model5 = svm.SVC().fit(x_train, y_train)
(accuracy, precision, recall, f1,
 conf_matrix, class_report) = evaluate_model(model5, x_test, y_test)

In [138]:
# Record the scores from the most recent evaluation under this label.
add_stat(
    stat,
    classifier_name="SVM",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
)

In [139]:
# Single decision tree with default hyper-parameters. The seed makes the
# fitted tree, and therefore its scores, repeatable from run to run, in line
# with the seeded boosting models trained elsewhere in this notebook.
model6 = tree.DecisionTreeClassifier(random_state=10)
model6.fit(x_train, y_train)
accuracy, precision, recall, f1, conf_matrix, class_report = evaluate_model(model6, x_test, y_test)

In [140]:
# Record the scores from the most recent evaluation under this label.
add_stat(
    stat,
    classifier_name="Decision Tree",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
)

In [141]:
# Turn the collected per-classifier records into one comparison table
# (one row per add_stat call) and print it.
stat_df = pd.DataFrame(data=stat)
print(stat_df)

            Classifier  Accuracy  Precision    Recall  F1 Score
0        MultinomialNB  0.987440   0.979381  0.976027  0.977702
1  Logistic Regression  0.981643   0.947541  0.989726  0.968174
2        Random Forest  0.977778   0.952862  0.969178  0.960951
3             AdaBoost  0.966184   0.929766  0.952055  0.940778
4     GradientBoosting  0.955556   0.901961  0.945205  0.923077
5                  SVM  0.959420   0.895570  0.969178  0.930921
6        Decision Tree  0.942995   0.858462  0.955479  0.904376


In [158]:
# Classify one hand-written message with the fitted logistic-regression model.
# The list is named `sample_mail` rather than `input` so the builtin input()
# is not shadowed for the rest of the session.
sample_mail = ["You have won a lottery of $1000, please click on the link to claim your prize"]
# Reuse the already-fitted vectorizer so the sample is encoded with the same
# vocabulary the model was trained on.
input_data_check = conv.transform(sample_mail)
prdict = model2.predict(input_data_check)
# predict() returns one label per input row; look at the single element
# explicitly instead of relying on the truthiness of an array comparison.
if prdict[0] == 0:
    print("Not a spam")
else:
    print("Spam")

Spam
