In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix

In [None]:
# Load the labelled messages. Later cells read the 'Category' (label) and
# 'Message' (text) columns from this frame.
email_df = pd.read_csv("email.csv")
email_df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Turn the string labels in 'Category' into integer codes stored as 'is_spam'.
# LabelEncoder numbers classes in sorted order, so this yields ham -> 0 and
# spam -> 1 -- assuming 'ham' and 'spam' are the only categories (TODO confirm
# against the full file).
lable_encoder = LabelEncoder()
category_codes = lable_encoder.fit_transform(email_df['Category'])
email_df['is_spam'] = category_codes


In [None]:
# Model input is the raw message text; the target is the encoded label column.
email_text, spam_text = email_df['Message'], email_df['is_spam']

In [None]:
# Hold out 20% of the messages for evaluation. The fixed random_state keeps
# the split identical across re-runs so the reported scores are comparable.
x_train, x_test, y_train, y_test = train_test_split(
    email_text, spam_text, test_size=0.2, random_state=42
)

In [None]:
# k-nearest-neighbours (k=3) on bag-of-words token counts. Vectorizer and
# classifier live in one Pipeline so the vocabulary is learned from the
# training messages only.
knn_steps = [
    ('text_vectorizer', CountVectorizer()),
    ('knn_model', KNeighborsClassifier(n_neighbors=3)),
]
knn_pipeline = Pipeline(steps=knn_steps)

# Pipeline.fit returns the pipeline itself, so fitting and predicting chain.
knn_predictions = knn_pipeline.fit(x_train, y_train).predict(x_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_accuracy

0.9354260089686098

In [None]:
# Rows are the true labels, columns the predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=knn_predictions))

[[958   0]
 [ 72  85]]


In [None]:
# Single decision tree on bag-of-words token counts; random_state is fixed
# for repeatable results.
decision_tree_steps = [
    ('text_vectorizer', CountVectorizer()),
    ('decision_tree_model', DecisionTreeClassifier(random_state=42)),
]
decision_tree_pipeline = Pipeline(steps=decision_tree_steps)

# Pipeline.fit returns the pipeline itself, so fitting and predicting chain.
decision_prediction = decision_tree_pipeline.fit(x_train, y_train).predict(x_test)
decision_tree_accuracy = accuracy_score(y_test, decision_prediction)
decision_tree_accuracy

0.9775784753363229

In [None]:
# Confusion matrix for the decision tree: rows are the true labels, columns
# the predicted labels.
df_confusion_matrix = confusion_matrix(y_true=y_test, y_pred=decision_prediction)
df_confusion_matrix

array([[947,  11],
       [ 14, 143]])

In [None]:
# Random forest of 100 trees on bag-of-words token counts; random_state is
# fixed for repeatable results.
random_forest_steps = [
    ('text_vectorizer', CountVectorizer()),
    ('random_forest_model', RandomForestClassifier(n_estimators=100, random_state=42)),
]
random_forest_pipeline = Pipeline(steps=random_forest_steps)

# Pipeline.fit returns the pipeline itself, so fitting and predicting chain.
random_prediction = random_forest_pipeline.fit(x_train, y_train).predict(x_test)
random_forest_accuracy = accuracy_score(y_test, random_prediction)
random_forest_accuracy

0.9766816143497757

In [None]:
def detect(model, email, spam_label=1):
    """Classify a single message with ``model`` and print the verdict.

    Parameters
    ----------
    model : object
        Fitted estimator or pipeline exposing ``predict``. It is called with a
        one-element list holding ``email`` and must return an indexable
        sequence of predicted labels.
    email : str
        Raw message text to classify.
    spam_label : optional
        Label value that ``model`` emits for spam. Defaults to ``1``, the
        value this helper has always compared against; pass e.g. ``"spam"``
        for a model trained on string labels.

    Returns
    -------
    None
        The verdict is printed ("SPAM email" or "REAL (Ham) email"), not
        returned.
    """
    # predict() works on batches, so wrap the message in a list and read back
    # the single prediction.
    predicted_label = model.predict([email])[0]
    verdict = "SPAM email" if predicted_label == spam_label else "REAL (Ham) email"
    print(verdict)

In [None]:
# Two hand-written samples: s1 is promotional, s2 is a casual personal note.
s1 = "Upto 20% discount on parking, exclusive offer just for you!"
s2 = "Hey khan, shall we watch the football match tomorrow?"

# Run both through the random-forest pipeline, in order.
for sample_message in (s1, s2):
    detect(random_forest_pipeline, sample_message)

REAL (Ham) email
REAL (Ham) email
