# OASIS INFOBYTE INTERNSHIP
## DATA SCIENCE
## TASK 4 - EMAIL SPAM DETECTION
### -BY RAFE ANWAR PANJRI 

In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import nltk
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used later in this notebook: the stop-word corpus
# (read through nltk.corpus.stopwords) and the 'punkt' tokenizer models.
# NOTE(review): recent NLTK releases may also require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\raffe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\raffe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [59]:
# Load the raw dataset; the file is decoded as ISO-8859-1 instead of the default UTF-8.
df = pd.read_csv(filepath_or_buffer=r"spam.csv", encoding="ISO-8859-1")

In [60]:
# Display the freshly loaded frame to inspect its columns before cleaning.
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


## Preprocessing

In [61]:
# Count the missing values in each column.
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [62]:
# Boolean mask: True for every row that has at least one missing value.
df1 = df.isna().any(axis=1)
# Row labels of the flagged rows.
df1.loc[df1].index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            5562, 5563, 5564, 5565, 5566, 5567, 5568, 5569, 5570, 5571],
           dtype='int64', length=5566)

In [63]:
# Drop the three mostly-empty trailing columns so only v1 and v2 remain.
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a
# second run the columns are already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [64]:
# Re-count missing values per column after the extra columns were dropped.
df.isna().sum()

v1    0
v2    0
dtype: int64

In [65]:
# Give the two remaining columns descriptive names: v1 -> label, v2 -> mails.
colnames = dict(v1='label', v2='mails')
df = df.rename(columns=colnames)
df

Unnamed: 0,label,mails
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [66]:

data1 = df.loc[:, 'label']  # target: the class label column
data2 = df.loc[:, 'mails']  # input: the message text column

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data2,
    data1,
    test_size=0.2,
    random_state=42,
)

In [67]:
# Remove stop words from the mail text so the features focus on informative words.

# Keep the set under its own name. Rebinding `stopwords` would shadow the
# imported nltk.corpus.stopwords module, and running this cell a second time
# would then fail because a set has no .words() method.
stop_words = set(stopwords.words('english'))

def preprocessing(text: str) -> str:
    """Normalise one message for vectorisation.

    Steps, in order: lowercase the text, delete every character that is
    neither alphanumeric nor whitespace, tokenize with ``word_tokenize``,
    and drop tokens found in the English stop-word set.

    Args:
        text: The raw message text.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    text = text.lower()
    # Punctuation is deleted, not replaced by a space, so "don't" becomes "dont".
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

trainingdata = X_train.apply(preprocessing)
testingdata = X_test.apply(preprocessing)


In [68]:
# Feature extraction: fit the TF-IDF vectorizer on the training text only,
# then use that fitted vectorizer to encode both the training and test text.
vectorizer = TfidfVectorizer()
vectorizer.fit(trainingdata)
trainvec = vectorizer.transform(trainingdata)
testvec = vectorizer.transform(testingdata)


## SVM Model

In [69]:
clf=SVC(kernel='linear')  # the kernel can be changed; the linear kernel was selected here


In [70]:
# Train the classifier on the TF-IDF vectors of the training messages.
clf.fit(trainvec, y_train)

SVC(kernel='linear')

In [71]:

# Predict labels for the held-out messages and print the per-class metrics.
prediction = clf.predict(testvec)
report = classification_report(y_test, prediction)
print(report)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.97      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



## Naive Bayes

In [72]:
# Rebinds `clf`: from here on the name refers to this Naive Bayes model,
# not the SVM that was assigned to it earlier.
clf=MultinomialNB()


In [73]:
# Train the Naive Bayes model on the same TF-IDF training vectors.
clf.fit(trainvec, y_train)

MultinomialNB()

In [74]:
# Predict labels for the held-out messages and print the per-class metrics.
prediction = clf.predict(testvec)
report = classification_report(y_test, prediction)
print(report)

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



## Presentation

In [75]:
from tkinter import *
from tkinter.ttk import Combobox

In [85]:

def find1():
    """Classify the text typed into the input box and show the result.

    Reads the module-level widgets ``txt`` (input) and ``result_label``
    (output), and uses the module-level ``preprocessing`` function, the
    fitted ``vectorizer`` and whichever model ``clf`` is currently bound to.
    Returns nothing; the outcome is written into ``result_label``.
    """
    # "end-1c" leaves out the trailing newline the Text widget always appends.
    input_text = txt.get("1.0", "end-1c")

    # An empty or whitespace-only box would still be scored and shown as if it
    # were a real prediction, so report that there is nothing to classify.
    if not input_text.strip():
        result_label.config(text="prediction: (no text entered)")
        return

    # Apply the same cleaning that was applied to the training data.
    preprocessed_input = preprocessing(input_text)

    # Encode with the already fitted vectorizer (transform only, no refit).
    input_vec = vectorizer.transform([preprocessed_input])

    # Predict with the trained model and display the single predicted label.
    prediction = clf.predict(input_vec)
    result_label.config(text=f"prediction: {prediction[0]}")
    
# Main window: title, size and background colour.
ws = Tk()
ws.title("Spam/Not Spam Mail")
ws.geometry("800x800")
ws.configure(bg='#008080')

# Framed heading shown above the input box.
border1 = LabelFrame(ws, bg="black")
border1.pack(pady=10)
Label(border1, text='Write Mail in Below Box:', bg="#483D8B", font="15").pack()

# Main search button: runs find1 on click.
# The widget is created first and packed on a separate line because pack()
# returns None; chaining the two would leave `button` bound to None.
button = Button(
    ws,
    text="search",
    command=find1,
    width=20,
    height=5,
    bg="#483D8B",
    activebackground='#483D8B',
    font="15",
)
button.pack(side=BOTTOM)

# Frame packed on the right-hand side; no child widgets are added to it here.
border = LabelFrame(ws, bg="#800000")
border.pack(pady=10, side=RIGHT)

# Multi-line input box read by find1.
txt = Text(ws)
txt.pack(pady=20)

# Result label: find1 writes the prediction text into it.
result_label = Label(ws, text="", font="15")
result_label.pack(pady=10)

# Start the Tk event loop; this call blocks until the window is closed.
ws.mainloop()