In [1]:
import numpy as np

In [2]:
import pandas as pd
import tkinter as tk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
pd.__version__

'1.4.2'

In [4]:
#READING THE DATASET

In [5]:
df = pd.read_csv('spam.csv',encoding='ISO-8859-1')

In [6]:
print(df)

        v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  
1           NaN        NaN  


In [7]:
data = df.where((pd.notnull(df)))

In [8]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
data = data.drop(columns=data.columns[2:5])
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
data['v1'] = data['v1'].map({'ham': 1, 'spam': 0})

In [11]:
X = data['v2']
Y = data['v1']

In [12]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object


In [13]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: int64


In [14]:
# Splitting data into training and testing sets

In [15]:
X_train,X_test,Y_train,Y_test = train_test_split(X , Y , train_size = 0.2 , random_state = 3)

In [16]:
print(X.shape)

(5572,)


In [17]:
print(X_train.shape)

(1114,)


In [18]:
print(X_test.shape)

(4458,)


USING LOGISTIC REGRESSION

In [19]:
# Feature extraction using TF-IDF vectorizer

In [20]:
feature_extraction = TfidfVectorizer(min_df = 1 , stop_words = 'english' , lowercase = 'True')

In [21]:
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [22]:
print(X_train_features.shape)

(1114, 3265)


In [23]:
print(X_test_features.shape)

(4458, 3265)


In [24]:
print(X_train_features)

  (0, 933)	0.38169343003798417
  (0, 2140)	0.3068986851823308
  (0, 2095)	0.46046052956669653
  (0, 529)	0.46046052956669653
  (0, 464)	0.41687910647850807
  (0, 2575)	0.4028490219712474
  (1, 1708)	0.23659816720475793
  (1, 1901)	0.2553279714810523
  (1, 778)	0.241919438558794
  (1, 3105)	0.19119132397661817
  (1, 1687)	0.20428232263172857
  (1, 280)	0.29184246839191025
  (1, 2977)	0.29184246839191025
  (1, 2632)	0.29184246839191025
  (1, 1779)	0.27568454610539556
  (1, 1390)	0.5836849367838205
  (1, 2470)	0.2642203177983341
  (2, 1686)	0.538775831521014
  (2, 2086)	0.538775831521014
  (2, 1928)	0.4579526213145197
  (2, 2894)	0.4579526213145197
  (3, 1413)	0.33361730229303815
  (3, 1648)	0.33361730229303815
  (3, 3001)	0.19105862345946986
  (3, 1732)	0.23223351021508729
  :	:
  (1109, 889)	0.321090158511031
  (1110, 398)	0.6191306903052041
  (1110, 3148)	0.6191306903052041
  (1110, 742)	0.48306767294490216
  (1111, 875)	0.3961142858659216
  (1111, 844)	0.4219157937260474
  (1111, 2276

In [25]:
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [26]:
print(Y_train)

2309    1
3727    1
385     1
2300    1
4857    1
       ..
789     1
968     1
1667    1
3321    1
1688    1
Name: v1, Length: 1114, dtype: int32


In [27]:
# Model training

In [28]:
model = LogisticRegression()

In [29]:
model.fit(X_train_features, Y_train)

LogisticRegression()

In [30]:
prediction_on_trainingdata = model.predict(X_train_features)
accuracy_on_trainingdata = accuracy_score(Y_train , prediction_on_trainingdata)

In [31]:
print("Accuracy on training dataset is: ",accuracy_on_trainingdata)

Accuracy on training dataset is:  0.9210053859964094


In [32]:
testing_data_prediction = model.predict(X_test_features)

testing_data_accuracy = accuracy_score(Y_test , testing_data_prediction)

In [33]:
print("Accuracy on testing dataset:", testing_data_accuracy)

Accuracy on testing dataset: 0.9062359802602064


In [34]:
input = ["I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."]
input = feature_extraction.transform(input)
predict = model.predict(input)
if(predict[0] == 1):
    print("The mail is a HAM mail")
else:
    print("The mail is a SPAM mail")

The mail is a HAM mail


In [39]:
def classify_sms():
    input_text = entry.get()  # Geting text from the entry field
    input_vectorized = feature_extraction.transform([input_text])  # Vectorizing the input text
    
    # Performing prediction using the model
    prediction = model.predict(input_vectorized)
    result = "HAM" if prediction[0] == 1 else "SPAM"
    result_label.config(text=f"The sms is a {result} mail")

# Creating the GUI window

root = tk.Tk()
root.title("Email Spam Classifier")

# Creating an input field
entry = tk.Entry(root, width=50)
entry.pack(padx=10, pady=10)

# Creating a button to classify the email
classify_button = tk.Button(root, text="Classify", command=classify_sms)
classify_button.pack(padx=10, pady=5)

# Creating a label to display the result
result_label = tk.Label(root, text="")
result_label.pack(padx=10, pady=5)

# Running the main Tkinter event loop
root.mainloop()