Importing the Dependencies

In [85]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re

Data Collection & Pre-Processing

In [2]:
#Mounting DRIVE
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# loading the data from csv file to a pandas Dataframe

raw_mail_data = pd.read_csv('/content/drive/MyDrive/mail_data.csv')

In [4]:
raw_mail_data2=pd.read_csv('/content/drive/MyDrive/email_spam.csv')

In [5]:
print(raw_mail_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [6]:
raw_mail_data2.head()


Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam


In [7]:
raw_mail_data.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [8]:
raw_mail_data2.isnull().sum()

Unnamed: 0,0
title,0
text,0
type,0


In [9]:
# replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')


In [10]:
# printing the first 5 rows of the dataframe
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
#merging columns of the mails
raw_mail_data2['alltxt']=raw_mail_data2['title']+raw_mail_data2['text']
raw_mail_data2.drop(['title','text'],axis=1,inplace=True)

In [12]:
# removing /n

df=raw_mail_data2.copy()
# Function to remove newlines
def remove_newlines(text):
    return " ".join(text.split("\n"))

# Apply the function to the 'text' column
df['alltxt'] = df['alltxt'].apply(remove_newlines)


In [22]:
#mail_data2=df.copy()
mail_data2.head()

Unnamed: 0,Category,Message
0,0,"?? the secrets to SUCCESSHi James, Have you c..."
1,1,?? You Earned 500 GCLoot Points alt_text Congr...
2,1,?? Your GitHub launch codeHere's your GitHub l...
3,1,[The Virtual Reward Center] Re: ** Clarificati...
4,0,"10-1 MLB Expert Inside, Plus Everything You Ne..."


In [15]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [189]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(5572, 2)

In [190]:
mail_data2.shape

(84, 2)

Label Encoding

In [18]:
# label spam mail as 0;  ham mail as 1;

mail_data.loc[mail_data['Category'] == 'spam', 'Category',] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category',] = 1

spam  -  0

ham  -  1

In [19]:
mail_data2.loc[mail_data2['type'] == 'spam', 'type',] = 0
mail_data2.loc[mail_data2['type'] == 'not spam', 'type',] = 1

In [23]:
# Combine the two dataframes row-wise
combined_df = pd.concat([mail_data, mail_data2], axis=0, ignore_index=True)

# Display the combined dataframe
combined_df

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5651,1,Your application for the position of Child Pr...
5652,1,Your Kilimall Account is Ready - Shopping Now!...
5653,1,Your Steam account: Access from new web or mob...
5654,1,Your uploaded document is rejectedView In Brow...


In [55]:
# Randomize the combined dataframe
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

# Display the randomized dataframe
combined_df

Unnamed: 0,Category,Message
0,1,"HEY THERE BABE, HOW U DOIN? WOT U UP 2 2NITE L..."
1,1,Kallis is ready for bat in 2nd innings
2,1,Will you be here for food
3,1,"Just got some gas money, any chance you and th..."
4,1,ARE YOU IN TOWN? THIS IS V. IMPORTANT
...,...,...
5651,1,Ok anyway no need to change with what you said
5652,0,"Loan for any purpose £500 - £75,000. Homeowner..."
5653,1,Also maaaan are you missing out
5654,0,Limited offer. Hurry up! Less than $1 for any ...


In [56]:
# separating the data as texts and label

X = combined_df['Message']

Y = combined_df['Category']

In [57]:
print(X)

0       HEY THERE BABE, HOW U DOIN? WOT U UP 2 2NITE L...
1                  Kallis is ready for bat in 2nd innings
2                               Will you be here for food
3       Just got some gas money, any chance you and th...
4                   ARE YOU IN TOWN? THIS IS V. IMPORTANT
                              ...                        
5651       Ok anyway no need to change with what you said
5652    Loan for any purpose £500 - £75,000. Homeowner...
5653                      Also maaaan are you missing out
5654    Limited offer. Hurry up! Less than $1 for any ...
5655    Thk shld b can... Ya, i wana go 4 lessons... H...
Name: Message, Length: 5656, dtype: object


In [58]:
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
5651    1
5652    0
5653    1
5654    0
5655    1
Name: Category, Length: 5656, dtype: object


Splitting the data into training data & test data

In [59]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2527)

In [60]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5656,)
(4524,)
(1132,)


Feature Extraction

In [75]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 3, ngram_range=(1,3),stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [76]:
print(X_train)

2622                           Tmrw. Im finishing 9 doors
12      For sale - arsenal dartboard. Good condition b...
5216    Get ready for Test [Co-Deo]Hello Sathya,  You ...
4470    Tyler (getting an 8th) has to leave not long a...
3371          Now project pa. After that only i can come.
                              ...                        
2650                       what are your new years plans?
825     How are you, my Love ? Are you with your broth...
613                   Sir Goodmorning, Once free call me.
5180            Uh, heads up we don't have THAT much left
76      Haven't found a way to get another app for you...
Name: Message, Length: 4524, dtype: object


In [None]:
print(X_train_features)

Training the Model

Logistic Regression

In [77]:
model = LogisticRegression()

In [78]:
# training the Logistic Regression model with the training data
model.fit(X_train_features, Y_train)

Evaluating the trained model

In [79]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [80]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9708222811671088


In [81]:
# prediction on test data

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [82]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9708480565371025


Building a Predictive System

In [192]:
#Compact form of Code
# and use the feature extraction too improve this code so it works

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re
from google.colab import drive
import ipywidgets as widgets
from IPython.display import display, clear_output

# Mount Google Drive
drive.mount('/content/drive')

# Load the data
raw_mail_data = pd.read_csv('/content/drive/MyDrive/mail_data.csv')
raw_mail_data2 = pd.read_csv('/content/drive/MyDrive/email_spam.csv')

# ... (rest of your data preprocessing code from the original script) ...

# Feature Extraction
feature_extraction = TfidfVectorizer(min_df=3, ngram_range=(1, 3), stop_words='english', lowercase=True)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Training the Model (Logistic Regression)
model = LogisticRegression()
model.fit(X_train_features, Y_train)

# GUI elements
input_text = widgets.Text(description="Enter Text:")
output_label = widgets.Label(value="")
button_predict = widgets.Button(description="Predict")

# Prediction function
def on_button_clicked(b):
    text = input_text.value
    input_features = feature_extraction.transform([text])
    prediction = model.predict(input_features)[0]
    if prediction == 1:
        output_label.value = "Ham"
    else:
        output_label.value = "Spam"
    clear_output(wait=True)
    display(input_text, button_predict, output_label)


#Spam word detection
spam_words = ["free", "urgent", "gift","adult","prize","sex", "immediately"]

def on_text_change(change):
    text = change.new
    for word in spam_words:
      if word in text.lower():
          output_label.value = "Potentially spam due to the presence of '"+ word +"'"

          return 0;
    output_label.value = ""  # Reset the output if no spam words are found



button_predict.on_click(on_button_clicked)
input_text.observe(on_text_change, names='value')

# Display GUI elements
display(input_text, button_predict, output_label)

Text(value='This is the 2nd time we have tried 2 contact u. U have won the Â£750 Pound prize. 2 claim is easy,…

Button(description='Predict', style=ButtonStyle())

Label(value='Spam')