# **Email Spam Detection with Machine Learning**

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Read the raw spam dataset into a DataFrame.
# NOTE(review): encoding='latin-1' presumably works around bytes that are not
# valid UTF-8 in the CSV — confirm against the source file.
data = pd.read_csv(
    '/content/spam.csv',
    encoding='latin-1',
)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Show full cell contents instead of truncating long message text
pd.set_option('display.max_colwidth', None)

# Drop the unneeded columns and give the two remaining ones readable names.
# `data` is rebound to the result (no inplace=True) and errors='ignore' is
# passed to drop(), so re-running this cell is a no-op rather than a KeyError
# on the already-removed columns. rename() ignores missing keys by default.
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = (
    data
    .drop(columns=columns_to_drop, errors='ignore')
    .rename(columns={'v1': 'Category', 'v2': 'Message'})
)

# Display the first few rows of the modified DataFrame
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [None]:
# Print the column dtypes, non-null counts and memory usage of `data`
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Summary statistics (count / unique / top / freq) for the object columns
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [None]:
# Build `df` from `data`, substituting an empty string for every missing cell
not_null_mask = pd.notnull(data)
df = data.where(not_null_mask, '')

In [None]:
# Count the missing values left in each column of `df`
df.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# Preview the first ten rows
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8,spam,WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


In [None]:
 # Preview the last five rows.
 # NOTE(review): this cell carries a stray leading space; IPython tolerates it,
 # but a plain .py export would raise IndentationError — consider removing it.
 df.tail()

Unnamed: 0,Category,Message
5567,spam,"This is the 2nd time we have tried 2 contact u. U have won the å£750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate."
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other suggestions?"
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free
5571,ham,Rofl. Its true to its name


In [None]:
# Checking the number of rows and columns
df.shape

(5572, 2)

In [None]:
# Text preprocessing functions
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens.

    An empty or whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens

def remove_stopwords(text, stopwords):
    """Return the items of ``text`` that are not in ``stopwords``, in order.

    Despite its name, ``text`` is iterated item by item, so callers pass an
    already-tokenized sequence. ``stopwords`` is any container supporting
    ``in``. Duplicates in ``text`` are preserved.
    """
    kept = []
    for word in text:
        if word in stopwords:
            continue
        kept.append(word)
    return kept

def extract_features(text, stopwords, vectorizer):
    """Turn one raw message into a feature row using ``vectorizer``.

    The message is lower-cased, split on whitespace, stripped of stopwords and
    re-joined into a single string before being vectorized.

    Parameters
    ----------
    text : str
        The raw message.
    stopwords : container of str
        Lower-case words to discard.
    vectorizer : object
        Anything exposing ``transform(iterable_of_documents)``.
        Presumably an already-fitted ``TfidfVectorizer`` — confirm with callers.

    Returns
    -------
    Whatever ``vectorizer.transform`` returns for a one-document batch.
    """
    text = text.lower()  # stopword matching below is case-sensitive
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens, stopwords)
    # A text vectorizer expects each document to be a string. Passing the token
    # list itself (as the previous code did) makes a default TfidfVectorizer
    # fail when it tries to call .lower() on the list.
    document = " ".join(tokens)
    features = vectorizer.transform([document])
    return features

In [None]:
# Hand-written stopword set (a fuller list from NLTK or a downloaded file could
# replace it). Entries are unique and listed alphabetically.
# NOTE(review): not referenced elsewhere in the visible notebook — the
# TfidfVectorizer below is configured with stop_words='english' instead.
stopwords = {
    "a", "about", "after", "all", "also", "an", "and", "any", "are", "as", "at",
    "back", "be", "because", "both", "but", "by",
    "can", "come", "could",
    "day", "did", "do", "does",
    "each", "even",
    "few", "first", "for",
    "get", "give", "go", "good",
    "had", "has", "have", "hers", "his", "how",
    "i", "if", "in", "into", "is", "it", "its",
    "know",
    "look",
    "make", "may", "me", "might", "mine", "more", "most", "move", "must", "my",
    "new", "no", "nor", "not", "now",
    "of", "on", "only", "or", "other", "our", "ours", "out", "over",
    "people",
    "see", "should", "some", "such",
    "take", "than", "that", "the", "their", "them", "then", "these", "think",
    "this", "those", "to", "two",
    "up", "us", "use",
    "want", "was", "way", "we", "well", "were", "what", "when", "where",
    "which", "who", "why", "will", "with", "work", "would",
    "year", "you", "your",
}

* Label Encoding

In [None]:
# Label spam mail as 0 and ham (not spam) mail as 1.
# Both masks are computed up front; they select disjoint rows, so the order of
# the two assignments does not matter.
is_spam = df['Category'] == 'spam'
is_ham = df['Category'] == 'ham'
df.loc[is_spam, 'Category'] = 0
df.loc[is_ham, 'Category'] = 1

Spam is represented as 0

Ham is represented as 1

In [None]:
# Split the frame into the message text (x) and its label (y)
x, y = df['Message'], df['Category']

In [None]:
# Show the message texts
print(x)

0                                                         Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
1                                                                                                                                           Ok lar... Joking wif u oni...
2             Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3                                                                                                                       U dun say so early hor... U c already then say...
4                                                                                                           Nah I don't think he goes to usf, he lives around here though
                                                                                      ...                                                             

In [None]:
# Show the encoded labels (0 = spam, 1 = ham)
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


* Train-Test Split

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Sizes of the full set, the training split and the test split, in that order
for series in (x, x_train, x_test):
    print(series.shape)

(5572,)
(4457,)
(1115,)


* Feature Extraction by using TfidfVectorizer (Convert text into numerical values)

In [None]:
# TF-IDF vectorizer that turns message text into numeric feature vectors for
# the Logistic Regression model.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit on the training messages only, then apply the same fitted vectorizer
# to the test messages.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [None]:
# The label Series are object-typed after encoding; cast them to integers
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [None]:
# Show the raw training messages (index order reflects the shuffled split)
print(x_train)


3075                                        Mum, hope you are having a great day. Hoping this text meets you well and full of life. Have a great day. Abiola
1787                                                                                                                              Yes:)sura in sun tv.:)lol.
1614                                                                                                 Me sef dey laugh you. Meanwhile how's my darling anjie!
4304                                                                                                                   Yo come over carlos will be here soon
3266                                                                                                                       Ok then i come n pick u at engin?
                                                                                ...                                                                         
789                                                       

In [None]:
# Show the TF-IDF training matrix; each printed entry is (row, column) followed by its weight
print(x_train_features)

  (0, 4513)	0.2909649098524696
  (0, 3380)	0.21807195185332803
  (0, 3262)	0.25877035357606315
  (0, 3136)	0.440116181574609
  (0, 2122)	0.38613577623520473
  (0, 3386)	0.3219352588930141
  (0, 6599)	0.20296878731699391
  (0, 4296)	0.3891385935794867
  (0, 3979)	0.2410582143632299
  (0, 741)	0.3219352588930141
  (1, 7443)	0.35056971070320353
  (1, 6442)	0.5652509076654626
  (1, 6417)	0.4769136859540388
  (1, 6872)	0.4306015894277422
  (1, 4061)	0.380431198316959
  (2, 5825)	0.4917598465723273
  (2, 2226)	0.413484525934624
  (2, 3917)	0.40088501350982736
  (2, 2109)	0.42972812260098503
  (2, 933)	0.4917598465723273
  (3, 7453)	0.5202633571003087
  (3, 1842)	0.3708680641487708
  (3, 1599)	0.5927091854194291
  (3, 6140)	0.4903863168693604
  (4, 1842)	0.36051481797205776
  :	:
  (4452, 4636)	0.4030918768627523
  (4453, 1762)	0.45610005640082985
  (4453, 7273)	0.5787739591782677
  (4453, 999)	0.6760129013031282
  (4454, 5370)	0.42618909997886
  (4454, 7346)	0.31166263834107377
  (4454, 1049

In [None]:
# Logistic Regression classifier with scikit-learn's default settings
model = LogisticRegression()

In [None]:
# Train the Logistic Regression model on the TF-IDF training features and
# their integer labels
model.fit(x_train_features, y_train)

* Evaluating the trained model

In [None]:
# Score the model on the data it was trained on (an optimistic baseline)
prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Report the fraction of training messages classified correctly
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9661207089970832


In [None]:
# Score the model on the held-out test split
prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
# Report the fraction of held-out test messages classified correctly
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9623318385650225


* Predictive System

In [None]:
# Read one message from the user, vectorize it with the fitted TF-IDF
# vectorizer and classify it with the trained model.
input_email = input("Enter an email to check for spam: ")
input_data_features = feature_extraction.transform([input_email])
prediction = model.predict(input_data_features)
print(prediction)

# Label convention used throughout: 1 = ham, anything else (0) = spam
verdict = (
    "This email is likely ham."
    if prediction[0] == 1
    else "This email is likely spam."
)
print(verdict)

Enter an email to check for spam: Ok then i come n pick u at engin?Ok then i come n pick u at engin?
[1]
This email is likely ham.
