## Importing neccessary libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import re
import nltk
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('spam.csv', encoding='latin-1',usecols=['v1','v2'])

In [3]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [5]:
df.tail(10)

Unnamed: 0,v1,v2
5562,ham,Ok lor... Sony ericsson salesman... I ask shuh...
5563,ham,Ard 6 like dat lor.
5564,ham,Why don't you wait 'til at least wednesday to ...
5565,ham,Huh y lei...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [6]:
df['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## Data Preprocessing.

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
encoder=LabelEncoder()
df['v1']=encoder.fit_transform(df['v1'])

In [10]:
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
def preprocess_text(text):
    text=text.lower()
    text=re.sub(r'[^a-zA-Z\s]',' ',text)
    return text

In [12]:
df['v2']=df['v2'].apply(preprocess_text)

In [13]:
df

Unnamed: 0,v1,v2
0,0,go until jurong point crazy available only ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i don t think he goes to usf he lives aro...
...,...,...
5567,1,this is the nd time we have tried contact u...
5568,0,will b going to esplanade fr home
5569,0,pity was in mood for that so any other s...
5570,0,the guy did some bitching but i acted like i d...


In [14]:
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sravansakhamuri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sravansakhamuri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def tokenize(text):
    text=word_tokenize(text)
    return text

In [16]:
df['v2']=df['v2'].apply(tokenize)

In [17]:
df

Unnamed: 0,v1,v2
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, i, don, t, think, he, goes, to, usf, he,..."
...,...,...
5567,1,"[this, is, the, nd, time, we, have, tried, con..."
5568,0,"[will, b, going, to, esplanade, fr, home]"
5569,0,"[pity, was, in, mood, for, that, so, any, othe..."
5570,0,"[the, guy, did, some, bitching, but, i, acted,..."


In [18]:
def filter(tokens):
    filtered_tokens=[word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

In [19]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sravansakhamuri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
stop_words=set(stopwords.words('english'))
df['v2']=df['v2'].apply(filter)

In [21]:
df

Unnamed: 0,v1,v2
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, think, goes, usf, lives, around, though]"
...,...,...
5567,1,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,0,"[b, going, esplanade, fr, home]"
5569,0,"[pity, mood, suggestions]"
5570,0,"[guy, bitching, acted, like, interested, buyin..."


In [22]:
lemmatizer=WordNetLemmatizer()
def lemma(tokens):
    lst=[lemmatizer.lemmatize(token) for token in tokens]
    return lst

In [23]:
df['v2']=df['v2'].apply(lemma)

In [24]:
df

Unnamed: 0,v1,v2
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, think, go, usf, life, around, though]"
...,...,...
5567,1,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,0,"[b, going, esplanade, fr, home]"
5569,0,"[pity, mood, suggestion]"
5570,0,"[guy, bitching, acted, like, interested, buyin..."


In [25]:
def joined(text):
    text=' '.join(text)
    return text

In [26]:
df['v2']=df['v2'].apply(joined)

In [27]:
df

Unnamed: 0,v1,v2
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts st ...
3,0,u dun say early hor u c already say
4,0,nah think go usf life around though
...,...,...
5567,1,nd time tried contact u u pound prize claim ea...
5568,0,b going esplanade fr home
5569,0,pity mood suggestion
5570,0,guy bitching acted like interested buying some...


## Feature extraction.

In [28]:
vectorizer=TfidfVectorizer()
x=vectorizer.fit_transform(df['v2']).toarray()

In [29]:
x.shape

(5572, 7017)

In [30]:
x.max()

1.0

In [31]:
feature_names = vectorizer.get_feature_names_out()
for i, feature in enumerate(feature_names):
    df[feature] = x[:, i]

In [32]:
df

Unnamed: 0,v1,v2,aa,aah,aaniye,aaooooright,aathi,ab,abbey,abdomen,...,zed,zero,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0,go jurong point crazy available bugis n great ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,ok lar joking wif u oni,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,free entry wkly comp win fa cup final tkts st ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,u dun say early hor u c already say,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,nah think go usf life around though,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1,nd time tried contact u u pound prize claim ea...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0,b going esplanade fr home,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0,pity mood suggestion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0,guy bitching acted like interested buying some...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
df=df.drop('v2',axis=1)

In [34]:
df

Unnamed: 0,v1,aa,aah,aaniye,aaooooright,aathi,ab,abbey,abdomen,abeg,...,zed,zero,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Splitting data and model training.

In [35]:
cols=df.drop('v1',axis=1).columns

In [36]:
x=df[cols]
y=df['v1']

In [37]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,stratify=y,test_size=0.25)

In [38]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [39]:
nb_clf=MultinomialNB()
nb_clf.fit(x_train,y_train)

In [40]:
y_pred_nb=nb_clf.predict(x_test)

In [41]:
accuracy_score(y_test,y_pred_nb)#Test data accuracy.

0.9655419956927495

In [42]:
y_pred_nb_train=nb_clf.predict(x_train)

In [43]:
accuracy_score(y_train,y_pred_nb_train)#Train Data accuracy.

0.9770279971284996

In [44]:
rf_clf=RandomForestClassifier(random_state=42)
rf_clf.fit(x_train,y_train)

In [45]:
y_pred_rf=rf_clf.predict(x_test)

In [46]:
accuracy_score(y_test,y_pred_rf)#Test Data accuracy

0.9741564967695621

In [47]:
y_pred_rf_train=rf_clf.predict(x_train)

In [48]:
accuracy_score(y_train,y_pred_rf_train)#Train Data accuracy.

1.0