# **<span style='color:Yellow'>Importing Libraries</span>**


In [1]:
import pandas as pd
import numpy as np 

# **<span style='color:Yellow'>----------------------------------------------------------------------</span>**

# **<span style='color:Yellow'>Importing Dataset</span>**


In [2]:
df = pd.read_csv("C:\\Users\\moham\\Downloads\\spam.csv",encoding='latin1')
df.shape

(5572, 5)

# **<span style='color:Yellow'>----------------------------------------------------------------------</span>**

# **<span style='color:Yellow'>Data Cleaning and Preprocessing</span>**


In [3]:
df.sample(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
4107,ham,Pls send me your address sir.,,,
3995,ham,We'll you pay over like &lt;#&gt; yrs so its ...,,,
3304,ham,Ee msg na poortiyagi odalebeku: Hanumanji 7 na...,,,
4831,ham,Rats. Hey did u ever vote for the next themes?,,,
2421,ham,Err... Cud do. I'm going to at 8pm. I haven't...,,,
1804,ham,The bus leaves at &lt;#&gt;,,,
4400,ham,Many times we lose our best ones bcoz we are,,,
476,ham,Love you aathi..love u lot..,,,
3678,ham,Stupid.its not possible,,,
4520,ham,Hi good mornin.. Thanku wish u d same..,,,


#### **<span style='color:Aqua'>Dropping unusable columns</span>**


In [4]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)
df.sample(3)

Unnamed: 0,v1,v2
2315,ham,That's significant but dont worry.
1194,ham,Ok... C ya...
3457,ham,Have your lunch and come quickly and open the ...


#### **<span style='color:Aqua'>Renaming columns</span>**


In [5]:
df.rename(columns={'v1':'label','v2':'message'},inplace=True)
df.sample(3)

Unnamed: 0,label,message
1599,ham,"Yeah probably, I still gotta check out with leo"
3747,ham,"A bit of Ur smile is my hppnss, a drop of Ur t..."
1968,ham,2 laptop... I noe infra but too slow lar... I ...


#### **<span style='color:Aqua'>Checking for duplicated values</span>**


In [6]:
df.duplicated().sum()

403

#### **<span style='color:Aqua'>Dropping duplicated values</span>**

In [7]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5169 non-null   object
 1   message  5169 non-null   object
dtypes: object(2)
memory usage: 121.1+ KB


#### **<span style='color:Aqua'>Lowercasing whole corpus</span>**


In [9]:
df['message'] = df['message'].apply(lambda x:x.lower())

In [10]:
df.sample(2)

Unnamed: 0,label,message
703,ham,"thats a bit weird, even ?- where is the do sup..."
1691,ham,i don't know but i'm raping dudes at poker


# **<span style='color:Yellow'>----------------------------------------------------------------------</span>**

# **<span style='color:Yellow'>Feature Generation</span>**


#### **<span style='color:Aqua'>Generating column containing no. of characters in message</span>**


In [11]:
df['character_count'] = df['message'].apply(lambda x:len(x))

In [12]:
df.sample()

Unnamed: 0,label,message,character_count
3834,ham,i'm thinking that chennai forgot to come for a...,54


#### **<span style='color:Aqua'>Generating column containing no. of punctuation marks</span>**

In [13]:
import string
punc = string.punctuation.replace("'",'').replace('"','')
punc

'!#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
def Count_Punctuation(arg):
    count = 0
    for k in arg:
        if k in punc:
            count = count + 1
    return count  
Count_Punctuation('hey guys!!!, how are you?? glad to see you :). Don"t you get tired?')

10

In [15]:
df['punctuation_count'] = df['message'].apply(Count_Punctuation)
df.sample()

Unnamed: 0,label,message,character_count,punctuation_count
3697,ham,s:)but he had some luck.2 catches put down:),44,5


#### **<span style='color:Aqua'>Generating column containing no. of stopwords in message</span>**

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords
words = stopwords.words('english')

In [18]:
words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
def Remove_Punctuation(arg):
    for k in arg:
        if k in punc:
            arg = arg.replace(k,'')
    return arg 
Remove_Punctuation("hey !! guys, how are you all?? fine? Nice :) and you?") 

'hey  guys how are you all fine Nice  and you'

In [20]:
def Count_Stopwords(arg):
    arg = Remove_Punctuation(arg).replace('  ',' ')
    split = arg.split(' ')
    temp = 0
    for k in split:
        if k in words:
            temp = temp + 1
    return temp 
Count_Stopwords("you? you are you")

4

In [21]:
df['stopword_count'] = df['message'].apply(Count_Stopwords)
df.sample(3)

Unnamed: 0,label,message,character_count,punctuation_count,stopword_count
953,ham,also remember to get dobby's bowl from your car,47,0,3
2119,ham,i hope you know i'm still mad at you.,37,1,4
55,ham,do you know what mallika sherawat did yesterda...,76,6,6


#### **<span style='color:Aqua'>Generating column containing no. of words in message</span>**


In [22]:
df['word_count'] = df['message'].apply(lambda x:len(x.split(' ')))
df.sample(4)

Unnamed: 0,label,message,character_count,punctuation_count,stopword_count,word_count
3489,ham,huh but i got lesson at 4 lei n i was thinkin ...,105,3,10,27
5464,ham,i will treasure every moment we spend together...,49,3,3,8
2995,ham,they released vday shirts and when u put it on...,112,1,9,21
3864,ham,"thatåõs alrite girl, u know gail is neva wrong...",104,5,2,18


#### **<span style='color:Aqua'>Generating column containing no. of unique words in message</span>**


In [23]:
def Count_UniqueWords(arg):
    arg = Remove_Punctuation(arg).lower()
    temp = []
    for k in arg.split(' '):
        if k not in temp:
            temp.append(k)
    return len((temp))

Count_UniqueWords('I am Qamar, Qamar I am.')

3

In [24]:
df['unique_word__count'] = df['message'].apply(Count_UniqueWords)
df.sample(5)

Unnamed: 0,label,message,character_count,punctuation_count,stopword_count,word_count,unique_word__count
914,ham,"call me da, i am waiting for your call.",39,2,5,9,8
1852,spam,this is the 2nd time we have tried 2 contact u...,154,7,11,31,26
4025,ham,oh ok.. wat's ur email?,23,3,0,5,5
2320,ham,this pain couldn't have come at a worse time.,45,1,5,9,9
2912,ham,kindly send some one to our flat before &lt;d...,64,5,4,12,11


*<span style='color:Orange'>Now that we are done with feature engineering,</span>*
*<span style='color:Orange'> we'll now move to machine learning part</span>*



# **<span style='color:Yellow'>----------------------------------------------------------------------</span>**

# **<span style='color:Yellow'>Getting data ready for Machine Learning Model</span>**

##### **<span style='color:Aqua'>Preparing output column</span>**


In [25]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])

In [26]:
df['label'][2]

1

##### **<span style='color:Aqua'>We'll try many techniques like "Bag of Words", "N-Grams" etc. and select the best result</span>**


In [27]:
from sklearn.feature_extraction.text import CountVectorizer 
bow     = CountVectorizer()
bi_gram = CountVectorizer(ngram_range=(2,2)) 

In [28]:
df_bow = pd.DataFrame(bow.fit_transform(df['message']).toarray())
df_bow.shape 

(5169, 8672)

In [29]:
pd.DataFrame(df_bow )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8662,8663,8664,8665,8666,8667,8668,8669,8670,8671
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### **<span style='color:Aqua'>Out of these 2 datasets (bigram and bag of words), we'll use "Bag of Words", because it gives better accuracy and precision</span>**


# **<span style='color:Yellow'>----------------------------------------------------------------------</span>**

# **<span style='color:Yellow'>Pipelining and dumping</span>**

In [30]:
from sklearn.model_selection import train_test_split 
X_train,X_test,y_train,y_test = train_test_split(df_bow,df['label'],test_size=0.2,random_state=0)
print(f"X_train shape : {X_train.shape}\nX_test shape : {X_test.shape}\ny_train shape : {y_train.shape}\ny_test shape : {y_test.shape}")

X_train shape : (4135, 8672)
X_test shape : (1034, 8672)
y_train shape : (4135,)
y_test shape : (1034,)


In [31]:
from sklearn.ensemble import RandomForestClassifier 
rf3 = RandomForestClassifier()
rf3.fit(X_train,y_train)
ypred = rf3.predict(X_test)

from sklearn.metrics import accuracy_score,precision_score,confusion_matrix
print('accuracy score is : ',int(accuracy_score(ypred,y_test)*100),'%')

print('confusion matrix : \n',confusion_matrix(ypred,y_test))

print("precision score is :\n",precision_score(ypred,y_test)) 

accuracy score is :  97 %
confusion matrix : 
 [[885  23]
 [  0 126]]
precision score is :
 0.8456375838926175


In [32]:
import pickle 
pickle.dump(bow,open('Vectorizer.pkl','wb'))
pickle.dump(rf3,open('RandomForest.pkl','wb'))

In [33]:
import os
print(os.getcwd())

c:\Users\moham\OneDrive\Desktop\ML_Training\SMS Spam Classifier
