In [0]:
##This block is only for access of files using google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

#For accessing any file from google drive, first share it for public access. Copy its id from last part of its address. Then specify the two lines below.
downloaded = drive.CreateFile({'id':"11udsdq6pPymfbAE213Zfry14SgD0QOQl"})   # replace the id with id of file you want to access
downloaded.GetContentFile('spam.csv')        # replace the file name with your file


In [0]:
#import libraries
import pandas as pd
import numpy as np
import string

#import the data file
filename = 'spam.csv' 

df_sms = pd.read_csv('spam.csv',encoding='latin-1')
df_sms.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [0]:
#Remove the unwanted columns
df_sms = df_sms.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df_sms = df_sms.rename(columns={"v1":"label", "v2":"sms"})
df_sms.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [0]:
#Print number of records
L = len(df_sms)
print(L)
#Example of accessing a column in pandas dataframe
df_sms.sms

5572


0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: sms, Length: 5572, dtype: object

In [0]:
#Define a Function to convert sms text to Lower case and remove stop words, punctuation and numbers

def preprocess_Text(input_Text):
  input_Text = input_Text.lower();
  stopwords = ['the','what','is','a','an','of', 'that']
  querywords = input_Text.split()

  resultwords  = [word for word in querywords if word not in stopwords]
  result = ' '.join(resultwords)

  exclude = set(string.punctuation)
  result = ''.join(ch for ch in result if ch not in exclude)

  exclude = set('0123456789')
  result = ''.join(ch for ch in result if ch not in exclude)

  return result;
  

In [0]:
#Test the preprocessing funtion
preprocess_Text('Hello, where4 is he.')

'hello where he'

In [0]:
#Preprocess all the sms texts
L = len(df_sms)
for i in range(0,L-1):
  df_sms['sms'][i] = preprocess_Text(df_sms['sms'][i])

In [0]:
#check if preprocessing was applied correctly by looking at any sms
df_sms['sms'][5]

'freemsg hey there darling its been  weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send å£ to rcv'

In [0]:
#Divide the dataframes into training and and testing set
from sklearn.utils import shuffle
df_sms = shuffle(df_sms)
training_Subset = df_sms.iloc[:round(len(df_sms)*0.9),:]   #90% data into training
test_Subset = df_sms.iloc[round(len(df_sms)*0.9):,:]       #10% data into testing
spam_Subset = training_Subset.query('label == "spam"')
ham_Subset = training_Subset.query('label == "ham"');

In [0]:
training_Subset['sms'][37]

'i see letter b on my car'

In [0]:
test_Subset['sms']

1244                        now im going out  dinner soon
4083    quite ok but bit ex u better go eat smth now e...
2023    there any movie theatre i can go to and watch ...
1599       yeah probably i still gotta check out with leo
4557    piss talking someone realise u point this at i...
                              ...                        
335             tadaaaaa i am home babe are you still up 
3585    i am hot n horny and willing i live local to y...
1229                         jus ans me lar ull noe later
3242    pls accept me for one day or am begging you ch...
1670                                            lmaonice 
Name: sms, Length: 557, dtype: object

In [0]:
#combine all text into one large paragraph which shall be used to list unique words
L = len(training_Subset);
all_Text = ""
for i in training_Subset.index:
  all_Text = all_Text + " "+training_Subset['sms'][i];


In [0]:
all_Text



In [0]:
#make a table with all unique words
allWords = all_Text.split()

row_Names = [] 
for i in allWords:
      if not i in row_Names:
          row_Names.append(i);
print(row_Names)



In [0]:
#For each word find inspam probability and in-ham probability
word = '';
inSpamCount = 0;
inHamCount = 0;
columns = ['inSpamProbability','inHamProbability']
probability_Table = pd.DataFrame(index=row_Names, columns=columns)
for word in row_Names:
  inSpamCount = 0;
  inHamCount = 0;
  for i in spam_Subset['sms']:
    if(i.find(word)==0):
      inSpamCount = inSpamCount+1;
  for i in ham_Subset['sms']:
    if(i.find(word)==0):
      inHamCount = inHamCount+1;
  probability_Table.at[word, 'inSpamProbability'] = inSpamCount/len(spam_Subset);
  probability_Table.at[word,'inHamProbability'] = inHamCount/len(ham_Subset);


In [0]:
probability_Table.sort_values("inSpamProbability", axis = 0, ascending = False, 
                 inplace = True, na_position ='first') 

In [0]:
probability_Table

Unnamed: 0,inSpamProbability,inHamProbability
u,0.127407,0.018894
f,0.100741,0.0140553
y,0.0962963,0.0695853
ur,0.0933333,0.00115207
you,0.0933333,0.0232719
...,...,...
iam,0,0
jamstercouk,0,0
logosmusicnews,0,0
videosounds,0,0


In [0]:
#drop rows wherever spam or ham is zero probability
probability_Table = probability_Table[(probability_Table[['inSpamProbability','inHamProbability']] != 0).all(axis=1)]

#Assignment

1. Write a code to determine if a msg is spam or ham (2)
               def check_msg(test_Msg):
                ............
                ..........
2. Write a code to test all messages in test data and determine the following percentages (1)
  * True positive rate
  * True negative rate
  * False positive rate
  * False negative rate
  * Accuracy
  * Error_Rate

In [0]:
#function to detect spam or ham sms using naive bayes
def check_msg(test_msg):
  spamProb=1
  hamProb=1
  for i in test_msg:
    for j in range(len(probability_Table)):
      if i==probability_Table.index[j]:
        spamProb=spamProb*probability_Table.inSpamProbability[j] 
        hamProb=hamProb*probability_Table.inHamProbability[j]
  if spamProb >=hamProb:
    return 'spam'
  else:
    return 'ham'


In [0]:
test_Subset=test_Subset.reset_index(drop=True) #reseting the index of the test subset
test_Subset # with sorted index numbering

Unnamed: 0,label,sms
0,ham,now im going out dinner soon
1,ham,quite ok but bit ex u better go eat smth now e...
2,ham,there any movie theatre i can go to and watch ...
3,ham,yeah probably i still gotta check out with leo
4,ham,piss talking someone realise u point this at i...
...,...,...
552,ham,tadaaaaa i am home babe are you still up
553,spam,i am hot n horny and willing i live local to y...
554,ham,jus ans me lar ull noe later
555,ham,pls accept me for one day or am begging you ch...


In [0]:
#checking if sms is spam or ham
check_msg(test_Subset['sms'][3])

'ham'

In [0]:
#Calculation of True positive rate,True negative rate,False positive rate,False negative rate,Accuracy,Error_Rate

ac=0;
tp=0
tn=0
fp=0
fn=0

for i in range(len(test_Subset)):
  if check_msg(test_Subset['sms'][i])==test_Subset['label'][i]:
    ac=ac+1;
  if test_Subset['label'][i]=='ham' and check_msg(test_Subset['sms'][i])=='ham': #condition for true positive
    tp=tp+1;
  if test_Subset['label'][i]=='ham' and check_msg(test_Subset['sms'][i])=='spam': #condition for false negative
    fn=fn+1; 
  if test_Subset['label'][i]=='spam' and check_msg(test_Subset['sms'][i])=='ham': #condition for false positive
    fp=fp+1;
  if test_Subset['label'][i]=='spam' and check_msg(test_Subset['sms'][i])=='spam': #condition for true negative
    tn=tn+1;
accuracy=ac/len(test_Subset)*100
truePositive=tp/len(test_Subset)*100
falseNegative=fn/len(test_Subset)*100
falsePositive=fp/len(test_Subset)*100
trueNegative=tn/len(test_Subset)*100
print ("Accuracy: ", accuracy,'%')
print ("True Positive: ",truePositive)
print ("True Negative: ",trueNegative)
print ("False Positive: ",falsePositive)
print ("False Negative: ",falseNegative)
print ("Error Rate: ",100-accuracy)



Accuracy:  84.56014362657092 %
True Positive:  84.38061041292639
True Negative:  0.17953321364452424
False Positive:  12.746858168761221
False Negative:  2.6929982046678633
Error Rate:  15.43985637342908
