In [0]:
##This block is only for access of files using google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

#For accessing any file from google drive, first share it for public access. Copy its id from last part of its address. Then specify the two lines below.
downloaded = drive.CreateFile({'id':"11udsdq6pPymfbAE213Zfry14SgD0QOQl"})   # replace the id with id of file you want to access
downloaded.GetContentFile('spam.csv')        # replace the file name with your file


In [3]:
import pandas as pd
import numpy as np
import string

#import the data file
filename = 'spam.csv' 

df_sms = pd.read_csv('spam.csv',encoding='latin-1')
df_sms.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
#Remove the unwanted columns
df_sms = df_sms.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df_sms = df_sms.rename(columns={"v1":"label", "v2":"sms"})
df_sms.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
#Print number of records
L = len(df_sms)
print(L)
#Example of accessing a column in pandas dataframe
df_sms.sms

5572


0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: sms, Length: 5572, dtype: object

In [0]:
#Define a Function to convert sms text to Lower case and remove stop words, punctuation and numbers

def preprocess_Text(input_Text):
  input_Text = input_Text.lower();
  stopwords = ['the','what','is','a','an','of', 'that']
  querywords = input_Text.split()

  resultwords  = [word for word in querywords if word not in stopwords]
  result = ' '.join(resultwords)

  exclude = set(string.punctuation)
  result = ''.join(ch for ch in result if ch not in exclude)

  exclude = set('0123456789')
  result = ''.join(ch for ch in result if ch not in exclude)

  return result;
  

In [9]:
#Test the preprocessing funtion
preprocess_Text('Hello, where4 is he.')

'hello where he'

In [0]:
#Preprocess all the sms texts
L = len(df_sms)
for i in range(0,L-1):
  df_sms['sms'][i] = preprocess_Text(df_sms['sms'][i])

In [11]:
#check if preprocessing was applied correctly by looking at any sms
df_sms['sms'][5]

'freemsg hey there darling its been  weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send å£ to rcv'

In [0]:
#Divide the dataframes into training and and testing set
from sklearn.utils import shuffle
df_sms = shuffle(df_sms)
training_Subset = df_sms.iloc[:round(len(df_sms)*0.9),:]   #90% data into training
test_Subset = df_sms.iloc[round(len(df_sms)*0.9):,:]       #10% data into testing
spam_Subset = training_Subset.query('label == "spam"')
ham_Subset = training_Subset.query('label == "ham"');

In [13]:
training_Subset['sms'][37]

'i see letter b on my car'

In [0]:
#combine all text into one large paragraph which shall be used to list unique words
L = len(training_Subset);
all_Text = ""
for i in training_Subset.index:
  all_Text = all_Text + " "+training_Subset['sms'][i];


In [15]:
all_Text



In [16]:
#make a table with all unique words
allWords = all_Text.split()

row_Names = [] 
for i in allWords:
      if not i in row_Names:
          row_Names.append(i);
print(row_Names)



In [0]:
#For each word find inspam probability and in-ham probability
word = '';
inSpamCount = 0;
inHamCount = 0;
columns = ['inSpamProbability','inHamProbability']
probability_Table = pd.DataFrame(index=row_Names, columns=columns)
for word in row_Names:
  inSpamCount = 0;
  inHamCount = 0;
  for i in spam_Subset['sms']:
    if(i.find(word)==0):
      inSpamCount = inSpamCount+1;
  for i in ham_Subset['sms']:
    if(i.find(word)==0):
      inHamCount = inHamCount+1;
  probability_Table.at[word, 'inSpamProbability'] = inSpamCount/len(spam_Subset);
  probability_Table.at[word,'inHamProbability'] = inHamCount/len(ham_Subset);


In [0]:
probability_Table.sort_values("inSpamProbability", axis = 0, ascending = False, 
                 inplace = True, na_position ='first') 

In [20]:
probability_Table

Unnamed: 0,inSpamProbability,inHamProbability
u,0.124814,0.0198065
f,0.10847,0.0128973
ur,0.0936107,0.00138185
fr,0.0832095,0.0027637
y,0.0802377,0.0690926
...,...,...
english,0,0
apart,0,0.000230309
teju,0,0
iq,0,0


In [0]:
#drop rows wherever spam or ham is zero probability
probability_Table = probability_Table[(probability_Table[['inSpamProbability','inHamProbability']] != 0).all(axis=1)]

In [22]:
probability_Table

Unnamed: 0,inSpamProbability,inHamProbability
u,0.124814,0.0198065
f,0.10847,0.0128973
ur,0.0936107,0.00138185
fr,0.0832095,0.0027637
y,0.0802377,0.0690926
...,...,...
big,0.00148588,0.000230309
comp,0.00148588,0.000690926
upd,0.00148588,0.000230309
pa,0.00148588,0.00161216


#Assignment

1. Write a code to determine if a msg is spam or ham (2)
               def check_msg(test_Msg):
                ............
                ..........
2. Write a code to test all messages in test data and determine the following percentages (1)
  * True positive rate
  * True negative rate
  * False positive rate
  * False negative rate
  * Accuracy
  * Error_Rate

In [0]:
def check_msg(test_msg):
  message_words=[]
  for i in(test_msg.split()):
    if i not in message_words:
      message_words.append(i)
  spam_prob=1
  ham_prob=1
  for i in message_words:
    if (i in probability_Table.index):
      spam_prob*=probability_Table.loc[i]['inSpamProbability']
      ham_prob*=probability_Table.loc[i]['inHamProbability']
  spam_prob=spam_prob*len(spam_Subset)
  ham_prob=ham_prob*len(ham_Subset)
  if(spam_prob>ham_prob):
    return 'spam'
  else:
    return 'ham'

In [0]:
true_positive=0
true_negative=0
false_positive=0
false_negative=0
for i in test_Subset.index:
  curr_message=test_Subset['sms'][i]
  curr_label=test_Subset['label'][i]
  prediction=check_msg(curr_message)
  if(curr_label==prediction):
    if(curr_label=='spam'):
      true_positive+=1
    else:
      true_negative+=1
  else:
    if(curr_label=='spam'):
      false_negative+=1
    else:
      false_positive+=1

In [40]:
true_positive_rate=true_positive/(true_positive+false_negative)*100
true_negative_rate=true_negative/(true_negative+false_positive)*100
false_positive_rate=false_positive/(true_negative+false_positive)*100
false_negative_rate=false_negative/(true_positive+false_negative)*100
accuracy=(true_positive+true_negative)/len(test_Subset)*100
error_rate=100-accuracy
print ("True Positive Rate: {}\nTrue Negative Rate: {}\nFalse Positive Rate: {}\nFalse Negative Rate: {}\nAccuracy: {}\nError Rate: {}".format(true_positive_rate,true_negative_rate,false_positive_rate,false_negative_rate,accuracy,error_rate))

True Positive Rate: 85.13513513513513
True Negative Rate: 77.22567287784679
False Positive Rate: 22.77432712215321
False Negative Rate: 14.864864864864865
Accuracy: 78.27648114901257
Error Rate: 21.723518850987432
