In [1]:
import numpy as np 
import pandas as pd

# NLP
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Modeling 
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [2]:
data = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv",encoding = 'latin-1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
data.drop([data.columns[col] for col in [2,3,4]], axis = 1, inplace = True)

In [4]:
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
encoder = LabelEncoder()
data['v1'] = encoder.fit_transform(data['v1'])
class_mappings = {index: label for index,label in enumerate(encoder.classes_)}

In [6]:
class_mappings

{0: 'ham', 1: 'spam'}

In [7]:
# Take an email string and convert it to a list stemmed words

def processEmail(contents):
    ps = PorterStemmer()

    contents = contents.lower()
    contents = re.sub(r'<[^<>]+>',' ', contents)                    # Removing any sybool from text 
    contents = re.sub(r'[0-9]+','number',contents)                  # Removing number from text 
    contents = re.sub(r'(http|https)://[^\s]*','httpaddr',contents) # Removing any URL from text 
    contents = re.sub(r'[^\s]+@[^\s]+','emailaddr',contents)        # Removing email address 
    contents = re.sub(r'[$]+','dollor',contents)                    # Removing dollor sign form text 
 
    # Word tokenzier : Take contents string and break it down into single words 
    words = word_tokenize(contents)

    for i in range(len(words)): 
        words[i] = re.sub(r'[^a-zA-Z0-9]','',words[i])
        words[i] = ps.stem(words[i])

    # Rmove any blank space 
    words = [word for word in words if len(word) >= 1]

    return words

In [8]:
# Creating vocabulary list for prediction of which word are spam and which word are not spam

# Function : Take list of emails and get a dictionay of the most common words 

def getVocabulary(emails, vocab_length):
    vocabulary = dict()

    # Finding most common words in emails 
    for i in range(len(emails)):
        emails[i] = processEmail(emails[i])
        for word in emails[i]:
            if word in vocabulary.keys(): 
                vocabulary[word] += 1 
            else: 
                vocabulary[word] = 1 
                
    # Arranging most common words in decending order
    vocabulary = sorted(vocabulary.items(), key = lambda x : x[1], reverse = True) # It convert vocab into tupples 
    
    # Chopping the size of vocabulary
    # X means every values in tupples 
    #convert it into list
    vocabulary = list(map(lambda x:x[0], vocabulary[0: vocab_length])) 
    
    # converting list into dictionary
    vocabulary = {index: word for index, word in enumerate(vocabulary)}  

    return vocabulary

In [9]:
# Get a dictionary key given a value 
def getKey(dictionary,val):
    for key,value in dictionary.items():
        if value == val: 
            return key

In [10]:
# Get the indices of vocab words used to a given email 

def getIndices(email,vocabulary):
    word_indices = set()

    for word in email: 
        if word in vocabulary.values():
            word_indices.add(getKey(vocabulary,word))
            
    return word_indices     

In [11]:
# Feature Vector containing 0 and 1 
# 0 --> Word not found in email 
# 1 --> Word found in email 

def getFeatureVector(word_indices,vocab_length):
    feature_vec = np.zeros(vocab_length)

    for i in word_indices: 
        feature_vec[i] = 1 
        
    return feature_vec

In [12]:
data

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [13]:
vocab_length = 2000

In [14]:
vocabulary = getVocabulary(data['v2'].to_list(),vocab_length)

emails = data['v2'].to_list()

emails = list(map(lambda x: processEmail(x), emails))

In [15]:
X = list(map(lambda x:getFeatureVector(getIndices(x,vocabulary),vocab_length),emails))
X = pd.DataFrame(np.array(X).astype(np.int16))

In [16]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,1,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,1,0,1,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
y = data['v1']

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.8)


In [19]:
model = SVC()
model.fit(X_train,y_train)

In [20]:
model.score(X_test,y_test)

0.9820627802690582

In [21]:
 # Number of positive example // total number of example = it gives percentage of positive example of our data 
print(f"Spam email percentage : {np.sum(y)/len(y)}")


Spam email percentage : 0.13406317300789664


In [22]:
y_pred = model.predict(X_test)

In [23]:
f1_score(y_test,y_pred)

0.9242424242424242