# Importing Libraries 📖

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# NLP
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Modeling 
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the Dataset 💾

In [None]:
# Load the SMS spam collection CSV into a DataFrame. The file is decoded with
# the 'latin' (Latin-1) codec — presumably because it is not valid UTF-8;
# confirm against the dataset.
data = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv', encoding = 'latin')

In [None]:
data

# Preprocessing ⭕

In [None]:
# Remove the third, fourth and fifth columns (positions 2-4) in place,
# leaving only the first two columns of the frame.
unused_columns = data.columns[[2, 3, 4]]
data.drop(columns=unused_columns, inplace=True)

In [None]:
data

In [None]:
# Replace the string labels in `v1` with integer codes, and keep a
# code -> original label lookup for interpreting the encoded values.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(data['v1'])
data['v1'] = encoded_labels
class_mapping = dict(enumerate(encoder.classes_))

In [None]:
class_mapping

In [None]:
data

### Stemming of words

In [None]:
def process_mail(mail):
    """Normalise a raw message and return its list of stemmed tokens.

    The text is lower-cased, HTML-like tags are blanked out, and digit runs,
    URLs, e-mail addresses and dollar signs are replaced by the placeholder
    words ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result
    is tokenised, each token is stripped of non-alphanumeric characters and
    stemmed with the Porter stemmer, and empty tokens are dropped.
    """
    # Order matters: each substitution sees the output of the previous one.
    substitutions = (
        (r'<[^<>]+>', ' '),
        (r'[0-9]+', 'number'),
        (r'(http|https)://[^\s]*', 'httpaddr'),
        (r'[^\s]+@[^\s]+', 'emailaddr'),
        (r'[$]+', 'dollar'),
    )
    text = mail.lower()
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)

    stemmer = PorterStemmer()
    stems = []
    for token in word_tokenize(text):
        bare = re.sub(r'[^a-zA-Z0-9]', '', token)
        stems.append(stemmer.stem(bare))

    # Tokens that consisted only of punctuation are empty by now.
    return [stem for stem in stems if len(stem) >= 1]

### Storing vocabularies

In [None]:
def getVocabulary(emails, vocab_length):
    """Build an index -> word vocabulary of the most frequent stems.

    Every message in `emails` is run through `process_mail`, and the
    occurrences of each resulting stem are counted across all messages.

    Args:
        emails: Iterable of raw message strings. It is only read; the
            caller's list is no longer overwritten with token lists.
        vocab_length: Maximum number of words to keep.

    Returns:
        dict mapping 0..k-1 to the k most frequent stems in descending
        frequency order, where k = min(vocab_length, number of distinct
        stems). Stems with equal counts keep first-seen order.
    """
    from collections import Counter

    counts = Counter()
    for email in emails:
        counts.update(process_mail(email))

    # sorted() is stable, so ties stay in first-occurrence order, matching
    # the insertion order of the counter.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return {index: word for index, (word, _) in enumerate(ranked[:vocab_length])}

In [None]:
def getKey(dictionary, val):
    """Return the first key in `dictionary` whose value equals `val`.

    Keys are examined in the dictionary's iteration order. If no value
    matches, None is returned.
    """
    matching_keys = (key for key, value in dictionary.items() if value == val)
    return next(matching_keys, None)

In [None]:
def getIndices(email, vocabulary):
    """Return the set of vocabulary indices for the words in `email`.

    Args:
        email: Iterable of (hashable) word tokens.
        vocabulary: dict mapping index -> word.

    Returns:
        set of the indices whose word occurs in `email`. Words that are not
        in the vocabulary are ignored. If a word appears under several
        indices, the first index in the dictionary's iteration order is used.
    """
    # Invert the vocabulary once so each word is a single hash lookup instead
    # of two linear scans over the vocabulary. setdefault keeps the first
    # index seen for a word.
    index_of = {}
    for index, word in vocabulary.items():
        index_of.setdefault(word, index)

    return {index_of[word] for word in email if word in index_of}

In [None]:
def getFeatureVector(word_indices, vocab_length):
    """Return a 0/1 vector of length `vocab_length` marking `word_indices`.

    The result is a NumPy float array that is 1 at every position listed in
    `word_indices` and 0 everywhere else.
    """
    feature_vec = np.zeros(vocab_length)

    positions = list(word_indices)
    if positions:
        # Set all listed positions in a single indexed assignment.
        feature_vec[positions] = 1

    return feature_vec

### Setting length of vocabulary list 

In [None]:
# Maximum number of most-frequent stems kept in the vocabulary; the same value
# is used as the length of every feature vector.
vocab_length = 2000

In [None]:
# Vocabulary of the most frequent stems, built from a fresh list of the raw
# messages in column `v2`.
vocabulary = getVocabulary(data['v2'].to_list(), vocab_length)

# Token list for every message, in the same order as the DataFrame rows.
emails = [process_mail(message) for message in data['v2'].to_list()]

In [None]:
# One 0/1 feature vector per message: position i is 1 when vocabulary word i
# occurs in that message.
feature_rows = [
    getFeatureVector(getIndices(tokens, vocabulary), vocab_length)
    for tokens in emails
]
X = pd.DataFrame(np.array(feature_rows).astype(np.int16))

# Model Training 🛠️

In [None]:
X

In [None]:
# Target vector: the label-encoded `v1` column.
y = data['v1']

In [None]:
y

In [None]:
# Hold out 20% of the rows for evaluation.
# - random_state pins the shuffle so the scores below are reproducible from
#   run to run.
# - stratify=y keeps the class proportions the same in both splits, which
#   matters when one class is much rarer than the other.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

In [None]:
# Fit a support vector classifier, using scikit-learn's default
# hyperparameters, on the training split.
model = SVC()
model.fit(X_train, y_train)

# Scores 📈

### Accuracy

In [None]:
# Score the fitted model on the held-out test split.
model.score(X_test, y_test)

### Fraction of positive (+ve) samples in the dataset

In [None]:
# Mean of the encoded labels. With 0/1 labels this is the share of rows in
# class 1 — assumes `v1` has exactly two classes; confirm via class_mapping.
# It gives a baseline for judging the accuracy above.
np.sum(y)/len(y)

In [None]:
# Predicted labels for the test split; used for the F1 score computed next.
y_pred = model.predict(X_test)

### F1 score

In [None]:
# F1 score of the test-split predictions, using f1_score's default settings.
f1_score(y_test, y_pred)