In [None]:
# Data processing
import numpy as np
import pandas as pd
import regex as re
import string
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Algorithms
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB

# Loading & getting familiar with data
- The text values of the target column are transformed to numerical values for training ("spam" = 1; "ham" = 0)

In [None]:
# Load the raw SMS dataset: one free-text message and one category label per row
data = pd.read_csv("../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv")
X = data['Message']
# Encode the target as integers: 'spam' -> 1, anything else ('ham') -> 0
y = (data['Category'] == 'spam').astype('int64')

print(f'X data shape = {X.shape}')
print(f'y data shape = {y.shape}')

In [None]:
# Preview the first raw messages to sanity-check the load
X.head()

In [None]:
# Preview the first encoded labels (1 = spam, 0 = ham)
y.head()

### Target column visualizing

In [None]:
# Visualizing the class balance of the target column
# Reindex by label (0 = ham, 1 = spam) so the bar order always matches the tick
# labels instead of depending on value_counts()' most-frequent-first ordering.
cnt_occ = y.value_counts().reindex([0, 1], fill_value=0)
plt.figure(figsize=(12, 6))
# x / y are passed by keyword: recent seaborn releases no longer accept them positionally
sns.barplot(x=["Ham", "Spam"], y=cnt_occ.values)
plt.ylabel("Number of occurences", fontsize=15)
plt.title("Spam/Non-spam number of occurences", fontsize=15)
plt.show()

In [None]:
# Class balance: absolute counts and share of each label
spam = int((y == 1).sum())
non_spam = int((y == 0).sum())
total = non_spam + spam
# The ':.0%' format spec renders a whole-number percentage directly, avoiding the
# floating-point artefacts (e.g. 7.000000000000001) that round(ratio, 2) * 100 can print.
print(f'{non_spam} of {total} text message labeling non_spam and it accounts for {non_spam / total:.0%} of the dataset.')
print(f'{spam} of {total} text message labeling spam and it accounts for {spam / total:.0%}  of the dataset.')

# Text preprocessing & Feature considering

## Preprocessing text message before training
Processing actions

  1) Lower case
    
  2) Handle number: Replace number with text "number"
  
  3) Handle URL: Replace URLs with the text "httpaddr"
  
  4) Handle email addresses: Replace email address with the text "emailaddr"
  
  5) Handle dollars $: Replace \\$ with "dollar"
  
  6) Remove non-words: Remove non-alphanumeric characters (e.g: "?", ".", ...) and punctuation
  
  7) Word stemming Porter algorithm

In [None]:
# Normalise one raw text message before feature extraction
def text_processing(content):
    """Return a lower-cased, placeholder-substituted, stemmed version of ``content``.

    Steps, in order:
      1. lower-case the text;
      2. replace digit runs, http(s) URLs, e-mail addresses and ``$`` with the
         placeholder words "number", "httpaddr", "emailaddr" and "dollar";
      3. turn every punctuation character into a space;
      4. strip any remaining non-alphanumeric characters from each token and
         stem it with the Porter stemmer;
      5. collapse whitespace so tokens are separated by single spaces.

    Parameters
    ----------
    content : str
        Raw message text.

    Returns
    -------
    str
        Space-separated stemmed tokens with no leading/trailing whitespace.
    """
    text = content.lower()

    # Steps 2-5 of the preprocessing list; the order matters because each
    # pattern is applied to the output of the previous one.
    placeholder_rules = (
        (r"\d+", "number"),
        (r"(https|http):\S*", "httpaddr"),
        (r"\S+@\S+", "emailaddr"),
        (r"[$]", "dollar"),
    )
    for pattern, placeholder in placeholder_rules:
        text = re.sub(pattern, placeholder, text)

    # 6) Remove non-words: every punctuation character becomes a single space
    punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    text = text.translate(punctuation_to_space)

    # 7) Stem each token with the Porter algorithm after dropping leftover
    #    non-alphanumeric characters
    stemmer = PorterStemmer()
    stems = []
    for token in text.split():
        alnum_only = re.sub('[^a-zA-Z0-9]', '', token)
        stems.append(stemmer.stem(alnum_only))

    # Tokens that became empty leave gaps; re-splitting collapses them
    joined = " ".join(stems)
    return " ".join(joined.split()).strip()

### What is Stemming Porter's algorithm?
- The **Porter Stemming algorithm** is used to *remove the suffixes* from an English word and obtain its stem which becomes very useful in the field of Information Retrieval (IR)
- **Suffixes** are word endings which modify a word’s meaning (E.g. -able, -ation, -ed, -er, -est,...)
- Obviously **swimming** and **swimmer** have different meanings, but after removing the suffixes the algorithm treats them as referring to something similar

Reference link: [here](https://vijinimallawaarachchi.com/2017/05/09/porter-stemming-algorithm/)

In [None]:
# Run the preprocessing function over every message (element-wise).
# X is rebound to the cleaned text, so re-run the notebook from the top
# rather than executing this cell twice.
X = X.map(text_processing)
X.head()

## Features consideration (Index filtering)
- We use "Porter vocabulary" dictionary as the feature for Spam/non-spam text classifier model

In [None]:
# Porter vocabulary list: a tab-separated, header-less file with one
# (index, stemmed word) pair per line
vocab_df = pd.read_csv(
    "../input/porter-vocabulary/vocab.txt",
    sep="\t",
    header=None,
    names=["index", "vocab"],
)
print("Vocabulary shape: ", vocab_df.shape)
vocab_df.head()

- After preprocessing, we look up each word of a text message in the Porter vocabulary and keep its value from the "index" column
- The result is a list of Porter vocabulary indices, one for each word of the text sample that appears in the vocabulary

In [None]:
# Map each word of a preprocessed message to its index in the Porter vocabulary
def text_index_filtering(content, vocab_df):
    """Return the vocabulary indices of the words in ``content``.

    Words that do not occur in the vocabulary are skipped. The result keeps the
    order of the words in ``content`` and contains one entry per occurrence,
    so repeated words yield repeated indices.

    Parameters
    ----------
    content : str
        Whitespace-separated (already preprocessed) tokens.
    vocab_df : pandas.DataFrame
        Vocabulary table with a "vocab" column (the word) and an "index"
        column (the value returned for that word).

    Returns
    -------
    list
        Values from the "index" column, one per recognised word.
    """
    # Build the word -> index mapping once per call instead of scanning the
    # whole DataFrame twice for every word. keep="first" reproduces the
    # first-match behaviour for any duplicated vocabulary entry.
    unique_vocab = vocab_df.drop_duplicates(subset="vocab", keep="first")
    word_to_index = dict(zip(unique_vocab["vocab"], unique_vocab["index"]))

    return [word_to_index[word] for word in content.split() if word in word_to_index]

In [None]:
# Replace every message by the list of vocabulary indices of its words.
# Series.apply forwards vocab_df as a keyword argument to text_index_filtering;
# this avoids the per-element `X[i] = <list>` assignment, which depends on a
# default 0..n-1 integer index and on list-valued item assignment.
# X is rebound, so re-run the notebook from the top rather than this cell twice.
X = X.apply(text_index_filtering, vocab_df=vocab_df)
X.head()

Next we transform each text message into a 1899-element zero array and set the entries selected by the indices above to 1, marking which words of Porter's vocab dictionary occur in the text
- This technique is similar to one-hot encoding, a preprocessing technique that represents categorical features as binary vectors (Link: [here](https://en.wikipedia.org/wiki/One-hot#:~:text=One%2Dhot%20encoding%20is%20often,the%20nth%20bit%20is%20high.))
- Note that Porter's vocab dictionary has 1899 words

In [None]:
# Build a binary presence vector over the Porter vocabulary for one message
# Ex return: [0, 0, 0, 1, 0, ...]: word with index 4 in vocab exist in text (4: abl)
def text_features(indices, n=1899):
    """Return a length-``n`` 0/1 vector marking which vocabulary words occur.

    Vocabulary indices are treated as 1-based, matching the example above
    (index 4 sets the 4th entry, i.e. position 3). Feature position ``j``
    therefore corresponds to the ``j``-th row of the vocabulary table, and the
    last vocabulary index ``n`` is valid rather than out of bounds.

    Parameters
    ----------
    indices : sequence of int
        1-based vocabulary indices found in the message; may be empty and may
        contain repeats (presence only, repeats are not counted).
    n : int, optional
        Vocabulary size / length of the returned vector. Defaults to 1899,
        the number of Porter's vocab words.

    Returns
    -------
    numpy.ndarray
        1-D float array of shape ``(n,)`` with 1 at ``index - 1`` for every
        given index and 0 elsewhere.

    Raises
    ------
    IndexError
        If any index lies outside ``1..n``.
    """
    features = np.zeros(n)
    if len(indices) == 0:
        return features

    positions = np.asarray(indices, dtype=int) - 1
    # Reject out-of-range values explicitly: a position of -1 would otherwise
    # wrap around silently and set the last feature.
    if positions.min() < 0 or positions.max() >= n:
        raise IndexError(f"vocabulary indices must be in 1..{n}")
    features[positions] = 1
    return features

In [None]:
# Turn each message's list of vocabulary indices into a fixed-length 0/1 vector.
# text_features only marks presence (entries are set to 1); repeated words are not counted.
# X is rebound here, so re-run the notebook from the top rather than this cell alone.
X = X.apply(text_features)
# NOTE(review): the bare `X` renders the whole Series; X.head() would keep the output short.
X

In [None]:
# Sanity check: vocabulary size, and how many feature entries equal 1 for the
# first two messages.
# X[0] / X[1] look rows up by label; presumably X still has the default 0..n-1 index from read_csv.
print(f"Total number of Porter's words features: {vocab_df.shape[0]}")
print(f"Number of Porter's word in 1st text sample: {np.sum(X[0] == 1)}")
print(f"Number of Porter's word in 2nd text sample: {np.sum(X[1] == 1)}")

# Building machine learning model
## Splitting data into training and validation set

In [None]:
# Splitting data 80% train, 20% validation
# random_state=0 makes the shuffle reproducible. No `stratify` argument is passed,
# so the spam/ham ratio is not forced to be equal in the two subsets.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

# NOTE(review): X is presumably still a Series of per-message arrays at this point,
# in which case the X shapes below report only the number of rows.
print(f'Number of Training Examples = {X_train.shape[0]}')
print(f'Number of Validation Examples = {X_valid.shape[0]}\n')
print(f'Training X Shape = {X_train.shape}')
print(f'Training y Shape = {y_train.shape[0]}\n')
print(f'Validation X Shape = {X_valid.shape}')
print(f'Validation y Shape = {y_valid.shape[0]}\n')

In [None]:
# Convert the split data into compact arrays for the estimators
y_train = y_train.astype('uint8')


def funct(x):
    """Cast one per-message feature vector to unsigned 8-bit integers."""
    return x.astype('uint8')


# Stack the per-message vectors into 2-D (n_samples, n_features) matrices
X_train = np.array(list(map(funct, X_train)))
X_valid = np.array(list(map(funct, X_valid)))

## Support Vector Machine (SVM)
- We use Support vector machine algorithm with linear kernel to train the spam/non-spam classification model
- Note that the input is passed as an np.ndarray; we cast the binary features to 'uint8' to keep the matrix small (scikit-learn's SVM accepts any numeric dtype)

In [None]:
# Train linear SVM model & fit data to model
print("\nTrain linear SVM model (Spam classifiers)")
# C=0.1 is the regularisation strength. `gamma` is not passed because it only
# affects the 'rbf', 'poly' and 'sigmoid' kernels and is ignored by 'linear'.
svm_linear = svm.SVC(kernel="linear", C=0.1)
svm_linear.fit(X_train, y_train.to_numpy())

# Prediction & accuracy on the held-out validation data.
# The name p_train is kept because later cells read it; it holds VALIDATION predictions.
p_train = svm_linear.predict(X_valid)  # 1D array
# The score is computed on X_valid / y_valid, so it is labelled as validation accuracy
print(f"Validation accuracy: {np.mean(p_train == y_valid.to_numpy())}")

In [None]:
# Reporting model validation (linear SVM)
# confusion_matrix puts true labels on rows and predictions on columns, ordered
# as given by `labels`: 0 (non-spam) first, then 1 (spam). The row/column names
# must follow that order, otherwise the spam and non-spam counts are swapped.
conf_mtrx = pd.DataFrame(confusion_matrix(y_valid.to_numpy(), p_train, labels=[0, 1]),
                         columns=['predicted non-spam', 'predicted spam'],
                         index=['actual non-spam', 'actual spam'])
conf_mtrx

In [None]:
# Heatmap of the confusion matrix built in the previous cell
plt.figure(figsize=(5, 5))
# fmt="d" prints the integer counts as-is; the default '.2g' format would show
# counts of 100 or more in scientific notation (e.g. 9.6e+02).
sns.heatmap(conf_mtrx, annot=True, fmt="d")
plt.show()

### Top predictors for spam using SVM algo
- We determine the top 15 words that signal the text message as spam

--> Need to find the weight of each word in Porter's vocab

In [None]:
print("\nFinding the weight of each word in Porter's vocab")
# coef_[0, :] is the linear SVM's weight vector, one value per feature column.
# NOTE(review): this pairs weight j with row j of vocab_df — confirm that
# text_features stores the word of row j at feature position j (the "index"
# column value vs. the 0-based row position).
# This adds a "weights" column to vocab_df in place.
vocab_df["weights"] = svm_linear.coef_[0, :]
# Keep the 15 largest weights; presumably these push the decision toward class 1 (spam).
top15 = vocab_df.sort_values("weights", axis=0, ascending=False).head(15)
print(top15)

In [None]:
# Visualize the weights of top 15 words signaling the spam message
plt.figure(figsize=(10, 10))
# Horizontal bars: weights run along the x axis, one vocabulary word per row on the y axis
sns.barplot(x="weights", y="vocab", data=top15)
# Axis captions follow the plotted data (they were previously attached to the wrong axes)
plt.xlabel("Correlation with target column", fontsize=14)
plt.ylabel("Porter's words", fontsize=14)
plt.title("Top 15 correlation of features in SVM model", fontsize=14)
plt.show()

## Stochastic Gradient Descent (SGD)

In [None]:
print("Train linear SVM model with stochastic gradient")
# max_iter: The maximum number of passes over the training data (aka epochs)
# tol=None: no tolerance-based stopping criterion, so all max_iter epochs are run
# loss='hinge': gives linear SVM
# random_state=0: fixes the data shuffling so the result is reproducible across runs
sgd = SGDClassifier(loss='hinge', max_iter=5, tol=None, random_state=0)
sgd.fit(X_train, y_train.to_numpy())

# Prediction & accuracy on the held-out validation data
sgd_pred = sgd.predict(X_valid)  # 1D array
# The score is computed on X_valid / y_valid, so it is labelled as validation accuracy
print(f"Validation accuracy: {np.mean(sgd_pred == y_valid.to_numpy())}")

In [None]:
# Reporting model validation (SGD linear SVM)
# confusion_matrix puts true labels on rows and predictions on columns, ordered
# as given by `labels`: 0 (non-spam) first, then 1 (spam). The row/column names
# must follow that order, otherwise the spam and non-spam counts are swapped.
conf_mtrx = pd.DataFrame(confusion_matrix(y_valid.to_numpy(), sgd_pred, labels=[0, 1]),
                         columns=['predicted non-spam', 'predicted spam'],
                         index=['actual non-spam', 'actual spam'])
conf_mtrx

In [None]:
# Heatmap of the confusion matrix built in the previous cell
plt.figure(figsize=(5, 5))
# fmt="d" prints the integer counts as-is; the default '.2g' format would show
# counts of 100 or more in scientific notation (e.g. 9.6e+02).
sns.heatmap(conf_mtrx, annot=True, fmt="d")
plt.show()

## Random Forest Tree

In [None]:
print("Train SPAM classifier model with random forest tree algorithm")
# n_estimators: number of decision trees in the forest
# random_state=0: fixes bootstrap sampling / feature selection so results are reproducible
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(X_train, y_train.to_numpy())

# Prediction & accuracy on the held-out validation data
ranForest_pred = random_forest.predict(X_valid)  # 1D array
# The score is computed on X_valid / y_valid, so it is labelled as validation accuracy
print(f"Validation accuracy: {np.mean(ranForest_pred == y_valid.to_numpy())}")

In [None]:
# Reporting model validation (random forest)
# confusion_matrix puts true labels on rows and predictions on columns, ordered
# as given by `labels`: 0 (non-spam) first, then 1 (spam). The row/column names
# must follow that order, otherwise the spam and non-spam counts are swapped.
conf_mtrx = pd.DataFrame(confusion_matrix(y_valid.to_numpy(), ranForest_pred, labels=[0, 1]),
                         columns=['predicted non-spam', 'predicted spam'],
                         index=['actual non-spam', 'actual spam'])
conf_mtrx

In [None]:
# Heatmap of the confusion matrix built in the previous cell
plt.figure(figsize=(5, 5))
# fmt="d" prints the integer counts as-is; the default '.2g' format would show
# counts of 100 or more in scientific notation (e.g. 9.6e+02).
sns.heatmap(conf_mtrx, annot=True, fmt="d")
plt.show()

## Gaussian Naive Bayes

In [None]:
print("Train SPAM classifier model with Gaussian Naive Bayes algorithm")
gaussian = GaussianNB()
gaussian.fit(X_train, y_train.to_numpy())

# Prediction & accuracy on the held-out validation data
Y_pred = gaussian.predict(X_valid)  # 1D array
# The score is computed on X_valid / y_valid, so it is labelled as validation accuracy
print(f"Validation accuracy: {np.mean(Y_pred == y_valid.to_numpy())}")

In [None]:
# Reporting model validation (Gaussian Naive Bayes)
# confusion_matrix puts true labels on rows and predictions on columns, ordered
# as given by `labels`: 0 (non-spam) first, then 1 (spam). The row/column names
# must follow that order, otherwise the spam and non-spam counts are swapped.
conf_mtrx = pd.DataFrame(confusion_matrix(y_valid.to_numpy(), Y_pred, labels=[0, 1]),
                         columns=['predicted non-spam', 'predicted spam'],
                         index=['actual non-spam', 'actual spam'])
conf_mtrx

In [None]:
# Heatmap of the confusion matrix built in the previous cell
plt.figure(figsize=(5, 5))
# fmt="d" prints the integer counts as-is; the default '.2g' format would show
# counts of 100 or more in scientific notation (e.g. 9.6e+02).
sns.heatmap(conf_mtrx, annot=True, fmt="d")
plt.show()