# SMS Spam detection using Machine Learning

This is Python code for building a spam detector using machine learning, trained on the 'spam.csv' file from Kaggle. It covers every step from importing the dataset to building the models.

We start by importing the required libraries (numpy, pandas, nltk, etc.), then we load the dataset with Pandas.
The 'stopwords' corpus contains the common English words that we don't really need for our algorithm; we remove them and stem each remaining word. Then we convert the texts to feature vectors so that we can fit a classifier on them.

### Import the required library and data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

In [None]:
# Load the SMS dataset; 'spam.csv' is read relative to the working directory.
df = pd.read_csv('spam.csv')
# Preview the first rows to check how the columns were parsed.
df.head()

### Clean the text data

In [None]:
# Drop the 'Unnamed: 0' column (presumably an index saved by an earlier
# export -- it is not used as a feature anywhere in this notebook).
df = df.drop(columns='Unnamed: 0')
# Give the terse dataset column names readable equivalents.
df = df.rename(columns={'v2': 'messages', 'v1': 'label'})
# Encode the target as integers: ham -> 0, spam -> 1.
# An explicit mapping followed by astype(int) guarantees an integer dtype.
# Chained replace() calls depend on pandas silently downcasting the object
# column, which is deprecated and can leave the labels as dtype=object
# (scikit-learn may then reject them as an unknown label type).
# Any label other than 'ham'/'spam' becomes NaN and makes astype(int) raise,
# so unexpected values are reported here instead of leaking into training.
df['label'] = df['label'].map({'ham': 0, 'spam': 1}).astype(int)

Let us build a function that cleans the data by removing special characters, extra spaces and stopwords, and by stemming all the words in the text. We also removed the unnecessary column in the cell above.

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_cleaning_tools():
    """Build the stop-word set, stemmer and lemmatizer once and reuse them.

    clean_text_data is applied to every message, so rebuilding these
    objects on each call would repeat the same work thousands of times.
    """
    return (
        frozenset(stopwords.words('english')),
        PorterStemmer(),
        WordNetLemmatizer(),
    )


def clean_text_data(data):
    """Normalise one raw message into a space-separated string of stems.

    Steps, in order: lower-case the text, replace every character that is
    not an ASCII letter or digit with a space, drop English stop words,
    strip a trailing 'ly' from each word, lemmatize it as verb, noun and
    adjective, and finally apply Porter stemming.

    Parameters
    ----------
    data : str
        The raw message text.

    Returns
    -------
    str
        The processed words joined by single spaces.
    """
    stop_words, stemmer, lemmatizer = _text_cleaning_tools()
    # Lower-case first, then blank out special characters.
    data = re.sub(r'[^0-9a-zA-Z]', ' ', data.lower())
    processed = []
    # split() with no argument already collapses runs of spaces, so no
    # separate "remove extra spaces" pass is needed.
    for word in data.split():
        if word in stop_words:
            continue
        word = re.sub(r'ly$', '', word)
        # Lemmatize under each part of speech in turn, then stem.
        for pos in ('v', 'n', 'a'):
            word = lemmatizer.lemmatize(word, pos=pos)
        processed.append(stemmer.stem(word))
    return " ".join(processed)

In [None]:
# Replace every raw message with the output of clean_text_data (applied row by row).
df['messages']=df['messages'].apply(clean_text_data)

In [None]:
# Labels are 0 (ham) / 1 (spam), so the mean is the fraction of spam messages.
df['label'].mean()

In [None]:
# The classes are imbalanced, so accuracy is not a good choice of classification metric.
# We will evaluate the models with precision, recall and F1 instead.

## Model building

In [None]:
# Features: the cleaned message text. Target: the 0/1 spam label.
X =df['messages']
Y = df['label']

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline,make_pipeline

# Hash the tokens of each message into a fixed-width feature vector.
vec = HashingVectorizer()
# Keep only the features whose importance in a linear SGD model reaches the
# mean importance.
# NOTE(review): `vec` and `f_selector` are single instances that several
# pipelines in this notebook reuse; fitting one of those pipelines directly
# appears to refit the shared selector in place -- confirm this is intended.
f_selector = SelectFromModel(estimator=SGDClassifier(), threshold='mean')
# Text -> selected-features front end used by the logistic-regression pipeline.
preprocessor = Pipeline(
    steps=[
        ('Vectorizer', vec),
        ('Feature_Selector', f_selector),
    ]
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the data for testing; stratify=Y keeps the ham/spam ratio
# the same in both splits.
# NOTE(review): no random_state is passed, so the split differs on every run.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y)

In [None]:
from sklearn.model_selection import GridSearchCV,learning_curve
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
def plot_learning_curve(model,X_train,Y_train,metrics="f1"):
    """Plot mean training and validation scores against training-set size.

    Parameters
    ----------
    model : estimator
        The scikit-learn estimator (or pipeline) to evaluate.
    X_train, Y_train :
        Training inputs and labels handed to ``learning_curve``.
    metrics : str, default "f1"
        Scoring name passed to ``learning_curve``; also used as the
        y-axis label.

    A new matplotlib figure is created; nothing is returned.
    """
    sizes, train_scores, val_scores = learning_curve(
        model, X_train, Y_train, scoring=metrics
    )
    curves = (
        (train_scores, 'orange', 'training score'),
        (val_scores, 'blue', 'validation score'),
    )
    plt.figure()
    for scores, colour, legend_label in curves:
        # Average the per-fold scores so each training size yields one point.
        plt.plot(sizes, scores.mean(axis=1), color=colour, label=legend_label)
    plt.title('Learning curve')
    plt.xlabel('Training Size')
    plt.ylabel(metrics)
    plt.legend()
    plt.grid()

def tune_hyperparameters(model,X_train,Y_train,grid,metrics="f1"):
    """Grid-search ``model`` over ``grid`` and return the best estimator.

    Parameters
    ----------
    model : estimator
        The scikit-learn estimator (or pipeline) to tune.
    X_train, Y_train :
        Training inputs and labels used for the cross-validated search.
    grid : dict
        Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
    metrics : str, default "f1"
        Scoring name used to rank the candidates.

    Returns
    -------
    The ``best_estimator_`` of the fitted search. The best score and
    parameters are printed as a side effect.
    """
    search = GridSearchCV(model, param_grid=grid, scoring=metrics)
    search.fit(X_train, Y_train)
    top_score = search.best_score_
    top_params = search.best_params_
    print('Best Score: ', top_score)
    print('Best parameters: ', top_params)
    return search.best_estimator_

def classification_test(model,X_test,Y_test):
    """Print a classification report and draw the confusion matrix.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``.
    X_test, Y_test :
        Held-out inputs and their true labels.

    The report is printed and the confusion matrix is drawn as an
    annotated heatmap on a new 3x3-inch figure; nothing is returned.
    """
    y_pred = model.predict(X_test)
    print('------ Evaluation on the test set-----')
    print('CLASSIFICATION REPORT: \n')
    print(classification_report(Y_test,y_pred))
    matrix = confusion_matrix(Y_test, y_pred)
    plt.figure(figsize=(3,3))
    # fmt='d' prints the cell counts as plain integers; the default
    # annotation format would show larger counts as e.g. 9.7e+02.
    sns.heatmap(data=matrix, square=True, annot=True, fmt='d')

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on top of the vectorizer + feature-selection front end.
# class_weight weights the spam class (label 1) 3.89 times as much as ham.
# NOTE(review): how 3.89 was chosen is not shown in this notebook -- confirm.
lr = make_pipeline(preprocessor,LogisticRegression(class_weight={0:1,1:3.89}))
lr.fit(X_train,Y_train)

In [None]:
# Learning curve, scored by recall, of the untuned logistic-regression pipeline.
plot_learning_curve(lr,X_train,Y_train,metrics="recall")

In [None]:
# Classification report and confusion matrix on the held-out test set.
classification_test(lr,X_test,Y_test)

In [None]:
# Candidate values for the logistic regression's C parameter; the
# 'logisticregression__' prefix addresses that step inside the pipeline.
grid = {'logisticregression__C': [1,2,5,8]}
# Rebind lr to the best estimator found (ranked by recall) and re-plot its curve.
lr = tune_hyperparameters(lr,X_train,Y_train,grid,metrics='recall')
plot_learning_curve(lr,X_train,Y_train,metrics="recall")

In [None]:
# Evaluate the tuned logistic regression on the held-out test set.
classification_test(lr,X_test,Y_test)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = make_pipeline(vec,f_selector,DecisionTreeClassifier(class_weight={0:1,1:3.89}))
dtc.fit(X_train,Y_train)

In [None]:
# Classification report and confusion matrix for the untuned decision tree.
classification_test(dtc,X_test,Y_test)

In [None]:
# Learning curve, scored by precision, of the untuned decision tree.
plot_learning_curve(dtc,X_train,Y_train,metrics='precision')

In [None]:
# Candidate tree depths; the 'decisiontreeclassifier__' prefix addresses the
# tree step inside the pipeline.
grid = {'decisiontreeclassifier__max_depth': [1,5,10,20,25,30]}
# Rebind dtc to the best estimator found (ranked by precision) and re-plot its curve.
dtc = tune_hyperparameters(dtc,X_train,Y_train,grid,metrics='precision')
plot_learning_curve(dtc,X_train,Y_train,metrics='precision')

In [None]:
# Evaluate the tuned decision tree on the held-out test set.
classification_test(dtc,X_test,Y_test)

### Support Vector Machine

In [None]:
from sklearn.svm import SVC
svm = make_pipeline(vec,f_selector,SVC(kernel='rbf',C=1,gamma=1,class_weight={0:1,1:3.89}))
svm.fit(X_train,Y_train)
plot_learning_curve(svm,X_train,Y_train,metrics='recall')

In [None]:
# Candidate values for the SVC's C and gamma; the 'svc__' prefix addresses
# the SVC step inside the pipeline.
grid = {'svc__C':[40,45,70],'svc__gamma':[0.01,0.005,0.05]}
# Rebind svm to the best estimator found (ranked by F1 here, unlike the
# recall/precision used for the other models) and re-plot its curve.
svm = tune_hyperparameters(svm,X_train,Y_train,grid,metrics='f1')
plot_learning_curve(svm,X_train,Y_train,metrics='f1')

In [None]:
# Evaluate the tuned SVM on the held-out test set.
classification_test(svm,X_test,Y_test)

### Stacking Classifier

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
stc = StackingClassifier(estimators=[('LogisticRegression',lr),('SVM',svm),('DecisionTree',dtc)],final_estimator=KNeighborsClassifier())
stc.fit(X_train,Y_train)

In [None]:
# Learning curve, scored by recall, of the stacked model.
plot_learning_curve(stc,X_train,Y_train,'recall')

In [None]:
# Evaluate the stacked model on the held-out test set.
classification_test(stc,X_test,Y_test)

## Conclusion
* The models didn't overfit the training set
* The Support Vector Machine and Logistic Regression are our best candidates for this classification task
* The stacked model performs well above the three individual models it is built from

# Save the model as a pickle file

In [None]:
import pickle
def save_model(model,file_name):
    """Pickle ``model`` to ``<file_name>.p``.

    Parameters
    ----------
    model :
        Any picklable object; it is stored under the key ``'model'``.
    file_name : str
        Target path without extension; ``".p"`` is appended.
    """
    payload = {'model': model}
    # The context manager flushes and closes the file even if dump() raises,
    # instead of leaving the handle to be closed by garbage collection.
    with open(file_name + ".p", "wb") as handle:
        pickle.dump(payload, handle)
    
def load_models(file_name):
    """Load a model from a pickle file holding a ``{'model': ...}`` dict.

    Parameters
    ----------
    file_name : str or path-like
        Path of the pickle file. If nothing exists at that exact path but
        ``file_name + ".p"`` does, the latter is opened, so the extension-less
        base name that ``save_model`` takes can be passed here as well.

    Returns
    -------
    The object stored under the ``'model'`` key.

    Raises
    ------
    FileNotFoundError
        If neither candidate path exists.
    """
    import os

    path = file_name
    if not os.path.exists(path):
        # save_model appends ".p" to the name it is given; mirror that here.
        with_suffix = os.fspath(file_name) + ".p"
        if os.path.exists(with_suffix):
            path = with_suffix
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as pickled:
        data = pickle.load(pickled)
    return data['model']

We are going to save each model as a pipeline that combines the "Vectorizer" with the "Model", because we need it later for building the web app.
Now let's build a "Function transformer" for the scikit-learn model that is capable of cleaning all the input data.

In [None]:
from sklearn.preprocessing import FunctionTransformer
def text_cleaner(text):
    """Clean raw input text for the model pipelines.

    Parameters
    ----------
    text : str or iterable of str
        A single message, or a collection of messages (list, Series, ...).

    Returns
    -------
    list of str
        One cleaned string per input message; a single message yields a
        one-element list.
    """
    if isinstance(text, str):
        return [clean_text_data(text)]
    # A batch of messages: clean each one instead of failing on `.lower()`.
    return [clean_text_data(message) for message in text]


# Wrap the function as a transformer so it can be a step of a Pipeline.
cleaner = FunctionTransformer(func=text_cleaner)

In [None]:
# Put the text cleaner in front of each model so the saved pipelines take
# raw, uncleaned text as input.
final_model1 = Pipeline([('Text cleaner',cleaner),('LR_Pipeline',lr)])
final_model2 = Pipeline([('Text cleaner',cleaner),('SVM_Pipeline',svm)])
final_model3 = Pipeline([('Text cleaner',cleaner),('Stack_Pipeline',stc)])

In [31]:
# Written to 'lr.p', 'svm.p' and 'stc.p' (save_model appends the '.p' suffix).
save_model(final_model1,'lr')
save_model(final_model2,'svm')
save_model(final_model3,'stc')

In [32]:
# Write the held-out test messages (already cleaned) to 'X_test.csv'.
X_test.to_csv('X_test.csv')