In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
import spacy
nlp = spacy.load('en_core_web_lg')
import itertools
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the SMS spam CSV, decoding it explicitly as ISO-8859-1.
df = pd.read_csv(
    '../input/sms-spam-collection-dataset/spam.csv',
    encoding="ISO-8859-1",
)

In [None]:
# Preview the first five rows of the raw frame.
df.head(5)

# EDA

In [None]:
# Class balance of the label column. The column is passed by keyword:
# positional data vectors are deprecated in older seaborn releases and are
# interpreted as `data` (not `x`) in newer ones.
sns.countplot(data=df, x='v1')

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Work on a copy so the raw frame stays untouched; add the character length
# of every message as a new column.
df_tmp = df.assign(text_len=df['v2'].map(len))

In [None]:
# Distribution of message length per class, on a log-scaled axis.
ax = sns.histplot(data=df_tmp, x='text_len', hue='v1', log_scale=True)
ax.set_title('Text length')

Spam messages tend to be longer than ham messages.

# Cleaning

Define functions for word count plot.

In [None]:
def word_count(text, data=None):
    """Count whitespace-separated tokens per class in one text column.

    Parameters
    ----------
    text : str
        Name of the column holding the message strings.
    data : pandas.DataFrame, optional
        Frame with a ``'v1'`` label column ('spam' / 'ham') and the ``text``
        column. Defaults to the notebook-level ``df`` so existing calls keep
        working; passing it explicitly avoids relying on global state.

    Returns
    -------
    tuple of (Counter, Counter)
        Token counts for spam messages and for ham messages, in that order.
    """
    frame = df if data is None else data

    def _count(label):
        # .loc selects rows and column in one step instead of chained indexing.
        messages = frame.loc[frame['v1'] == label, text]
        return Counter(itertools.chain.from_iterable(m.split() for m in messages))

    return _count('spam'), _count('ham')

def to_DataFrame(word_count, target):
    """Turn a word -> count mapping into a three-column DataFrame.

    Parameters
    ----------
    word_count : mapping
        Maps each word to its number of occurrences.
    target : scalar
        Value written to every row of the ``'target'`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``'word'``, ``'count'`` and ``'target'``, one row per word in
        the mapping's iteration order.
    """
    words = list(word_count.keys())
    counts = list(word_count.values())
    return pd.DataFrame({'word': words, 'count': counts, 'target': target})

def word_count_plot(text, n_top=25):
    """Draw side-by-side bar charts of the most frequent words per class.

    Parameters
    ----------
    text : str
        Name of the text column handed to ``word_count``.
    n_top : int, default 25
        Number of most frequent words shown in each panel.
    """
    word_count_spam, word_count_ham = word_count(text)

    # Most frequent words first, so the top rows are the ones plotted.
    top_ham = (to_DataFrame(word_count_ham, 'ham')
               .sort_values(by='count', ascending=False)
               .head(n_top))
    top_spam = (to_DataFrame(word_count_spam, 'spam')
                .sort_values(by='count', ascending=False)
                .head(n_top))

    fig, (ax_ham, ax_spam) = plt.subplots(1, 2, figsize=(10, 15))
    sns.barplot(data=top_ham, y='word', x='count', ax=ax_ham)
    sns.barplot(data=top_spam, y='word', x='count', ax=ax_spam)
    ax_ham.set_title('Word counts in ham')
    # Title previously read 'Word coutns in spam'.
    ax_spam.set_title('Word counts in spam')

In [None]:
# Top 50 words per class in the raw message column.
word_count_plot('v2', n_top=50)

These are the word counts before text cleaning.

Let's convert the messages to spaCy `Doc` objects. We expand some common abbreviations in the text beforehand.

In [None]:
def abbreviation_converter(text):
    """Lower-case a message, expand common SMS abbreviations and strip markup.

    The substitutions are applied in order, each on the result of the
    previous one.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The lower-cased message with abbreviations expanded, ``<...>`` tags and
        the ``lt;#&gt`` residue removed, and the characters ``= > < + -``
        deleted.
    """
    converted_text = text.lower()

    substitutions = [
        (r'\bu\b', 'you'),
        (r'\br\b', 'are'),
        (r'\bur\b', 'you are'),
        (r'\bnt\b', 'nice try'),
        # The text is already lower-cased, so the mis-encoded character must be
        # matched in its lower-case form; the upper-case pattern never matched.
        (r'\bì\b', 'I'),
        (r'\b2\b', 'to'),
        (r'\b4\b', 'for'),
        (r'\bmsg\b', 'message'),
        (r'\btxt\b', 'text'),
        (r'<.*?>', ''),
        (r'\blt;#&gt\b', ''),
        # Same character set as the former '[=-><+-]', written without the
        # accidental '=' to '>' range.
        (r'[=><+-]', ''),
    ]
    for pattern, replacement in substitutions:
        converted_text = re.sub(pattern, replacement, converted_text)
    return converted_text

# Expand abbreviations in every raw message; the function is passed directly
# rather than through a wrapping lambda.
df['v2_conv'] = df['v2'].apply(abbreviation_converter)

Let's clean up the text using spaCy. We lemmatize the text and remove stop words, punctuation, numbers, and URLs.

In [None]:
# Run the spaCy pipeline on each converted message, one message per call.
df['text'] = df['v2_conv'].apply(nlp)

In [None]:
def _clean_doc(doc):
    """Return the space-joined lemmas of the tokens kept from one parsed message.

    Tokens flagged as punctuation, stop word, number-like or URL-like are
    dropped, as is the literal token 'u'.
    """
    kept = []
    for token in doc:
        if token.is_punct or token.is_stop or token.like_num or token.like_url:
            continue
        if token.text == 'u':
            continue
        kept.append(token.lemma_)
    return ' '.join(kept)


df['text_clean'] = df['text'].apply(_clean_doc)

In [None]:
# Top 50 words per class in the cleaned text column.
word_count_plot('text_clean', n_top=50)

These are the word counts after text cleaning. Spam messages often contain 'free' and 'win', as well as other monetary words like 'prize', 'cash', and 'award'.

# Modelling

In [None]:
# Encode the label column as 0 (ham) / 1 (spam) and hold out 20% of the
# messages for testing, with a fixed seed.
mapping = {'ham': 0, 'spam': 1}
labels = df['v1'].map(mapping)
X_train, X_test, y_train, y_test = train_test_split(
    df['text_clean'],
    labels,
    test_size=0.2,
    random_state=42,
)

Vectorization is performed using TF-IDF. Several simple classification models are tried below.

## Logistic Regression Classifier

In [None]:
# TF-IDF features feeding a logistic regression classifier.
lr_steps = [
    ('tfidf', TfidfVectorizer()),
    ('lr_clf', LogisticRegression()),
]
pipeline_lr = Pipeline(steps=lr_steps)
pipeline_lr.fit(X_train, y_train)

In [None]:
# Evaluate the logistic regression pipeline on the held-out messages.
y_pred = pipeline_lr.predict(X_test)
lr_report = classification_report(y_test, y_pred)
print(lr_report)

In [None]:
# Confusion matrix of the latest predictions: rows are labelled with the true
# class, columns with the predicted class.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True, fmt='d', cmap='magma', square=True, linewidths=.5,
    xticklabels=['Ham predicted', 'Spam predicted'],
    yticklabels=['Ham', 'Spam'],
)

## Support vector machine

In [None]:
# TF-IDF features feeding a support vector classifier with default settings.
svc_steps = [
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC()),
]
pipeline_svc = Pipeline(steps=svc_steps)
pipeline_svc.fit(X_train, y_train)

In [None]:
# Evaluate the SVC pipeline on the held-out messages.
y_pred = pipeline_svc.predict(X_test)
svc_report = classification_report(y_test, y_pred)
print(svc_report)

In [None]:
# Confusion matrix of the latest predictions: rows are labelled with the true
# class, columns with the predicted class.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True, fmt='d', cmap='magma', square=True, linewidths=.5,
    xticklabels=['Ham predicted', 'Spam predicted'],
    yticklabels=['Ham', 'Spam'],
)

## Gradient Boosting Classifier

In [None]:
# TF-IDF features feeding scikit-learn's GradientBoostingClassifier.
# The variable and step are named 'xgb', but this is not the XGBoost library;
# the names are kept because later cells refer to `pipeline_xgb`.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same fitted model.
pipeline_xgb = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('xgb', GradientBoostingClassifier(random_state=42)),
])
pipeline_xgb.fit(X_train, y_train)

In [None]:
# Evaluate the gradient boosting pipeline on the held-out messages.
y_pred = pipeline_xgb.predict(X_test)
gb_report = classification_report(y_test, y_pred)
print(gb_report)

In [None]:
# Confusion matrix of the latest predictions: rows are labelled with the true
# class, columns with the predicted class.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True, fmt='d', cmap='magma', square=True, linewidths=.5,
    xticklabels=['Ham predicted', 'Spam predicted'],
    yticklabels=['Ham', 'Spam'],
)

# Parameter tuning and finalize model

The SVM worked well, so we tune its hyperparameters with a grid search.

In [None]:
# Fit the vocabulary and IDF weights on the training messages only, then
# reuse the fitted vectorizer for the test messages.
# NOTE(review): the vectorizer is fitted on all of X_train before the
# cross-validated search below, so each validation fold has influenced the
# features; tuning a Pipeline instead would avoid that.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Candidate values for the SVC hyperparameters; every combination is scored
# with 5-fold cross-validation on the training features.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)

grid_search.fit(X_train_tfidf, y_train)

In [None]:
# Score of the refitted best estimator on the held-out test features,
# followed by the winning hyperparameters.
test_score = grid_search.score(X_test_tfidf, y_test)
print(f'Grid search score: {test_score}')
# Label previously read 'Best paramters'.
print(f'Best parameters: {grid_search.best_params_}')

Plot grid search results.

In [None]:
results = pd.DataFrame(grid_search.cv_results_)

# Derive the grid shape from param_grid instead of hard-coding 6x6, so the
# plot stays correct when the candidate lists change. The candidates are
# enumerated with 'C' as the outer loop and 'gamma' as the inner loop
# (parameter names are sorted), hence rows = C and columns = gamma.
n_c = len(param_grid['C'])
n_gamma = len(param_grid['gamma'])
scores = np.array(results.mean_test_score).reshape(n_c, n_gamma)

sns.heatmap(scores, annot=True, cmap='viridis',
            xticklabels=param_grid['gamma'], yticklabels=param_grid['C'],
            square=True, fmt='.3f')
plt.xlabel('gamma')
plt.ylabel('C')
plt.xlabel('gamma')
plt.ylabel('C')

Finally, we predict on the test set using the best SVC model.

In [None]:
# Take the estimator refitted with the best hyperparameters and evaluate it
# on the held-out test features.
final_svc_model = grid_search.best_estimator_
y_pred = final_svc_model.predict(X_test_tfidf)
final_report = classification_report(y_test, y_pred)
print(final_report)

In [None]:
# Confusion matrix of the tuned model's predictions: rows are labelled with
# the true class, columns with the predicted class.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True, fmt='d', cmap='magma', square=True, linewidths=.5,
    xticklabels=['Ham predicted', 'Spam predicted'],
    yticklabels=['Ham', 'Spam'],
)

The End