<h1>Text Classification For SMS 
    Spam Dataset</h1>

In [1]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from sklearn.pipeline import Pipeline

%matplotlib inline

In [2]:
df = pd.read_csv('spam.csv',encoding = 'latin1')
# df = df[pd.notnull(df['tags'])]
df.head(10)

Unnamed: 0,type,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [3]:
df['Message'].apply(lambda x: len(x.split(' '))).sum()

86961

We have over 10 million words in the data.

In [4]:
my_tags = ['spam','Not Spam']



# The classes are very well balanced.

In [5]:
def print_plot(index):
    example = df[df.index == index][['Message', 'type']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Tag:', example[1])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
type          5572 non-null object
Message       5572 non-null object
Unnamed: 2    50 non-null object
Unnamed: 3    12 non-null object
Unnamed: 4    6 non-null object
dtypes: object(5)
memory usage: 217.7+ KB


few posts and tags pairs.

In [7]:
print_plot(93)

Please call our customer service representative on 0800 169 6031 between 10am-9pm as you have WON a guaranteed å£1000 cash or å£5000 prize!
Tag: spam


<h2>Cleaning Up the Text </h2>

In [8]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text


In [9]:
df['Message'] = df['Message'].apply(clean_text)

In [10]:
print_plot(10)

im gonna home soon dont want talk stuff anymore tonight k ive cried enough today
Tag: ham


Now the Text after Cleaning

In [11]:
df['Message'].apply(lambda x: len(x.split(' '))).sum()

52750

In [12]:
X = df.Message
y = df.type
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [13]:
X_train[1]

'ok lar joking wif u oni'

### Linear support vector machine

In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=0, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ndom_state=0, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [15]:
y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.9587443946188341
              precision    recall  f1-score   support

        spam       0.95      1.00      0.98       949
    Not Spam       0.99      0.73      0.84       166

   micro avg       0.96      0.96      0.96      1115
   macro avg       0.97      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [16]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[948,   1],
       [ 45, 121]], dtype=int64)

### Logistic regression

In [17]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

In [18]:


y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.9820627802690582
              precision    recall  f1-score   support

        spam       0.98      1.00      0.99       949
    Not Spam       0.99      0.89      0.94       166

   micro avg       0.98      0.98      0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [19]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[947,   2],
       [ 18, 148]], dtype=int64)

In [30]:
from sklearn.externals import joblib
joblib.dump(logreg,"Logistic_spam.sav")


['Logistic_spam.sav']

<h2>Naive Bays</h2>

In [20]:
from sklearn.naive_bayes import MultinomialNB

naive = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
               ])
naive.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [21]:

y_pred = naive.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.9587443946188341
              precision    recall  f1-score   support

        spam       0.95      1.00      0.98       949
    Not Spam       1.00      0.72      0.84       166

   micro avg       0.96      0.96      0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [22]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[949,   0],
       [ 46, 120]], dtype=int64)

<h2>K Nearest N</h2>

In [23]:
from sklearn.neighbors import KNeighborsClassifier

Kneis = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', KNeighborsClassifier(n_neighbors = 15, metric = 'minkowski')),
               ])
Kneis.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...i',
           metric_params=None, n_jobs=None, n_neighbors=15, p=2,
           weights='uniform'))])

In [24]:
y_pred = Kneis.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.8600896860986547
              precision    recall  f1-score   support

        spam       0.86      1.00      0.92       949
    Not Spam       1.00      0.06      0.11       166

   micro avg       0.86      0.86      0.86      1115
   macro avg       0.93      0.53      0.52      1115
weighted avg       0.88      0.86      0.80      1115



In [25]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[949,   0],
       [156,  10]], dtype=int64)

<h2>Random Forest</h2>

In [26]:
from sklearn.ensemble import RandomForestClassifier

random = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=20,random_state=0,criterion='entropy')),
               ])
random.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...imators=20, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])

In [27]:
y_pred = random.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.9641255605381166
              precision    recall  f1-score   support

        spam       0.96      1.00      0.98       949
    Not Spam       1.00      0.76      0.86       166

   micro avg       0.96      0.96      0.96      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.96      0.96      1115



In [28]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[949,   0],
       [ 40, 126]], dtype=int64)