<h1>Text Classification For Email Dataset</h1>

In [1]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from sklearn.pipeline import Pipeline

%matplotlib inline

In [2]:
df = pd.read_csv('emails.csv')
# df = df[pd.notnull(df['tags'])]
df.head(10)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [3]:
df['text'].apply(lambda x: len(x.split(' '))).sum()

2050165

We have over 10 million words in the data.

In [4]:
my_tags = ['spam','not Spam']


# The classes are very well balanced.

In [5]:
def print_plot(index):
    example = df[df.index == index][['text', 'spam']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Tag:', example[1])

few posts and tags pairs.

In [6]:
print_plot(363)

Subject: be one of our survey takers and we ' ll send you a complimentary laptop computer .  computer survey group needs survey takers in your area now . we ' d like to send you a complimentary laptop computer now for helping us . ( )  qrwzyyvp
Tag: 1


<h2>Cleaning Up the Text </h2>

In [7]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

In [8]:
df['text'] = df['text'].apply(clean_text)

In [9]:
print_plot(10)

subject las vegas high rise boom las vegas fast becoming major metropolitan city 60 + new high rise towers expected built around las vegas strip within next 3 4 years 30 000 + condominiums boom begun buy first early phase pre construction pricing available las vegas high rises including trump cosmopolitan mgm turnberry icon sky among others join interest list http www verticallv com message sent realty one highrise learn www verticallv comif wish excluded future mailings please reply word remove subject line
Tag: 1


Now the Text after Cleaning

In [10]:
df['text'].apply(lambda x: len(x.split(' '))).sum()

901058

Now we have over 3 million words to work with.

In [11]:
X = df.text
y = df.spam
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

### Linear support vector machine

In [12]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=0, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ndom_state=0, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [15]:
y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.983129726585224
              precision    recall  f1-score   support

        spam       0.98      1.00      0.99      1314
    not Spam       0.99      0.93      0.96       405

   micro avg       0.98      0.98      0.98      1719
   macro avg       0.99      0.97      0.98      1719
weighted avg       0.98      0.98      0.98      1719



In [16]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[1312,    2],
       [  27,  378]], dtype=int64)

### Logistic regression

In [17]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

In [18]:
y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.9912739965095986
              precision    recall  f1-score   support

        spam       0.99      1.00      0.99      1314
    not Spam       0.99      0.98      0.98       405

   micro avg       0.99      0.99      0.99      1719
   macro avg       0.99      0.99      0.99      1719
weighted avg       0.99      0.99      0.99      1719



In [19]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[1309,    5],
       [  10,  395]], dtype=int64)

<h2>Naive Bays</h2>

In [20]:
from sklearn.naive_bayes import MultinomialNB

naive = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
               ])
naive.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [21]:
y_pred = naive.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.900523560209424
              precision    recall  f1-score   support

        spam       0.88      1.00      0.94      1314
    not Spam       1.00      0.58      0.73       405

   micro avg       0.90      0.90      0.90      1719
   macro avg       0.94      0.79      0.84      1719
weighted avg       0.91      0.90      0.89      1719



In [22]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[1314,    0],
       [ 171,  234]], dtype=int64)

<h2>K Nearest N</h2>

In [23]:
from sklearn.neighbors import KNeighborsClassifier

Kneis = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', KNeighborsClassifier(n_neighbors = 100, metric = 'minkowski')),
               ])
Kneis.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...',
           metric_params=None, n_jobs=None, n_neighbors=100, p=2,
           weights='uniform'))])

In [24]:
y_pred = Kneis.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.9592786503781268
              precision    recall  f1-score   support

        spam       0.95      1.00      0.97      1314
    not Spam       0.99      0.84      0.91       405

   micro avg       0.96      0.96      0.96      1719
   macro avg       0.97      0.92      0.94      1719
weighted avg       0.96      0.96      0.96      1719



In [25]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[1310,    4],
       [  66,  339]], dtype=int64)

<h2>Random Forest</h2>

In [26]:
from sklearn.ensemble import RandomForestClassifier

random = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=20,random_state=0,criterion='entropy')),
               ])
random.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...imators=20, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])

In [27]:

y_pred = random.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.9662594531704479
              precision    recall  f1-score   support

        spam       0.96      0.99      0.98      1314
    not Spam       0.98      0.87      0.92       405

   micro avg       0.97      0.97      0.97      1719
   macro avg       0.97      0.93      0.95      1719
weighted avg       0.97      0.97      0.97      1719



In [28]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[1307,    7],
       [  51,  354]], dtype=int64)