In [24]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import nltk
import unicodedata
import re
import env
from sklearn.model_selection import train_test_split
from requests import get
from bs4 import BeautifulSoup
import os
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt']


def clean2(text):
    """Normalize a message and return its lemmatized, stopword-free tokens.

    Steps: NFKD-normalize, drop whatever cannot be encoded as ASCII,
    lower-case, strip every character that is neither a word character nor
    whitespace, split on whitespace, discard stopwords (NLTK's English list
    plus ADDITIONAL_STOPWORDS) and lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list
        Lemmatized tokens in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # Use a set so each membership test below is a hash lookup instead of a
    # linear scan over the whole stopword list.
    stopwords = set(nltk.corpus.stopwords.words('english')).union(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [3]:
# Load the labelled messages, using the file's `id` column as the index.
df = pd.read_csv(filepath_or_buffer='ham.csv', index_col='id')

In [4]:
# Peek at the first rows as loaded from disk.
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Remove the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell
# idempotent: re-running it after the column is already gone is a no-op
# instead of a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Confirm only `label` and `text` remain.
df.head(n=5)

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
def clean(text: str) -> list:
    """Turn a raw message into a list of lemmatized, stopword-free tokens.

    The text is stripped of non-ASCII characters, lower-cased, purged of
    every character that is neither a word character nor whitespace, split
    on whitespace, filtered against NLTK's English stopwords and lemmatized.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    english_stopwords = set(nltk.corpus.stopwords.words('english'))

    # Round-trip through ASCII so characters outside it are silently dropped.
    ascii_bytes = text.encode('ascii', 'ignore')
    lowered = ascii_bytes.decode('utf-8', 'ignore').lower()

    # Strip punctuation, then tokenize on whitespace.
    tokens = re.sub(r'[^\w\s]', '', lowered).split()

    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

In [8]:
# Store each message's cleaned tokens re-joined into one space-separated string.
df['clean_text'] = df['text'].map(lambda message: ' '.join(clean(message)))

In [9]:
# Compare the raw `text` with the derived `clean_text`.
df.head(n=5)

Unnamed: 0_level_0,label,text,clean_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


In [10]:
# Two-stage stratified split: hold out the test set first, then carve the
# validate set out of what remains. Both stages share one seed.
split_seed = 1349
train_validate, test = train_test_split(
    df,
    train_size=0.8,
    stratify=df['label'],
    random_state=split_seed,
)
train, validate = train_test_split(
    train_validate,
    train_size=0.7,
    stratify=train_validate['label'],
    random_state=split_seed,
)

In [11]:
# (rows, columns) of each split, in train / validate / test order.
tuple(part.shape for part in (train, validate, test))

((3119, 3), (1338, 3), (1115, 3))

In [12]:
# Features are the cleaned message text; the target is the ham/spam label.
X_train, y_train = train['clean_text'], train['label']
X_validate, y_validate = validate['clean_text'], validate['label']
X_test, y_test = test['clean_text'], test['label']

In [13]:
# Sample of the training features.
X_train.head(n=5)

id
4803    er hello thing didnt quite go plan limping slo...
4345                       still around could use half8th
4218                               anything lor go go lor
1876                              watching tv got new job
4052                                        slept timeyou
Name: clean_text, dtype: object

In [14]:
# Matching sample of the training labels.
y_train.head(n=5)

id
4803    ham
4345    ham
4218    ham
1876    ham
4052    ham
Name: label, dtype: object

## Model DTC

In [15]:
# Learn the TF-IDF vocabulary from the training messages only, and encode
# them in the same pass.
tfidf = TfidfVectorizer()
X_bow = tfidf.fit_transform(raw_documents=X_train)

In [16]:
# Shallow decision tree on the TF-IDF features. random_state is pinned
# because sklearn permutes the candidate features at every split, so an
# unseeded tree can differ between runs when splits tie.
tree = DecisionTreeClassifier(max_depth=3, random_state=1349)
tree.fit(X_bow, y_train)
# Accuracy on the data the tree was fitted on.
tree.score(X_bow, y_train)

0.9304264187239499

In [18]:
# Confusion table on train: rows are predictions, columns are true labels.
tree_preds = tree.predict(X_bow)
pd.crosstab(index=tree_preds, columns=y_train)

label,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,2649,165
spam,52,253


In [20]:
# The vectorizer is fitted on train only; validate (and later test) data is
# just transformed with that fitted vocabulary, never used for fitting.
X_validate_bow = tfidf.transform(X_validate)
accuracy_score(y_validate, tree.predict(X_validate_bow))

0.922272047832586

In [23]:
# Headline accuracy on both splits, then the per-class breakdown for each.
tree_train_acc = round(tree.score(X_bow, y_train), 4)
tree_validate_acc = round(tree.score(X_validate_bow, y_validate), 4)
print(f'Accuracy-Train {tree_train_acc}')
print(f'Accuracy-Validate {tree_validate_acc}')
tree_validate_preds = tree.predict(X_validate_bow)
print(classification_report(y_train, tree_preds))
print(classification_report(y_validate, tree_validate_preds))

Accuracy-Train 0.9304
Accuracy-Validate 0.9223
              precision    recall  f1-score   support

         ham       0.94      0.98      0.96      2701
        spam       0.83      0.61      0.70       418

    accuracy                           0.93      3119
   macro avg       0.89      0.79      0.83      3119
weighted avg       0.93      0.93      0.93      3119

              precision    recall  f1-score   support

         ham       0.93      0.98      0.96      1158
        spam       0.83      0.53      0.65       180

    accuracy                           0.92      1338
   macro avg       0.88      0.76      0.80      1338
weighted avg       0.92      0.92      0.91      1338



## Multinomial NB

In [26]:
# Multinomial naive Bayes on the TF-IDF matrix; fit() returns the estimator.
mnb = MultinomialNB().fit(X_bow, y_train)
# Accuracy on the training split.
mnb.score(X_bow, y_train)

0.9721064443731965

In [27]:
# Train-set confusion table: predicted label (rows) vs. true label (columns).
mnb_preds = mnb.predict(X_bow)
pd.crosstab(index=mnb_preds, columns=y_train)

label,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,2701,87
spam,0,331


In [28]:
# Encode validate with the train-fitted `tfidf` vectorizer and score on it.
X_validate_bow = tfidf.transform(X_validate)
accuracy_score(y_validate, mnb.predict(X_validate_bow))

0.9551569506726457

In [29]:
# Accuracy summary followed by train and validate classification reports.
mnb_train_acc = round(mnb.score(X_bow, y_train), 4)
mnb_validate_acc = round(mnb.score(X_validate_bow, y_validate), 4)
print(f'Accuracy-Train {mnb_train_acc}')
print(f'Accuracy-Validate {mnb_validate_acc}')
mnb_validate_preds = mnb.predict(X_validate_bow)
print(classification_report(y_train, mnb_preds))
print(classification_report(y_validate, mnb_validate_preds))

Accuracy-Train 0.9721
Accuracy-Validate 0.9552
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      2701
        spam       1.00      0.79      0.88       418

    accuracy                           0.97      3119
   macro avg       0.98      0.90      0.93      3119
weighted avg       0.97      0.97      0.97      3119

              precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1158
        spam       1.00      0.67      0.80       180

    accuracy                           0.96      1338
   macro avg       0.98      0.83      0.89      1338
weighted avg       0.96      0.96      0.95      1338



## Random Forest Classifier (max_depth=6)

In [45]:
# Random forest of 201 depth-6 trees. Bootstrapping and per-split feature
# sampling are random, so random_state is pinned to make the scores below
# reproducible across kernel restarts.
rf6 = RandomForestClassifier(
    n_estimators=201,
    max_depth=6,
    min_samples_leaf=1,
    random_state=1349,
)
rf6.fit(X_bow, y_train)
# Accuracy on the training split.
rf6.score(X_bow, y_train)

0.8727156139788393

In [46]:
# Cross-tabulate forest predictions (rows) against true train labels (columns).
rf6_preds = rf6.predict(X_bow)
pd.crosstab(index=rf6_preds, columns=y_train)

label,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,2701,397
spam,0,21


In [47]:
# Validate features come from the vectorizer fitted on train.
X_validate_bow = tfidf.transform(X_validate)
accuracy_score(y_validate, rf6.predict(X_validate_bow))

0.8736920777279522

In [48]:
# Accuracy on both splits, then precision/recall/F1 per class.
rf6_train_acc = round(rf6.score(X_bow, y_train), 4)
rf6_validate_acc = round(rf6.score(X_validate_bow, y_validate), 4)
print(f'Accuracy-Train {rf6_train_acc}')
print(f'Accuracy-Validate {rf6_validate_acc}')
rf6_validate_preds = rf6.predict(X_validate_bow)
print(classification_report(y_train, rf6_preds))
print(classification_report(y_validate, rf6_validate_preds))

Accuracy-Train 0.8727
Accuracy-Validate 0.8737
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      2701
        spam       1.00      0.05      0.10       418

    accuracy                           0.87      3119
   macro avg       0.94      0.53      0.51      3119
weighted avg       0.89      0.87      0.82      3119

              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1158
        spam       1.00      0.06      0.12       180

    accuracy                           0.87      1338
   macro avg       0.94      0.53      0.52      1338
weighted avg       0.89      0.87      0.82      1338



## Now use a bag of n-grams (bigrams)

## DTC

In [49]:
# Bigram-only TF-IDF: ngram_range=(2, 2) keeps two-word sequences and no
# single words. NOTE: this rebinds X_bow, so every cell from here on sees the
# bigram matrix under that name.
tfidf2 = TfidfVectorizer(
    ngram_range=(2, 2),
)
X_bow = tfidf2.fit_transform(raw_documents=X_train)

In [50]:
# Depth-3 decision tree refitted on the current X_bow. random_state is pinned
# because sklearn permutes candidate features at each split, which otherwise
# makes tied splits (and therefore the scores) vary between runs.
tree = DecisionTreeClassifier(max_depth=3, random_state=1349)
tree.fit(X_bow, y_train)
# Accuracy on the training split.
tree.score(X_bow, y_train)

0.8820134658544405

In [51]:
# Train-set confusion table for the refitted tree (rows: predicted, columns: true).
tree_preds = tree.predict(X_bow)
pd.crosstab(index=tree_preds, columns=y_train)

label,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,2701,368
spam,0,50


In [53]:
# Fit on train only: validate/test data is transformed with the already
# fitted `tfidf2` vocabulary and never used to fit it.
X_validate_bow = tfidf2.transform(X_validate)
accuracy_score(y_validate, tree.predict(X_validate_bow))

0.8796711509715994

In [54]:
# Train/validate accuracy and the per-class reports for the refitted tree.
tree_train_acc = round(tree.score(X_bow, y_train), 4)
tree_validate_acc = round(tree.score(X_validate_bow, y_validate), 4)
print(f'Accuracy-Train {tree_train_acc}')
print(f'Accuracy-Validate {tree_validate_acc}')
tree_validate_preds = tree.predict(X_validate_bow)
print(classification_report(y_train, tree_preds))
print(classification_report(y_validate, tree_validate_preds))

Accuracy-Train 0.882
Accuracy-Validate 0.8797
              precision    recall  f1-score   support

         ham       0.88      1.00      0.94      2701
        spam       1.00      0.12      0.21       418

    accuracy                           0.88      3119
   macro avg       0.94      0.56      0.57      3119
weighted avg       0.90      0.88      0.84      3119

              precision    recall  f1-score   support

         ham       0.88      1.00      0.93      1158
        spam       0.91      0.12      0.21       180

    accuracy                           0.88      1338
   macro avg       0.90      0.56      0.57      1338
weighted avg       0.88      0.88      0.84      1338



## Multinomial NB

In [55]:
# Refit multinomial naive Bayes on the current X_bow; fit() returns the estimator.
mnb = MultinomialNB().fit(X_bow, y_train)
# Accuracy on the training split.
mnb.score(X_bow, y_train)

0.9628085924975954

In [56]:
# Predicted (rows) vs. true (columns) labels on the training split.
mnb_preds = mnb.predict(X_bow)
pd.crosstab(index=mnb_preds, columns=y_train)

label,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,2701,116
spam,0,302


In [57]:
# Encode validate with the train-fitted `tfidf2` vectorizer and score on it.
X_validate_bow = tfidf2.transform(X_validate)
accuracy_score(y_validate, mnb.predict(X_validate_bow))

0.9252615844544095

In [58]:
# Accuracy summary followed by train and validate classification reports.
mnb_train_acc = round(mnb.score(X_bow, y_train), 4)
mnb_validate_acc = round(mnb.score(X_validate_bow, y_validate), 4)
print(f'Accuracy-Train {mnb_train_acc}')
print(f'Accuracy-Validate {mnb_validate_acc}')
mnb_validate_preds = mnb.predict(X_validate_bow)
print(classification_report(y_train, mnb_preds))
print(classification_report(y_validate, mnb_validate_preds))

Accuracy-Train 0.9628
Accuracy-Validate 0.9253
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      2701
        spam       1.00      0.72      0.84       418

    accuracy                           0.96      3119
   macro avg       0.98      0.86      0.91      3119
weighted avg       0.96      0.96      0.96      3119

              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      1158
        spam       1.00      0.44      0.62       180

    accuracy                           0.93      1338
   macro avg       0.96      0.72      0.79      1338
weighted avg       0.93      0.93      0.91      1338



## Logistic Regression

In [60]:
# Logistic regression with sklearn's default settings on the current X_bow;
# fit() returns the estimator.
logit = LogisticRegression().fit(X_bow, y_train)
# Accuracy on the training split.
logit.score(X_bow, y_train)

0.8794485411991023

In [61]:
# Train-set confusion table: predictions in rows, true labels in columns.
logit_preds = logit.predict(X_bow)
pd.crosstab(index=logit_preds, columns=y_train)

label,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,2701,376
spam,0,42


In [62]:
# Validate features come from the `tfidf2` vectorizer fitted on train.
X_validate_bow = tfidf2.transform(X_validate)
accuracy_score(y_validate, logit.predict(X_validate_bow))

0.8804185351270553

In [63]:
# Accuracy on both splits, then precision/recall/F1 per class.
logit_train_acc = round(logit.score(X_bow, y_train), 4)
logit_validate_acc = round(logit.score(X_validate_bow, y_validate), 4)
print(f'Accuracy-Train {logit_train_acc}')
print(f'Accuracy-Validate {logit_validate_acc}')
logit_validate_preds = logit.predict(X_validate_bow)
print(classification_report(y_train, logit_preds))
print(classification_report(y_validate, logit_validate_preds))

Accuracy-Train 0.8794
Accuracy-Validate 0.8804
              precision    recall  f1-score   support

         ham       0.88      1.00      0.93      2701
        spam       1.00      0.10      0.18       418

    accuracy                           0.88      3119
   macro avg       0.94      0.55      0.56      3119
weighted avg       0.89      0.88      0.83      3119

              precision    recall  f1-score   support

         ham       0.88      1.00      0.94      1158
        spam       1.00      0.11      0.20       180

    accuracy                           0.88      1338
   macro avg       0.94      0.56      0.57      1338
weighted avg       0.89      0.88      0.84      1338

