In [13]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup
import unicodedata
import nltk

import os
import time
import re

from prepare import basic_clean, lemmatize
import acquire
from pprint import pprint

%matplotlib inline
import matplotlib.pyplot as plt

from prepare import basic_clean, lemmatize

# imports for visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm # for colormap tools
import seaborn as sns
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image


# imports for modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer



from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression

# visualization settings
plt.rc('figure', figsize=(13, 7))
plt.rc('font', size=16)
plt.style.use('seaborn-whitegrid')
sns.set_palette('twilight')
# sns.color_palette("cubehelix", as_cmap=True)
plt.rc('font', size=16)

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

from env import password, user, host

In [14]:
def get_db_url(database, host=host, user=user, password=password):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

url = get_db_url("spam_db")
sql = "SELECT * FROM spam"

df = pd.read_sql(sql, url, index_col="id")
df.head()

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# TFIDF

In [37]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

In [38]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.44%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3857   112
spam          2   486
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      3859
        spam       1.00      0.81      0.90       598

    accuracy                           0.97      4457
   macro avg       0.98      0.91      0.94      4457
weighted avg       0.98      0.97      0.97      4457



In [39]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 96.32%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        966    41
spam         0   108
---
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [40]:
train

Unnamed: 0_level_0,actual,predicted
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2904,ham,ham
2964,ham,ham
5239,ham,ham
4015,ham,ham
4983,spam,spam
...,...,...
2242,ham,ham
4626,spam,spam
2193,ham,ham
2006,ham,ham


In [41]:
test

Unnamed: 0_level_0,actual,predicted
id,Unnamed: 1_level_1,Unnamed: 2_level_1
759,ham,ham
879,spam,spam
14,ham,ham
3041,ham,ham
4028,ham,ham
...,...,...
638,ham,ham
4455,ham,ham
2985,spam,spam
2117,ham,ham


# Desision Tree

In [42]:
# create Decision Tree model
dt = DecisionTreeClassifier(max_depth=4, random_state = 833)

# fit the model
dt = dt.fit(X_train, y_train)

In [43]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))


train['predicted'] = dt.predict(X_train)
test['predicted'] = dt.predict(X_test)

In [44]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))


Accuracy: 95.29%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3787   138
spam         72   460
---
              precision    recall  f1-score   support

         ham       0.96      0.98      0.97      3859
        spam       0.86      0.77      0.81       598

    accuracy                           0.95      4457
   macro avg       0.91      0.88      0.89      4457
weighted avg       0.95      0.95      0.95      4457



In [45]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))


Accuracy: 94.80%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        944    36
spam        22   113
---
              precision    recall  f1-score   support

         ham       0.96      0.98      0.97       966
        spam       0.84      0.76      0.80       149

    accuracy                           0.95      1115
   macro avg       0.90      0.87      0.88      1115
weighted avg       0.95      0.95      0.95      1115

