In [1]:
%autosave 0

Autosave disabled


In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from env import get_connection
from prepare import clean, lemmatize

Defining a url and query that we can use to retrieve data from Codeup's SQL server

In [3]:
url = get_connection('spam_db')

query = 'SELECT * FROM spam'

df = pd.read_sql(query, url, index_col='id')

df.head()

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['clean_text'] = df.text.apply(clean, args=['us']) # .apply is the function to each entry of df
df.head()

Unnamed: 0_level_0,label,text,clean_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think goes usf lives around though


I choose to apply lemmatization because the dataset is not giganitc 

In [5]:
df['lemmas'] = df.clean_text.apply(lemmatize)
df.head()

Unnamed: 0_level_0,label,text,clean_text,lemmas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,dun say early hor c already say,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think goes usf lives around though,nah ' think go usf life around though


In [6]:
X = df.clean_text
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    random_state=31)

In [7]:
baseline_acc = y_train.value_counts().max()/y_train.shape[0]*100
print(f'Hello Edwige this is my baseline accuracy: {round(baseline_acc, 2)}')

Hello Edwige this is my baseline accuracy: 86.51


In [8]:
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf[:10]

<10x7576 sparse matrix of type '<class 'numpy.float64'>'
	with 74 stored elements in Compressed Sparse Row format>

In [9]:
lm = LogisticRegression()

lm.fit(X_train_tfidf, y_train)

In [10]:
y_train_res = pd.DataFrame({'actual': y_train, 
                           'preds': lm.predict(X_train_tfidf)})
y_train_res

Unnamed: 0_level_0,actual,preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5422,ham,ham
681,ham,ham
3515,ham,ham
157,ham,ham
3386,ham,ham
...,...,...
826,ham,ham
3735,ham,ham
2942,ham,ham
2064,ham,ham


In [11]:
y_train.shape

(3900,)

In [12]:
print(classification_report(y_train_res.actual, y_train_res.preds))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      3374
        spam       0.99      0.72      0.83       526

    accuracy                           0.96      3900
   macro avg       0.97      0.86      0.91      3900
weighted avg       0.96      0.96      0.96      3900



why was the recall is lower on the spam?

In [13]:
y_train.value_counts()

ham     3374
spam     526
Name: label, dtype: int64

In [14]:
y_test_res = pd.DataFrame({'actual':y_test,
                          'preds':lm.predict(X_test_tfidf)})
y_test_res.head()

Unnamed: 0_level_0,actual,preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1187,ham,ham
661,ham,ham
3037,ham,ham
1314,ham,ham
3206,ham,ham


In [15]:
print(classification_report(y_test_res.actual, y_test_res.preds))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1451
        spam       0.99      0.71      0.83       221

    accuracy                           0.96      1672
   macro avg       0.97      0.86      0.90      1672
weighted avg       0.96      0.96      0.96      1672

