### Load packages

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import warnings
from sklearn.metrics import recall_score, confusion_matrix, accuracy_score
import pickle

### Load dataset

In [3]:
data = pd.read_csv("Dataset/sms_spam.csv")
data.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Categorical Label Encoding 

In [4]:
data["type"] = data.type.map({'ham':0,'spam':1})
data.head()

Unnamed: 0,type,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Data Cleaning

In [5]:
def clean(x):
    x = re.sub(r'<.*>',' ', x)
    x = re.sub(r"You're", 'You are', x)
    x = re.sub(r"I'm", 'I am', x)
    x = re.sub(r"can't", 'can not', x)
    x = re.sub(r"haven't", "have not", x)
    x = re.sub(r"didn't", "did not", x)
    x = re.sub(r"don't", "do not", x)
    x = re.sub(r"[\d-]{10,12}", "mobno", x)
    x = re.sub(r"[^A-Z-a-z]"," ", x)
    x = re.sub(r"\s+"," ", x)
    return x.lower()

In [6]:
data["text"] = data.text.apply(clean)

In [7]:
data.head()

Unnamed: 0,type,text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final ...
3,0,u dun say so early hor u c already then say
4,0,nah i do not think he goes to usf he lives aro...


### Data Analysis

In [8]:
data.shape

(5574, 2)

In [9]:
data.type.value_counts()

0    4827
1     747
Name: type, dtype: int64

### Split Data

In [10]:
xtrain, xtest, ytrain, ytest = train_test_split(data.text.values,
                                               data.type.values,
                                               test_size=.20,
                                               random_state=10)

In [11]:
print(xtrain.shape, xtest.shape)

(4459,) (1115,)


### Tokenization

In [12]:
cv = CountVectorizer()
cv_train = cv.fit_transform(xtrain).toarray()
cv_test = cv.transform(xtest).toarray()

### Parameter Tuning via gridsearchcv

In [13]:
p_gris = {
    'C':[.001,.01,.1,1,10]
}
svm_p_gris = {
    'C':[.001,.01,.1,1,10],
     'gamma':[.001,.01,.1,1,10],
    'kernel':['linear','rbf','sigmoid']
}
kf = KFold(10, random_state=10, shuffle=True)

In [14]:
gs = GridSearchCV(LogisticRegression(), param_grid=p_gris, cv=kf, 
                  scoring='recall')

In [15]:
warnings.filterwarnings('ignore')

gs.fit(cv_train, ytrain)

GridSearchCV(cv=KFold(n_splits=10, random_state=10, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10]}, scoring='recall')

In [16]:
lg = gs.best_estimator_
lg

LogisticRegression(C=10)

### Train Model

In [17]:
lg = LogisticRegression(C=10,class_weight={1:50})
lg.fit(cv_train, ytrain)

LogisticRegression(C=10, class_weight={1: 50})

### Predictions

In [18]:
log_pred = lg.predict(cv_test)

### Evaluation

In [19]:
print("Logistic Reg:")
print(confusion_matrix(ytest, log_pred))
print()

Logistic Reg:
[[989   0]
 [ 12 114]]



In [20]:
print("Logistic Regression score: ",recall_score(ytest, log_pred))
per=recall_score(ytest, log_pred)*100
print("Accuracy: ",per,"%")

Logistic Regression score:  0.9047619047619048
Accuracy:  90.47619047619048 %


### Checking Predictions

In [21]:
df = pd.DataFrame()
df["Text"] = xtest
df["Actual"]= ytest
df["Prediction"] = log_pred
df["Prediction"]=df.Prediction.map({0:"ham",1:"spam"})
df["Actual"]=df.Actual.map({0:"ham",1:"spam"})

In [22]:
df.iloc[1:30,:]

Unnamed: 0,Text,Actual,Prediction
1,just got part nottingham - hrs miles good thin...,ham,ham
2,just nw i came to hme da,ham,ham
3,ok lor but buy wat,ham,ham
4,ard like dat y,ham,ham
5,aight let me know when you re gonna be around usf,ham,ham
6,so i could kiss and feel you next to me,ham,ham
7,dude sux for snake he got old and raiden got buff,ham,ham
8,urgent please call mobno from landline your ab...,spam,spam
9,yes it completely in out of form clark also ut...,ham,ham
10,in the simpsons movie released in july name th...,spam,ham


### Save model to a file

In [23]:
f1 = open("Models/model.pkl","wb")
pickle.dump(lg,f1)
f1.close()

In [24]:
f2 = open('Models/cv.pkl','wb')
pickle.dump(cv,f2)
f2.close()