In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the labelled messages (columns: Category, Message) and preview the first rows.
dataset_path = '../resources/datasets/spam_dataset.csv'
df1 = pd.read_csv(dataset_path)
df1.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Converting the labels to numeric data
**ML models generally work with numeric data, so the text labels (`spam` / `ham`) are encoded as `1` / `0`.**

In [5]:
# Encode the text labels as integers: spam -> 1, ham -> 0.
mapping = {
    'spam': 1,
    'ham': 0
}

# Translate only values that are still text labels. Already-encoded values pass
# through unchanged, so re-running this cell is safe; a plain .map(mapping)
# would turn the whole column into NaN on its second execution.
df1['Category'] = df1['Category'].map(lambda label: mapping.get(label, label))

# Fail fast on missing or unknown labels instead of passing them on to training.
unexpected_labels = set(df1['Category'].unique()) - set(mapping.values())
assert not unexpected_labels, f"Unexpected Category values: {unexpected_labels}"

df1.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Dataset dimensions as (rows, columns).
(len(df1.index), len(df1.columns))

(5572, 2)

In [7]:
from sklearn.model_selection import train_test_split

# Inputs are the raw message texts; the target is the encoded Category column.
X, y = df1['Message'], df1['Category']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_1, x_2, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=11,
)

## Converting Messages to meaningful numeric data

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: lowercases the text and drops English stop words.
features = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=1,
)

# Fit on the training messages only, then reuse the fitted vectorizer for the
# held-out messages so both splits share the same feature space.
x_train = features.fit_transform(x_1)
x_test = features.transform(x_2)

## Generating the model

In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyperparameters, trained on the TF-IDF features.
lr = LogisticRegression()
lr.fit(X=x_train, y=y_train)

In [10]:
# Predict labels for both splits so training and held-out behaviour can be compared.
y_pred_train, y_pred_test = lr.predict(x_train), lr.predict(x_test)

print("Predictions (Training) = ", y_pred_train)
print("Predictions (Testing) = ", y_pred_test)

Predictions (Training) =  [0 1 0 ... 1 0 0]
Predictions (Testing) =  [1 0 0 ... 0 0 0]


In [11]:
from sklearn.metrics import accuracy_score

# Accuracy on each split; comparing the two numbers shows how well the model
# carries over from the training data to the held-out data.
accuracy_training = accuracy_score(y_true=y_train, y_pred=y_pred_train)
accuracy_testing = accuracy_score(y_true=y_test, y_pred=y_pred_test)

print("Accuracy (Training) = ", accuracy_training)
print("Accuracy (Testing) = ", accuracy_testing)

Accuracy (Training) =  0.9664102564102565
Accuracy (Testing) =  0.9611244019138756


## Exporting the model to make it reusable

In [12]:
import pickle

# The vectorizer has to be exported together with the model because it was
# fitted on the training data as well. New messages must be transformed with
# this fitted vectorizer; a fresh, unfitted one would probably raise
# 'NotFittedError'.
#
# `with` closes (and flushes) each file even if pickle.dump raises, so a failed
# export cannot leak a file handle.
with open('../models/vectorizer.pkl', 'wb') as f_vec:
    pickle.dump(features, f_vec)

with open('../models/model.pkl', 'wb') as f_lr:
    pickle.dump(lr, f_lr)

## Testing the exported model

In [14]:
# Reload the exported artefacts from disk and run them on two sample messages.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load pickle files that you created yourself or otherwise trust.
# `with` makes sure the file handles are closed once loading is done.
with open('../models/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open('../models/model.pkl', 'rb') as model_file:
    classifier = pickle.load(model_file)

# you have to provide the input in a list format
sample_input_1 = ['Congratulations! You have won a lottery worth $100,000. Click the link below to claim your reward!']
sample_input_2 = ['Hi, how are you?']

# convert the above input to numeric features with the fitted vectorizer
sample_input_converted_1 = vectorizer.transform(sample_input_1)
sample_input_converted_2 = vectorizer.transform(sample_input_2)

res_1 = classifier.predict(sample_input_converted_1)
res_2 = classifier.predict(sample_input_converted_2)

# Each prediction array holds one label per input message; 1 means spam.
for sample_no, prediction in enumerate((res_1, res_2), start=1):
    verdict = 'SPAM!!!' if prediction[0] == 1 else 'All Good'
    print(f'Result {sample_no} - {verdict}')

Result 1 - SPAM!!!
Result 2 - All Good
