In [2]:
import os
import numpy as np
import pandas as pd
dataset = pd.read_csv('SPAM_text_message.csv')
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
dataset['Category'].value_counts()/len(dataset)*100

Category
ham     86.593683
spam    13.406317
Name: count, dtype: float64

In [5]:
dataset.isnull().sum().sum()

0

In [7]:
ham = dataset[dataset['Category']=='ham']
spam = dataset[dataset['Category']=='spam']

In [8]:
print(ham.shape, spam.shape)

(4825, 2) (747, 2)


In [9]:
ham

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...
...,...,...
5565,ham,Huh y lei...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [10]:
spam

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [14]:
# Balance the dataset
spam = spam.sample(ham.shape[0], replace=True)

In [11]:
ham.shape[0]

4825

In [15]:
spam.shape[0]

4825

In [21]:
data = pd.concat([ham, spam])

In [22]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...


In [23]:
data.shape

(9650, 2)

In [24]:
# split the data into training and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data['Message'],data['Category'], test_size=0.3, random_state=42)

# Feature Extraction and Model Building

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [27]:
classifier = Pipeline([('bow', CountVectorizer()), 
                       ('rfmodel', RandomForestClassifier())])

In [28]:
classifier.fit(x_train, y_train)

In [29]:
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

In [30]:
y_pred_test

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'spam'], dtype=object)

In [31]:
# Evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [32]:
print(confusion_matrix(y_train, y_pred_train))
print("****************************************************")
print(confusion_matrix(y_test, y_pred_test))

[[3363    0]
 [   0 3392]]
****************************************************
[[1462    0]
 [   3 1430]]


In [33]:
print(classification_report(y_train, y_pred_train))
print("****************************************************")
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3363
        spam       1.00      1.00      1.00      3392

    accuracy                           1.00      6755
   macro avg       1.00      1.00      1.00      6755
weighted avg       1.00      1.00      1.00      6755

****************************************************
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      1462
        spam       1.00      1.00      1.00      1433

    accuracy                           1.00      2895
   macro avg       1.00      1.00      1.00      2895
weighted avg       1.00      1.00      1.00      2895



In [34]:
print("Training Accuracy :", accuracy_score(y_train, y_pred_train))
print("****************************************************")
print("Test Accuracy :",accuracy_score(y_test, y_pred_test))

Training Accuracy : 1.0
****************************************************
Test Accuracy : 0.9989637305699481


In [35]:
# Check your model performance 
test1 =["Hello, Hope you all are doing well."]
test2 = ["Hi, hope you all are enjoying with NLP session"]
test3 = ["Congratulations, you won a lottery ticket worth $100 millions ! To claim call @111111 and also share your account details so the we can do smooth traction."]

In [36]:
print(classifier.predict(test1))
print()
print(classifier.predict(test2))
print()
print(classifier.predict(test3))

['ham']

['ham']

['spam']
