In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
data = pd.read_table("SMSSpamCollection.txt",
                    sep="\t",
                    header=None,
                    names=["label", "messages"])
data.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   label     5572 non-null   object
 1   messages  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
data.describe()

Unnamed: 0,label,messages
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
data.shape

(5572, 2)

In [7]:
#Explotary Data Analysis
print("Ham", data["label"][data.label=="ham"].count())
print("Spam", data["label"][data.label=="spam"].count())

Ham 4825
Spam 747


In [8]:
data.isnull().sum()

label       0
messages    0
dtype: int64

## Text Preprocessing

In [10]:
def text_preprocess(x):
    x = str(x).lower()
    x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will") 
    return x

data["Preprocessed Text"] = data["messages"].apply(lambda x: text_preprocess(x))
data.head()

Unnamed: 0,label,messages,Preprocessed Text
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro..."


## Feature Engineering

In [11]:
data["label"]=data.label.map({'ham':0, 'spam':1})

## Data Development

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data["messages"], data["label"], random_state=1)
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)

X_train:  (4179,)
X_test:  (1393,)
y_train:  (4179,)
y_test:  (1393,)


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
training_data=count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

## Model Development

In [17]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes=MultinomialNB()
naive_bayes.fit(training_data, y_train)
pred = naive_bayes.predict(testing_data)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy Score: ", format(accuracy_score(y_test, pred)))
print("Precision Score: ", format(precision_score(y_test, pred)))
print("Recall Score: ", format(recall_score(y_test, pred)))
print("F1 Score: ", format(f1_score(y_test, pred)))

Accuracy Score:  0.9885139985642498
Precision Score:  0.9720670391061452
Recall Score:  0.9405405405405406
F1 Score:  0.9560439560439562


In [24]:
txt = pd.Series("Pallav is a good boy...We're trying 2 contact u for 2nd time")

def preprocess(text):
    num = count_vector.transform(text)
    pred = naive_bayes.predict(num)
    return pred
preprocess(txt)

ValueError: X has 4 features, but MultinomialNB is expecting 7456 features as input.

In [None]:
doc = pd.Series("Thi")