In [1]:
import pandas as pd

import warnings
warnings.simplefilter("ignore")

In [2]:
# flat file
df = pd.read_csv("SMSSpamCollection",sep="\t",names=["label","message"])
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
df["label"].unique()

array(['ham', 'spam'], dtype=object)

In [5]:
df["label"].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
df.isnull().sum()

label      0
message    0
dtype: int64

## Text Cleaning

In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [8]:
corpus = []
for i in range(len(df)):
    s = re.sub("[^a-zA-Z]"," ",df["message"][i])
    s = s.lower()
    s = s.split()
    s = [ps.stem(word) for word in s if not word in set(stopwords.words("english"))]
    s = " ".join(s)
    corpus.append(s)

## Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
y = pd.get_dummies(df["label"],drop_first=True)
y

Unnamed: 0,spam
0,False
1,False
2,True
3,False
4,False
...,...
5567,True
5568,False
5569,False
5570,False


#### Train-Test Split

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

## Modeling
#### Navie Bayes Classifier with default parameters

In [12]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train,y_train)

## Predictions

In [13]:
ypred_test = model.predict(X_test)
ypred_train = model.predict(X_train)

## Evaluation 

In [14]:
from sklearn.metrics import accuracy_score
print("Train Accuracy:",accuracy_score(y_train,ypred_train))
print("Test Accuracy:",accuracy_score(y_test,ypred_test))

Train Accuracy: 0.9921471842046219
Test Accuracy: 0.979372197309417
