In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
import string
from nltk.corpus import stopwords
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from wordcloud import WordCloud

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\009kr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv("spam.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [5]:
df.v1.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [6]:
label = {'ham':0,'spam':1}
df = df.replace({'v1':label})

In [7]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [8]:
stop_words = set(stopwords.words('english'))
def clean_text(text):
    if isinstance(text,str):
        text = text.lower()
        text = text.translate(str.maketrans('','',string.punctuation))
        words = text.split()
        words = [word for word in words if word not in stop_words]
        clean = ' '.join(words)
        return clean 
    else:
        return ""

In [9]:
df.v2 = df.v2.apply(clean_text)

In [10]:
X = df['v2']
y = df['v1']

In [11]:
train_text, test_text, ytrain, ytest = tts(X,y,test_size=0.2)

In [12]:
tf = TfidfVectorizer(stop_words='english')

In [13]:
Xtrain = tf.fit_transform(train_text)
Xtest = tf.transform(test_text)

In [14]:
nb = MultinomialNB()
nb.fit(Xtrain,ytrain)
prednb_train = nb.predict(Xtrain)
prednb_test = nb.predict(Xtest)

print("Train Accuracy(F1): ", f1_score(ytrain,prednb_train))
print("Test Accuracy(F1): ", f1_score(ytest,prednb_test))

Train Accuracy(F1):  0.9198542805100183
Test Accuracy(F1):  0.8676470588235293


In [15]:
lr = LogisticRegression()
lr.fit(Xtrain,ytrain)
predlr_train = lr.predict(Xtrain)
predlr_test = lr.predict(Xtest)

print("Train Accuracy(F1): ", f1_score(ytrain,predlr_train))
print("Test Accuracy(F1): ", f1_score(ytest,predlr_test))

Train Accuracy(F1):  0.837573385518591
Test Accuracy(F1):  0.8181818181818182


In [16]:
cv = CountVectorizer()
X_train = cv.fit_transform(train_text)
X_test = cv.transform(test_text)

In [17]:
nb2 = MultinomialNB()
nb2.fit(X_train,ytrain)
prednb2_train = nb2.predict(X_train)
prednb2_test = nb2.predict(X_test)

print("Train Accuracy(F1): ", f1_score(ytrain,prednb2_train))
print("Test Accuracy(F1): ", f1_score(ytest,prednb2_test))

Train Accuracy(F1):  0.9754860524091293
Test Accuracy(F1):  0.9473684210526316


In [18]:
lr2 = LogisticRegression()
lr2.fit(X_train,ytrain)
predlr2_train = lr2.predict(X_train)
predlr2_test = lr2.predict(X_test)

print("Train Accuracy(F1): ", f1_score(ytrain,predlr2_train))
print("Test Accuracy(F1): ", f1_score(ytest,predlr2_test))

Train Accuracy(F1):  0.9785038693035253
Test Accuracy(F1):  0.9310344827586208
