In [80]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [81]:
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [82]:
df = df[['v2', 'v1']]
df.rename(columns = {'v2': 'SMS', 'v1': 'label'}, inplace = True)
df.head()

Unnamed: 0,SMS,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [83]:
stopWords = set(stopwords.words('english'))

def cleanTxt(text):
    text = text.lower()
    text = re.sub(r'^0-9a-zA-Z', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = " ".join(word for word in text.split() if word not in stopWords)
    return text

In [84]:
df["Clean Text"] = df['SMS'].apply(cleanTxt)
df.head()

Unnamed: 0,SMS,label,Clean Text
0,"Go until jurong point, crazy.. Available only ...",ham,"go jurong point, crazy.. available bugis n gre..."
1,Ok lar... Joking wif u oni...,ham,ok lar... joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,ham,u dun say early hor... u c already say...
4,"Nah I don't think he goes to usf, he lives aro...",ham,"nah think goes usf, lives around though"


In [85]:
X = df["Clean Text"]
y = df['label']

In [86]:
def classify(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y)
    pipeline_model = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', model)])
    pipeline_model.fit(X_train, y_train)
    
    print("Accuracy = ", pipeline_model.score(X_test, y_test)*100)
    
    y_pred = pipeline_model.predict(X_test)
    print(classification_report(y_test, y_pred))

In [87]:
model = LogisticRegression()
classify(model, X, y)

Accuracy =  96.8413496051687
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1206
        spam       1.00      0.76      0.87       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393



In [88]:
model = MultinomialNB()
classify(model, X, y)

Accuracy =  96.55419956927494
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1206
        spam       1.00      0.74      0.85       187

    accuracy                           0.97      1393
   macro avg       0.98      0.87      0.92      1393
weighted avg       0.97      0.97      0.96      1393

