In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords

In [2]:
# Read the raw dataset. The encoding is passed explicitly as latin-1
# (presumably the file does not decode as UTF-8 — TODO confirm).
df = pd.read_csv(filepath_or_buffer="spam.csv", encoding="latin-1")
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Keep only the label/text columns and give them descriptive names.
# A new frame is built (no inplace=True) and missing columns are ignored so
# the cell is safe to re-run: the in-place drop raised KeyError on a second
# execution because the 'Unnamed: N' columns had already been removed.
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
      .rename(columns={'v1': 'label', 'v2': 'messages'})
)
df.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Count missing entries per column (isnull is the pandas alias of isna)
df.isnull().sum()

label       0
messages    0
dtype: int64

In [5]:
# English stopwords used by clean_text. The corpus is downloaded separately
# from the nltk package, so fetch it on demand: without this a fresh
# environment fails here with LookupError.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))

def clean_text(text, stop_words=None):
    """Normalise a message for bag-of-words modelling.

    The text is lower-cased, every character other than an ASCII letter or
    digit is replaced by a space, and the remaining tokens that are not
    stopwords are joined back together with single spaces.

    Parameters
    ----------
    text : str or None
        Raw message. ``None`` and float NaN (how pandas represents an empty
        cell) are treated as an empty message.
    stop_words : collection of str, optional
        Lower-case tokens to remove. Defaults to the module-level
        ``STOPWORDS`` set when omitted.

    Returns
    -------
    str
        The cleaned, space-separated text (possibly empty).
    """
    if stop_words is None:
        stop_words = STOPWORDS
    # Missing cells would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # After lower-casing only a-z can remain from the ASCII letters.
    alnum_only = re.sub(r'[^0-9a-z]', ' ', text.lower())
    # split() with no argument already collapses runs of whitespace, so no
    # separate "remove extra spaces" pass is needed.
    return " ".join(token for token in alnum_only.split() if token not in stop_words)

In [6]:
# Clean the message
df['clean_text'] = df['messages'].apply(clean_text)
df.head()

Unnamed: 0,label,messages,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


In [7]:
# Model inputs are the cleaned messages; targets are the label column
x, y = df['clean_text'], df['label']

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

def classify(model, x, y, test_size=0.20, random_state=42, cv_folds=None):
    """Train and evaluate a bag-of-words text classifier.

    The data is split into stratified train/test subsets, a
    CountVectorizer -> TfidfTransformer -> ``model`` pipeline is fitted on
    the training part, and the test accuracy (in percent) followed by a
    per-class classification report is printed.

    Parameters
    ----------
    model : estimator
        scikit-learn style classifier used as the final pipeline step.
    x : sequence of str
        Documents to classify.
    y : sequence
        Class label for each document; also used to stratify the split.
    test_size : float, default 0.20
        Fraction of the samples held out for testing.
    random_state : int, default 42
        Seed for the shuffled train/test split.
    cv_folds : int or None, default None
        When given, the mean cross-validated score (in percent) over this
        many folds is printed as well. Skipped when None.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # train test split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=y,
    )
    # model training
    pipeline_model = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer()),
                               ('clf', model)])
    pipeline_model.fit(x_train, y_train)

    # Predict once and reuse the result for both the accuracy and the report
    # (calling score() and then predict() ran inference on the test set twice).
    y_pred = pipeline_model.predict(x_test)
    print('Accuracy', accuracy_score(y_test, y_pred)*100)

    if cv_folds is not None:
        # Cross-validate the whole pipeline rather than the bare model: the
        # model cannot consume raw strings, and vectorising inside each fold
        # keeps held-out text out of the vocabulary/IDF statistics.
        cv_score = cross_val_score(pipeline_model, x, y, cv=cv_folds)
        print("cv score:", cv_score.mean()*100)

    print(classification_report(y_test, y_pred))


In [9]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the estimator's default settings
model = LogisticRegression()
classify(model=model, x=x, y=y)

Accuracy 96.95067264573991
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       0.99      0.78      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [10]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with the estimator's default settings
model = MultinomialNB()
classify(model=model, x=x, y=y)

Accuracy 96.7713004484305
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [11]:
from sklearn.svm import SVC
# Support vector classifier with regularisation parameter C set to 3
model = SVC(C=3)
classify(model=model, x=x, y=y)

Accuracy 97.9372197309417
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

