Loading and Exploring the Data

In [1]:
import pandas as pd

df = pd.read_csv("spam.csv", encoding='latin-1')

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Cleaning the Data

In [2]:
df = df.iloc[:, :2]
df.columns = ['label', 'message']

In [3]:
# Convert labels to binary (spam = 1, ham = 0)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({'spam': 1, 'ham': 0})


In [4]:
df.isnull().sum()

Unnamed: 0,0
label,0
message,0


Train test Split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

In [7]:
#This is to keep a copy of X_train to use in pipeline, we'll create later on.
X_train_untrained = X_train
y_train_untrained = y_train

In [8]:
X_train.head()

Unnamed: 0,message
1978,No I'm in the same boat. Still here at my moms...
3989,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935,They r giving a second chance to rahul dengra.
4078,O i played smash bros &lt;#&gt; religiously.
4086,PRIVATE! Your 2003 Account Statement for 07973...


Some Text preprocessing

In [9]:
import re

def preprocess_string(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def preprocess_column(df_column):
    if not isinstance(df_column, pd.Series):
        df_column = pd.Series(df_column)

    return df_column.apply(preprocess_string)

In [10]:
from sklearn.preprocessing import FunctionTransformer

preprocessor = FunctionTransformer(preprocess_column)

In [11]:
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

Feature Extraction (TF-IDF)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000)

X_train = tfidf.fit_transform(X_train).toarray()
X_test = tfidf.fit_transform(X_test).toarray()
y_train = y_train.values
y_test = y_test.values


Model Selection and Training

Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

Support Vector Machine (SVM)

In [15]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

Evaluation

In [16]:
from sklearn.metrics import accuracy_score,f1_score

In [17]:
y_pred_nb = nb_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("F1 Score:", f1_score(y_test, y_pred_nb))


Accuracy: 0.8690582959641255
F1 Score: 0.5165562913907285


In [18]:
y_pred_lr = lr_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))

Accuracy: 0.8654708520179372
F1 Score: 0.0


In [19]:
y_pred_svm = svm_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))

Accuracy: 0.8663677130044843
F1 Score: 0.06289308176100629


Support Vector machine shows the best results so we'll use SVM model for our pipeline!

In [20]:
X_train_untrained.head()

Unnamed: 0,message
1978,No I'm in the same boat. Still here at my moms...
3989,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935,They r giving a second chance to rahul dengra.
4078,O i played smash bros &lt;#&gt; religiously.
4086,PRIVATE! Your 2003 Account Statement for 07973...


In [21]:
from sklearn.pipeline import Pipeline

In [22]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('vectorizer', TfidfVectorizer(max_features=3000)),
    ('classifier', SVC(kernel='linear', probability = True))
])

In [23]:
pipeline.fit(X_train_untrained, y_train_untrained)

Pickle model

In [24]:
import pickle
with open('spam_classifier.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)
print("Model saved to 'spam_classifier.pkl'")

Model saved to 'spam_classifier.pkl'


Dill model

In [25]:
!pip install dill
import dill
with open('spam_classifier_dill.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)
print("Model saved to 'spam_classifier_dill.pkl'")

Model saved to 'spam_classifier_dill.pkl'


Cloudpickle model

In [26]:
import cloudpickle
with open('spam_classifier_cloud.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)
print("Model saved to 'spam_classifier_cloud.pkl'")

Model saved to 'spam_classifier_cloud.pkl'
