In [5]:
import kaggle

In [3]:
kaggle.api.authenticate()

In [4]:
kaggle.api.dataset_download_files('uciml/sms-spam-collection-dataset/', path='data', unzip=True)

In [59]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")
  
df = pd.read_csv('data/spam.csv', encoding="ISO-8859-1")
print('The shape of the dataset is:', df.shape)

The shape of the dataset is: (5572, 5)


In [60]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### Exploratory Data Analysis

In [61]:
updated_df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

In [62]:
updated_df.head()


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [63]:
model_feature = 'v2'
model_target = 'v1'

### Text processing

In [64]:
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [65]:
updated_df.isnull().sum()

v1    0
v2    0
dtype: int64

In [66]:
updated_df[model_feature] = updated_df[model_feature].astype('str')

In [67]:
from stop_words import get_stop_words

stop_words = get_stop_words('english')

In [68]:
print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 't

In [69]:
import re, string
import nltk
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')

def preProcessText(text):
    # fill in here

    #lowercase and strip white space
    text = text.lower().strip()

   
    return text

def lexiconProcess(text, stop_words, stemmer):
    # fill in here
    filtered_sentence = []
    words = text.split(" ")
    for w in words:
        if w not in stop_words:
            filtered_sentence.append(stemmer.stem(w))

    text = " ".join(filtered_sentence)
    return text

def cleanSentence(text, stop_words, stemmer):
    return lexiconProcess(preProcessText(text), stop_words, stemmer)

# Clean the text features
for c in [model_feature]:
    print('Text cleaning: ', c)
    updated_df[c] = [cleanSentence(item, stop_words, stemmer) for item in updated_df[c].values]

Text cleaning:  v2


### Train, Test data creation and Target balancing

In [70]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(updated_df, test_size=0.1, shuffle=True, random_state=42)

In [71]:
print(train_data.shape)
print(test_data.shape)

(5014, 2)
(558, 2)


In [72]:
updated_df[model_target].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [73]:
print('Training set shape:', train_data.shape)

print('Class ham samples in the training set:', sum(train_data[model_target] == 'ham'))
print('Class spam samples in the training set:', sum(train_data[model_target] == 'spam'))

print('')

print('Test set shape:', test_data.shape)

print('Class ham samples in the test set:', sum(test_data[model_target] == 'ham'))
print('Class spam samples in the test set:', sum(test_data[model_target] == 'spam'))


Training set shape: (5014, 2)
Class ham samples in the training set: 4334
Class spam samples in the training set: 680

Test set shape: (558, 2)
Class ham samples in the test set: 491
Class spam samples in the test set: 67


In [74]:
class_ham_no = train_data[train_data[model_target] == 'ham']
class_spam_no = train_data[train_data[model_target] == 'spam']

In [75]:
from sklearn.utils import resample
from sklearn.utils import shuffle

upsampled = resample(class_spam_no, replace=True, n_samples = 4334)
downsampled = resample(class_ham_no, replace=False, n_samples=4334)

train_data = pd.concat([downsampled, upsampled])
train_data = shuffle(train_data)

In [76]:
print('Training set shape:', train_data.shape)

print('Class ham samples in the training set:', sum(train_data[model_target] == 'ham'))
print('Class spam samples in the training set:', sum(train_data[model_target] == 'spam'))

print('')


Training set shape: (8668, 2)
Class ham samples in the training set: 4334
Class spam samples in the training set: 4334



### Pipeline

In [77]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

text_processor_0 = Pipeline([
    ('text_vect_0', TfidfVectorizer(use_idf=True, max_features=50)) ])

pipeline = Pipeline([
    ('data_preprocessing', text_processor_0),
    ('BGClf', BaggingClassifier())
])    

from sklearn import set_config
set_config(display='diagram')
pipeline

### Parameter Tuning and Model Evaluation

In [78]:
from sklearn.metrics import confusion_matrix

X_train = train_data[model_feature]
y_train = train_data[model_target]

# Fit the Pipeline to training data
pipeline.fit(X_train, y_train)

# Use the fitted pipeline to make predictions on the train dataset
train_predictions = pipeline.predict(X_train)
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))
print("Accuracy (training):", accuracy_score(y_train, train_predictions))

# Get test data to test the pipeline
X_test = test_data[model_feature]
y_test = test_data[model_target]

# Use the fitted pipeline to make predictions on the test dataset
test_predictions = pipeline.predict(X_test)
print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))
print("Accuracy (test):", accuracy_score(y_test, test_predictions))

[[4266   68]
 [ 217 4117]]
              precision    recall  f1-score   support

         ham       0.95      0.98      0.97      4334
        spam       0.98      0.95      0.97      4334

    accuracy                           0.97      8668
   macro avg       0.97      0.97      0.97      8668
weighted avg       0.97      0.97      0.97      8668

Accuracy (training): 0.9671204430087679
[[477  14]
 [ 13  54]]
              precision    recall  f1-score   support

         ham       0.97      0.97      0.97       491
        spam       0.79      0.81      0.80        67

    accuracy                           0.95       558
   macro avg       0.88      0.89      0.89       558
weighted avg       0.95      0.95      0.95       558

Accuracy (test): 0.9516129032258065


In [98]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

### PIPELINE GRID_SEARCH ###
############################

# Parameter grid for GridSearch
param_grid={ 'BGClf__estimator' :[SVC(),SGDClassifier(loss="hinge", penalty="l2", max_iter=20)],
             'BGClf__n_estimators': [5,10,15,20]
           }

grid_search = GridSearchCV(pipeline, # Base model
                           param_grid, # Parameters to try
                           cv = 3, # Apply 5-fold cross validation
                           verbose = 1, # Print summary
                           n_jobs = -1 # Use all available processors
                          )

# Fit the GridSearch to our training data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits




In [99]:
classifier = grid_search.best_estimator_
print(classifier)
train_predictions = classifier.predict(X_train)

print('Model performance on the train set:')
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))
print("Train accuracy:", accuracy_score(y_train, train_predictions)) 

Pipeline(steps=[('data_preprocessing',
                 Pipeline(steps=[('text_vect_0',
                                  TfidfVectorizer(max_features=50))])),
                ('BGClf', BaggingClassifier(estimator=SVC(), n_estimators=20))])
Model performance on the train set:
[[4275   59]
 [ 268 4066]]
              precision    recall  f1-score   support

         ham       0.94      0.99      0.96      4334
        spam       0.99      0.94      0.96      4334

    accuracy                           0.96      8668
   macro avg       0.96      0.96      0.96      8668
weighted avg       0.96      0.96      0.96      8668

Train accuracy: 0.96227503461006


In [100]:
test_predictions = classifier.predict(X_test)

print('Model performance on the test set:')
print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))
print("test accuracy:", accuracy_score(y_test, test_predictions)) 

Model performance on the test set:
[[478  13]
 [ 11  56]]
              precision    recall  f1-score   support

         ham       0.98      0.97      0.98       491
        spam       0.81      0.84      0.82        67

    accuracy                           0.96       558
   macro avg       0.89      0.90      0.90       558
weighted avg       0.96      0.96      0.96       558

test accuracy: 0.956989247311828
