Implement bagging method to train machine learning on SMS spam detection. 

Dataset: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset 

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

path = "E:\Milan\MDS\machine learning\Dataset\sms spam\spam.csv"

In [2]:
#using utf-8 encoding to decode the byte data, errors= "replace" ignores bytes that cant be decoded
with open(path,"r", encoding= "utf-8" , errors = "replace") as file: 
    data = file.read()

In [3]:
tokensy = word_tokenize(data)        #inserting each word in array
stop_words = set(stopwords.words())  #set of unique words in tokensy

In [4]:
#normalizing the string text in tokensy array
for i in range(len(tokensy)):
    a = tokensy[i]
    if a =="||ham||" or a =="||spam||":        #"||spam|| and ||ham|| are the categories so no need to normalize"
        pass
    else:
        tokensy[i] = a.lower()                 #changing word in array to lower case
        a = tokensy[i]
        tokensy[i]= re.sub(r'[^\w]+'," ",a)    #replacing any letter in words of array that is not alphanumeric, by a space character
        tokensy[i] = re.sub(r'^[ ]+|[ ]+$',"",tokensy[i])  #removing space in words at either start or end of word
        tokensy[i] = re.sub(r'(\d+)|([a-zA-Z]+)|([ ]+)', r"\1\2 ", tokensy[i])[:-1] #adding space betwwen alphabets and numbers if any
        tokensy[i] = re.sub(r'[ ]+',r" ",tokensy[i])  #removing multiple spaces in word with single space

#removing the words from tokensy that is empty or are stop words
tokensy = [item for item in tokensy if (item != "" and item not in stop_words)]

In [5]:
y_data = []
x_data = []
new_text = []
#getting array of categorical words and assigning the words of that category to another array
for i in range(len(tokensy)):
    if tokensy[i].lower() == "||ham||" or tokensy[i].lower() == "||spam||":
        y_data.append(tokensy[i])
        x_data.append(new_text)
        new_text= []
    else:
        new_text.append(tokensy[i])
x_data.append(new_text)
x_data = x_data[1:]                      #removing fist array of words since it is empty
x_data[-1] = x_data[-1][:-2]             #removing headers of csv file from end index of array

In [6]:
#joining only outer elements of array to form string containing array of individual words so we could feed to CountVectorizer class of python
change_dim = [" ".join(x_data[i]) for i in range(len(x_data))]

#Steps to create bag of words
vectorizer = CountVectorizer(lowercase=False ) #false since it is already in lower case

x = vectorizer.fit_transform(change_dim) #feeding string of individual words

y_data = np.array(y_data)
y_data[y_data == "||ham||"] = 1        #changing element of array that contains word "||ham||" to 1 as classifier
y_data[y_data =='||spam||'] = 2        #changing element of array that contains word "||spam||" to 2 as classifier 

dense_matrix = x.toarray()             #changing to array
table = pd.DataFrame(dense_matrix, columns = vectorizer.get_feature_names_out(), index = y_data)    #converting array to data frame

In [7]:
#converting data to data frame
x_data = pd.DataFrame(x_data)
y_data = pd.DataFrame(y_data)
y_data = y_data.applymap(lambda x: np.int64(x) if pd.notna(x) else x)  #converting all numbers that is in string data type to integer

In [8]:
class BaggingClassifier:
    def __init__(self, base_classifier, n_estimators):        #constructor for the python class
        self.base_classifier = base_classifier
        self.n_estimators = n_estimators
        self.classifiers = []
        self.predictions = []

    def fit(self, X, y):
        for i in range(self.n_estimators):
            #Bootstrap sampling with replacement having same length as variable X
            indices = np.random.choice(len(X), len(X), replace=True)  #array of numbers from 0 to len(X) with total number of elements len(X)
            X_sampled = X.iloc[indices]     #getting only data that match provided indices 
            y_sampled = y.iloc[indices]

            # Create a new base classifier and train it on the sampled data
            classifier = self.base_classifier.__class__()      #self.base_classifier is "decision tree"
            classifier.fit(X_sampled, y_sampled)               #fit sampled data in decision tree classifier

            # Store the trained classifier in the list of classifiers
            self.classifiers.append(classifier)                #creating array  of decission tree classifier with different inputs
            print(f"Successfully fitted {i+1} base classifier with random {len(indices)} data with replacement")
            
        return self.classifiers

    def predict(self, X):
        # Make predictions using all the base classifiers
        self.predictions = [classifier.predict(X) for classifier in self.classifiers]  #predicting with data using decission tree and creating array
        
        # using voting to determine the classifying class for each observation of dataset
        #bincount determines data with max frequency along given axis (i.e along column for this case)
        #agrmax finds the index having max value data
        majority_votes = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=self.predictions) 
        return majority_votes 

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(table, y_data, test_size=0.3, random_state=42)

# Create the base classifier
dc = DecisionTreeClassifier()

#assigning decission tree object as base classifier and no of different classifier
model = BaggingClassifier(base_classifier=dc, n_estimators=5)  
classifiers = model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy of test dat
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy of bagging classifier for test data:", accuracy)

#calculate the accuracy of training data
y_pred = model.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print("Accuracy of bagging classifier for train data:", accuracy)

Successfully fitted 1 base classifier with random 3900 data with replacement
Successfully fitted 2 base classifier with random 3900 data with replacement
Successfully fitted 3 base classifier with random 3900 data with replacement
Successfully fitted 4 base classifier with random 3900 data with replacement
Successfully fitted 5 base classifier with random 3900 data with replacement

Accuracy of bagging classifier for test data: 0.9706937799043063
Accuracy of bagging classifier for train data: 0.9946153846153846


In [10]:
for i, clf in enumerate(classifiers):
	y_pred = clf.predict(X_test)
	# Calculate accuracy
	accuracy = accuracy_score(y_test, y_pred)
	print("Accuracy of base classifier for test data "+str(i+1),'is:', accuracy)

Accuracy of base classifier for test data 1 is: 0.9593301435406698
Accuracy of base classifier for test data 2 is: 0.9671052631578947
Accuracy of base classifier for test data 3 is: 0.965311004784689
Accuracy of base classifier for test data 4 is: 0.9712918660287081
Accuracy of base classifier for test data 5 is: 0.9635167464114832


In [11]:
for i, clf in enumerate(classifiers):
	y_pred = clf.predict(X_train)
	# Calculate accuracy
	accuracy = accuracy_score(y_train, y_pred)
	print("Accuracy of base classifier for train data "+str(i+1),'is:', accuracy)

Accuracy of base classifier for train data 1 is: 0.9861538461538462
Accuracy of base classifier for train data 2 is: 0.9851282051282051
Accuracy of base classifier for train data 3 is: 0.9851282051282051
Accuracy of base classifier for train data 4 is: 0.9841025641025641
Accuracy of base classifier for train data 5 is: 0.9851282051282051
