In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("sms_spam.csv")

**Proccessing the Data before training**

In [3]:
#create an array that is 0 if ham and 1 if spam
labels = (df["label"]=="spam").astype(int)

stopwords = ["the", "and", "is", "in", "to", "a", "of", "i", "you", "u", "me", "it", "for", "your", "my"]
#lower to lowercase a string, str to apply it to all strings in the column
X_texts = df["message"].str.lower()

#replace any characters that are not a-z or 0-9
X_texts = X_texts.replace(r'\W+', " ", regex=True)

#split words and remove any stopwords
X_texts  = [text.split() for text in X_texts]
X_texts = [[w for w in text if w not in stopwords] for text in X_texts]

#create a dictionary that counts the instances of each word in X_texts
vocab = Counter()
[vocab.update(text) for text in X_texts]

#create a dict of the most common words as keys and index as value
vocab = {word:idx for idx, (word,__) in enumerate(vocab.most_common(3000))}

#create an array of vectors to show how many times a word from vocab appears in each message
X_data = [[0]*len(vocab) for i in range(len(X_texts))] ##array of vectors of 0s as initialization
for i, text in enumerate(X_texts):
    for word in text: 
        if(word in vocab):
            X_data[i][vocab[word]]=+1
print(len(df[df["label"]=="spam"])/len(df))
print(len(df[df["label"]=="ham"])/len(df))

0.13406317300789664
0.8659368269921034


In [4]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

def init_weights(n_features):
    return np.zeros(n_features), 0.0

def predict(X, W, b, threshold=0.5):
    linear = np.dot(X, W) +b
    pred = sigmoid(linear)
    return(pred >= threshold).astype(int)

def train(X_train, Y_train, lr=0.01, epochs=1000):
    W,b = init_weights(len(vocab))
    
    for _ in range(epochs):
        Y_pred = sigmoid(np.dot(X_train,W) + b)
        
        dw = np.dot(X_train.T, (Y_pred - Y_train)) / len(X_train)
        db = np.sum(Y_pred-Y_train) / len(X_train)
        
        W -= lr * dw
        b -= lr * db
    return W, b
    

In [5]:
X_data = np.array(X_data)  #Is an array of vectors, the array index corresponds to a message, and each vector index corresponds to stopword, each value in the vector is the number of times the word appeared in the message
Y_data = np.array(labels)  #Spam or ham labels corresponding to the messages

X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, stratify=Y_data, test_size = 0.2)

W, b = train(X_train, Y_train, lr=0.15, epochs=1000)


In [6]:
Y_test_predict = predict(X_test, W, b, threshold=0.5)
print(W, b)


[ 1.60806653 -0.47810018  0.01217579 ... -0.00534177 -0.00931403
 -0.00667397] -2.965970562977659


In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score

print("Precision:", precision_score(Y_test, Y_test_predict))
print("Recall:", recall_score(Y_test, Y_test_predict))
print("F1 Score:", f1_score(Y_test, Y_test_predict))

Precision: 0.9918032786885246
Recall: 0.8120805369127517
F1 Score: 0.8929889298892989


In [8]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(Y_test, Y_test_predict))
print(classification_report(Y_test, Y_test_predict, digits=3))

[[965   1]
 [ 28 121]]
              precision    recall  f1-score   support

           0      0.972     0.999     0.985       966
           1      0.992     0.812     0.893       149

    accuracy                          0.974      1115
   macro avg      0.982     0.906     0.939      1115
weighted avg      0.974     0.974     0.973      1115

