In [75]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import train_test_split

In [20]:
df = pd.read_csv("sms_spam.csv")

**Proccessing the Data before training**

In [135]:
#create an array that is 0 if ham and 1 if spam
labels = (df["label"]=="spam").astype(int)

stopwords = ["the", "and", "is", "in", "to", "a", "of", "i", "you", "u", "me", "it", "for", "your", "my"]
#lower to lowercase a string, str to apply it to all strings in the column
X_texts = df["message"].str.lower()

#replace any characters that are not a-z or 0-9
X_texts = X_texts.replace(r'\W+', " ", regex=True)

#split words and remove any stopwords
X_texts  = [text.split() for text in X_texts]
X_texts = [[w for w in text if w not in stopwords] for text in X_texts]

#create a dictionary that counts the instances of each word in X_texts
vocab = Counter()
[vocab.update(text) for text in X_texts]

#create a dict of the most common words as keys and index as value
vocab = {word:idx for idx, (word,__) in enumerate(vocab.most_common(3000))}

#create an array of vectors to show how many times a word from vocab appears in each message
X_data = [[0]*len(vocab) for i in range(len(X_texts))] ##array of vectors of 0s as initialization
for i, text in enumerate(X_texts):
    for word in text: 
        if(word in vocab):
            X_data[i][vocab[word]]=+1
print(len(df[df["label"]=="spam"])/len(df))
print(len(df[df["label"]=="ham"])/len(df))

0.13406317300789664
0.8659368269921034


In [160]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

def init_weights(n_features):
    return np.zeros(n_features), 0.0

def predict(X, W, b, threshold=0.5):
    linear = np.dot(X, W) +b
    pred = sigmoid(linear)
    return(pred >= threshold).astype(int)

def train(X_train, Y_train, lr=0.01, epochs=1000):
    W,b = init_weights(len(vocab))
    
    for _ in range(epochs):
        Y_pred = sigmoid(np.dot(X_train,W) + b)
        
        dw = np.dot(X_train.T, (Y_pred - Y_train)) / len(X_train)
        db = np.sum(Y_pred-Y_train) / len(X_train)
        
        W -= lr * dw
        b -= lr * db
    return W, b
    

In [173]:
X_data = np.array(X_data)
Y_data = np.array(labels)

X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, stratify=Y_data, test_size = 0.2)

W, b = train(X_train, Y_train, lr=0.15, epochs=1000)


In [174]:
Y_test_predict = predict(X_test, W, b, threshold=0.5)
print(W, b)


[ 1.57671485 -0.46831001  0.0375414  ... -0.00337246 -0.00876347
 -0.00700808] -2.947637797746006


In [175]:
from sklearn.metrics import precision_score, recall_score, f1_score

print("Precision:", precision_score(Y_test, Y_test_predict))
print("Recall:", recall_score(Y_test, Y_test_predict))
print("F1 Score:", f1_score(Y_test, Y_test_predict))

Precision: 1.0
Recall: 0.8187919463087249
F1 Score: 0.900369003690037


In [155]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(Y_test, Y_test_predict))
print(classification_report(Y_test, Y_test_predict, digits=3))

[[966   0]
 [ 79  70]]
              precision    recall  f1-score   support

           0      0.924     1.000     0.961       966
           1      1.000     0.470     0.639       149

    accuracy                          0.929      1115
   macro avg      0.962     0.735     0.800      1115
weighted avg      0.935     0.929     0.918      1115



In [150]:
y_pred = predict(X_test, W, b)
print("Predictions:", np.unique(y_pred, return_counts=True))
print("Actual:", np.unique(Y_test, return_counts=True))


Predictions: (array([0]), array([1115], dtype=int64))
Actual: (array([0, 1]), array([966, 149], dtype=int64))


In [158]:
for t in [0.5, 0.4, 0.3, 0.2]:
    y_pred = predict(X_test, W, b, threshold=t)
    print(f"\nThreshold: {t}")
    print(classification_report(Y_test, y_pred, target_names=["ham", "spam"]))



Threshold: 0.5
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93       966
        spam       0.00      0.00      0.00       149

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115


Threshold: 0.4
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93       966
        spam       1.00      0.06      0.11       149

    accuracy                           0.87      1115
   macro avg       0.94      0.53      0.52      1115
weighted avg       0.89      0.87      0.82      1115


Threshold: 0.3
              precision    recall  f1-score   support

         ham       0.92      1.00      0.96       966
        spam       1.00      0.47      0.64       149

    accuracy                           0.93      1115
   macro avg       0.96      0.73      0.80      1115
weighted avg       0.93   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
