In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import copy
from scipy.special import expit


In [3]:
# Load the SMS spam dataset: a tab-separated file read with explicit column
# names "label" and "message".
# NOTE(review): absolute, user-specific path - this cell only runs on a machine
# that has the file at exactly this location; consider a configurable data dir.
df = pd.read_csv("/Users/mayank/Downloads/SMSSpamCollection", sep='\t', names=['label', 'message'])
# Encode the label as a number: ham -> 0, spam -> 1. Any value not present in
# the mapping becomes NaN (Series.map semantics).
df['label'] = df.label.map({'ham': 0, 'spam': 1})

In [4]:
# Normalise the message text before vectorisation: lower-case it, then drop
# every character that is neither a word character nor whitespace (punctuation).
# The pattern is a raw string because '\w' and '\s' are not valid escape
# sequences in an ordinary string literal and trigger a warning on recent
# Python versions; the regex seen by pandas is unchanged.
df['message'] = (
    df['message']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [5]:
# Bag-of-words features: a CountVectorizer with default settings is fitted on
# the cleaned messages, and X receives the transformed result.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['message'])
# Target vector cast to plain integers (0 = ham, 1 = spam per the label mapping
# applied when the data was loaded).
y = df['label'].astype(int)

In [6]:
# Hyperparameters for the gradient-descent training run.
learning_rate = 0.01
num_epochs = 8000

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# m = number of training rows, n_x = number of feature columns.
m, n_x = X_train.shape
print(m)

# Convert the feature matrices to dense arrays and the label Series to
# (rows, 1) column vectors.
X_train, X_test = X_train.toarray(), X_test.toarray()
y_train, y_test = (labels.values.reshape(-1, 1) for labels in (y_train, y_test))

4457


In [7]:
def initialize_with_zeros(dim):
    """Create zero-valued starting parameters for a linear model.

    Parameters
    ----------
    dim : int
        Number of weights to create.

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is a ``(dim, 1)`` float array filled with
        zeros and ``b`` is the float ``0.0``.
    """
    return np.zeros(shape=(dim, 1)), 0.0

In [12]:
def propagate(w, b, X, Y):
    """Run one forward/backward pass of logistic regression.

    Parameters
    ----------
    w : array-like, shape (n_features, 1)
        Weight column vector.
    b : float
        Bias term.
    X : array-like, shape (n_samples, n_features)
        Feature matrix, one sample per row.
    Y : array-like containing n_samples binary labels
        Coerced to a ``(n_samples, 1)`` column so that ``A - Y`` is always an
        element-wise difference and never broadcasts to ``(m, m)``.

    Returns
    -------
    grads : dict
        ``{"dw": ..., "db": ...}`` - gradients of the cost with respect to
        ``w`` (same shape as ``w``) and ``b`` (scalar).
    cost : numpy.ndarray (0-d)
        Mean binary cross-entropy over the samples in ``X``.
    """
    X = np.asarray(X)
    w = np.asarray(w)
    Y = np.asarray(Y).reshape(-1, 1)

    # Take the sample count from the data actually passed in; a hard-coded
    # training-set size mis-scales the cost and gradients for any other input.
    m = X.shape[0]

    # Forward pass: sigmoid of the linear score.
    A = expit(np.dot(X, w) + b)

    # Keep probabilities strictly inside (0, 1) for the logarithms only, so a
    # saturated sigmoid yields a large finite cost instead of inf/nan.
    eps = 1e-15
    A_safe = np.clip(A, eps, 1 - eps)
    cost = -1 / m * (np.dot(Y.T, np.log(A_safe)) + np.dot((1 - Y).T, np.log(1 - A_safe)))
    cost = np.squeeze(np.array(cost))

    # Backward pass (uses the unclipped activations).
    error = A - Y
    dw = 1 / m * X.T.dot(error)
    db = np.sum(error) / m

    grads = {"dw": dw,
             "db": db}
    return grads, cost


In [9]:
def optimize(w, b, X, Y, epochs=500, learning_rate=0.01):
    """Fit logistic-regression parameters with batch gradient descent.

    Parameters
    ----------
    w : array-like
        Initial weights. The caller's array is left untouched; updates are
        applied to a float copy.
    b : float
        Initial bias.
    X, Y :
        Training features and labels, forwarded unchanged to ``propagate``.
    epochs : int, default 500
        Number of gradient-descent steps.
    learning_rate : float, default 0.01
        Step size applied to each gradient.

    Returns
    -------
    params : dict
        ``{"w": ..., "b": ...}`` - the parameters after the last step.
    grads : dict
        ``{"dw": ..., "db": ...}`` - gradients from the last step; both are
        ``None`` when ``epochs`` is 0 (no step was taken).
    costs : list
        Cost recorded at every 100th iteration (0, 100, 200, ...).
    """
    # Work on a private float copy: an in-place update would silently modify
    # the caller's array and would fail outright for integer-dtype weights.
    w = np.array(w, dtype=float)
    costs = []
    # Defined up front so the return statement is valid even with epochs == 0.
    dw, db = None, None

    for i in range(epochs):
        grads, cost = propagate(w, b, X, Y)
        dw = grads["dw"]
        db = grads["db"]

        w -= learning_rate * dw
        # Rebind rather than update in place so a NumPy scalar/0-d array passed
        # as the bias is not mutated either.
        b = b - learning_rate * db

        if i % 100 == 0:
            print("Cost for iteration {}: {}".format(i, cost))
            costs.append(cost)

    params = {"w": w,
              "b": b}
    grads = {"dw": dw,
             "db": db}
    return params, grads, costs

In [10]:
def predict(w, b, X):
    """Classify samples with a logistic-regression model.

    Parameters
    ----------
    w : array-like
        Weight vector.
    b : float
        Bias term.
    X : array-like
        Feature matrix, one sample per row.

    Returns
    -------
    numpy.ndarray
        Integer array holding 1 where the sigmoid of ``X . w + b`` is at
        least 0.5 and 0 elsewhere.
    """
    features = np.array(X)
    weights = np.array(w)
    probabilities = expit(features.dot(weights) + b)
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)

In [13]:
def model(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.01):
    """Train a logistic-regression classifier and report train/test accuracy.

    Parameters
    ----------
    X_train, X_test : array-like, shape (n_samples, n_features)
        Feature matrices for training and evaluation.
    Y_train, Y_test : array-like containing n_samples binary labels
        Coerced to ``(n_samples, 1)`` columns so they line up element-wise
        with the predictions.
    num_iterations : int, default 2000
        Number of gradient-descent steps passed to ``optimize``.
    learning_rate : float, default 0.01
        Step size passed to ``optimize``.

    Returns
    -------
    dict
        Keys: ``costs``, ``Y_prediction_test``, ``Y_prediction_train``,
        ``w``, ``b``, ``learning_rate``, ``num_iterations``.
    """
    Y_train = np.asarray(Y_train).reshape(-1, 1)
    Y_test = np.asarray(Y_test).reshape(-1, 1)

    # Size the weight vector from the data passed in, not from a notebook
    # global, so the function works for any dataset and in a fresh kernel.
    n_features = np.shape(X_train)[1]
    w, b = initialize_with_zeros(n_features)

    params, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate)
    w = params["w"]
    b = params["b"]

    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    # With 0/1 labels, the mean absolute difference is the error rate.
    print("Train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("Test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))

    d = {"costs": costs,
         "Y_prediction_test": Y_prediction_test,
         "Y_prediction_train": Y_prediction_train,
         "w": w,
         "b": b,
         "learning_rate": learning_rate,
         "num_iterations": num_iterations}
    return d

# Train and evaluate with the hyperparameters defined alongside the train/test
# split (num_epochs, learning_rate) instead of repeating their values as literals.
logistic_regression_model = model(
    X_train, y_train, X_test, y_test,
    num_iterations=num_epochs,
    learning_rate=learning_rate,
)

Cost for iteration 0: 0.693147180559947
Cost for iteration 100: 0.5415156926797872
Cost for iteration 200: 0.4702054040333221
Cost for iteration 300: 0.4281903253603253
Cost for iteration 400: 0.39913548479367866
Cost for iteration 500: 0.3767901209994725
Cost for iteration 600: 0.3584097444910779
Cost for iteration 700: 0.342654790019315
Cost for iteration 800: 0.32880830940779393
Cost for iteration 900: 0.3164502745442551
Cost for iteration 1000: 0.30531098799443146
Cost for iteration 1100: 0.29520153269237637
Cost for iteration 1200: 0.28597952012155464
Cost for iteration 1300: 0.2775316539307681
Cost for iteration 1400: 0.2697644800483757
Cost for iteration 1500: 0.2625991776519079
Cost for iteration 1600: 0.2559683782567649
Cost for iteration 1700: 0.2498140354496529
Cost for iteration 1800: 0.24408587117866357
Cost for iteration 1900: 0.23874016706282628
Cost for iteration 2000: 0.23373878450886565
Cost for iteration 2100: 0.22904835165596155
Cost for iteration 2200: 0.2246395806