In [None]:
# Task 2 - Implement Logistic Regression Model
import numpy as np

class LogisticRegression():
    def __init__(self, learning_rate=0.01, threshold=0.5, num_iterations=1000,verbose = False):
        self.learning_rate = learning_rate
        self.threshold = threshold
        self.num_iterations = num_iterations
        self.__n_features = None
        self.__N_samples = None
        self._weights = None
        self.verbose = verbose

    def __initialize(self, X, with_bias=False):
        if(with_bias):
            return X
        else:
            return np.c_[X, np.ones(X.shape[0])]
         
    def fit(self, X, y, with_bias=False) -> None:
        self.__N_samples, self.__n_features = X.shape
        self._weights = np.zeros(self.__n_features+1)
        X = self.__initialize(X, with_bias=with_bias) 
        # gradient decent - loop
        for _ in range(self.num_iterations):
            y_prob = self.__sigmoid(np.dot(X, self._weights))
            gradient = (1/self.__N_samples) * np.dot(X.T, (y_prob - y))
            self._weights = self._weights - self.learning_rate*gradient
            if self.verbose and (_ % 20 == 0 or _ == self.num_iterations-1):
                loss = self.log_loss(y_prob,y)
                print(f"Loss after iteration {_}: {loss}")
            
    def predict_proba(self, X, with_bias=False):
        if(with_bias == False):
            X = self.__initialize(X, with_bias)  # add constance to X if needed
        return self.__sigmoid(np.dot(X, self._weights))
    
    def predict(self, X, with_bias=False):
        y_prob = self.predict_proba(X, with_bias) # predict probabilities
        y_pred = np.where(y_prob < self.threshold, 0, 1) # return below 0.5 as 0 and upper as 1
        return y_pred
        
    def score(self, X, y,with_bias=False) -> float: # check accuracy score
        y_pred = self.predict(X,with_bias=with_bias) # predict labels
        correct = np.sum(y == y_pred) # checking right predictions
        return correct/len(y) # take the average of correct predictions
    
    def f1_score(self,y_true,y_pred):
        # create confusion table (true positive, false negative and false positive)
        tp = np.sum((y_true == 1) & (y_pred == 1)) # true positive
        fp = np.sum((y_true == 0) & (y_pred == 1)) # false positive
        fn = np.sum((y_true == 1) & (y_pred == 0)) # false negative
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)
        return recall,precision,f1
    
    def __sigmoid(self,x):
        return 1/(1+np.exp(-x))

    def log_loss(self, y_pred_prob, y):
        epsilon = 1e-5  # to avoid from log(zero) adding epsilon
        return (-y * np.log(y_pred_prob + epsilon) - (1 - y) * np.log(1 - y_pred_prob + epsilon)).mean()

#### Task 3 - Run Logistic Regression Model on Dataset Spam/Ham - Imports

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import string
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
%matplotlib inline

In [None]:
# need to run only once
nltk.download('punkt') 
nltk.download('stopwords')

#### Task 3 - Logistic Regression Model on Dataset Spam/Ham - Loading Data

In [None]:
# Task 3 - Run Logistic Regression Model on Dataset Spam/Ham - Loading Data
df = pd.read_csv('spam_ham_dataset.csv')
df.drop(['Unnamed: 0'],axis=1,inplace = True)
df.rename(columns={'label_num':'target'},inplace=True)
df['text'] = df['text'].replace(r'\n', ' ', regex=True) # remove \n

#### Task 3 - Logistic Regression Model on Dataset Spam/Ham - Preprocessing

In [None]:
# Add lens of text and show the information
# working on duplicate because after looking on this details it isn't looking helpful for prediction
df_dup = df.copy()
# this part take few seconds of run time
df_dup['num_characters'] = df_dup['text'].apply(len) # add more information about len of text
df_dup['num_words'] = df_dup['text'].apply(lambda text:len(nltk.word_tokenize(text)))
df_dup['num_sentences'] = df_dup['text'].apply(lambda text:len(nltk.sent_tokenize(text)))

##### Plot some data to get a bigger picture of the dataset

In [None]:
# Show the data informations
print("Ham:")
print(df_dup[df_dup['target'] == 0][['num_characters','num_words','num_sentences']].describe())
print("\nSpam:")
print(df_dup[df_dup['target'] == 1][['num_characters','num_words','num_sentences']].describe())
print("\nBoth:")
print(df_dup[['num_characters','num_words','num_sentences']].describe())

# plot the histogram of lens
df_dup.hist(column='num_characters',bins=200,figsize=(10,4),)
df_dup.hist(column='num_characters',by='label',bins=100,figsize=(10,4))
df_dup.hist(column='num_words',bins=200,figsize=(10,4))
df_dup.hist(column='num_words',by='label',bins=100,figsize=(10,4))
df_dup.hist(column='num_sentences',bins=200,figsize=(10,4))
df_dup.hist(column='num_sentences',by='label',bins=100,figsize=(10,4))

In [None]:
plt.figure(figsize=(12,8))
sns.histplot(df_dup[df_dup['target'] == 0]['num_characters'])
sns.histplot(df_dup[df_dup['target'] == 1]['num_characters'],color='red')

In [None]:
plt.figure(figsize=(12,8))
sns.histplot(df_dup[df_dup['target'] == 0]['num_words'])
sns.histplot(df_dup[df_dup['target'] == 1]['num_words'],color='red')

In [None]:
plt.figure(figsize=(12,8))
sns.histplot(df_dup[df_dup['target'] == 0]['num_sentences'])
sns.histplot(df_dup[df_dup['target'] == 1]['num_sentences'],color='red')

In [None]:
sns.pairplot(df_dup,hue='target')
df_withoutStrings = df_dup.drop(columns=['label','text'],inplace=False)

In [None]:
# Show correlation between lens and label
sns.heatmap(df_withoutStrings.corr(),annot=True)

##### Creating bag of words from dataset text

In [62]:
cv = CountVectorizer(lowercase = True , stop_words = 'english')
X = cv.fit_transform(df['text']).toarray() # create vector of words from emails
y = df['target'].values # take the labels
# split to train test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,random_state=0)

In [63]:
clf = LogisticRegression(learning_rate=0.1,num_iterations=200,verbose=True)
clf.fit(X_train,y_train)
y_test_pred = clf.predict(X_test)
print(f'Accuracy Score: {clf.score(X_test,y_test)}')
print(f'F1 Score: {clf.f1_score(y_test_pred,y_test)}')
print(f'Log Loss: {clf.log_loss(clf.predict_proba(X_test),y_test)}')

Loss after iteration 0: 0.6931271807599426
Loss after iteration 20: 0.3486191982037344
Loss after iteration 40: 0.2926177322922079
Loss after iteration 60: 0.2594346838695106
Loss after iteration 80: 0.23606798055787012
Loss after iteration 100: 0.2182829210082932
Loss after iteration 120: 0.20410770952466306
Loss after iteration 140: 0.19244844243847747
Loss after iteration 160: 0.18263236728304658
Loss after iteration 180: 0.17421647210277857
Loss after iteration 199: 0.16723813106048568
Accuracy Score: 0.9739130434782609
F1 Score: (0.9539473684210527, 0.9570957095709571, 0.955518945634267)
Log Loss: 0.1851059919866051


**Motivation to choose parameters**
- I shuffled (re-split and run) few times to see consistency of scores.
- The learning rate is 0.1 after trying different values and see that take to good result in few interations.
-  The num_iterations is 200 because less that 200 usually not enough for reach good result and more is not increase the score in significant value.

-  For decide which predictions are good, i used accuracy score and F1 score with confustion table.
    - I added verbose flag to the model for printing the log loss function of predictions to see the progress of learning.
    - I choosed parameters when i got Recall and Precision balanced with high score (around 95%+) and that i saw that when i changed paramaters one of scores go up but the second go down and F1 score didn't rise significantly.