# Building a Logistic Regression algorithm
* We'll implement the logistic regression algorithm using functions.
* We will only use 10_000 samples from the Ad Click-Through dataset.


## Step 1: Function that computes the prediction $\hat{y}(x)$

In [6]:
# Computing the initial prediction of y_hat.

import numpy as np
def sigmoid(input):
    """
    Computes the logistic sigmoid 1 / (1 + exp(-input)) element-wise.

    The exponential is only ever evaluated on non-positive values, so inputs
    with a very large magnitude do not overflow and no RuntimeWarning is raised.
    @param input: A scalar or a numpy array of real values.
    @return: The sigmoid of the input (values that range between 0 and 1),
    computed in float64 and with the same shape as the input.
    """
    z = np.asarray(input, dtype=float)
    # exp(-|z|) lies in (0, 1], so it can never overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- the same value.
    result = np.where(z >= 0, 1.0, decay) / (1.0 + decay)
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves arrays with one or more dimensions unchanged.
    return result[()]

def compute_prediction(X, weights):
    """
    Returns y_hat for the given features under the current weights.

    The linear score (dot product of X and weights) is passed through the
    sigmoid function.
    @param X: A numpy array of features.
    @param weights: The current weights of the Logistic Regression model.
    @return: y_hat, the result of applying the sigmoid to np.dot(X, weights).
    """
    return sigmoid(np.dot(X, weights))



## Step 2: Function to update the weights:
$w := w + \eta\frac{1}{m} \sum^{m}_{i=1}{(y^i-\hat{y}(x^i))x^i}$, where $\eta$ is the learning rate and $m$ is the number of training samples.

This function takes as input the training data (X_train and y_train), the current weights of the model, and the learning rate. It computes the predictions using the current weights, calculates the gradient, and updates the weights according to the gradient descent update rule. The updated weights are then returned.





In [7]:
def update_weights_gd(X_train, y_train, weights, learning_rate):
    """
    Performs one batch gradient descent step and returns the new weights.

    The array passed in as weights is not modified: a new array holding the
    updated values is returned, so the caller must use the return value.

    @param X_train: (numpy array): The feature matrix of shape (m, n), where m is the number of samples and n is the number of features.
    @param y_train: (numpy array): The target vector of shape (m,).
    @param weights:(numpy array): The weight vector of the linear model of shape (n,).
    @param learning_rate: The learning rate for gradient descent.
    @return: numpy array: The updated weight vector of shape (n,).
    """
    # Predictions under the current weights
    predictions = compute_prediction(X_train, weights)

    # Sum over all samples of (y - y_hat) * x, computed as X^T . (y - y_hat)
    weights_delta = np.dot(X_train.T, y_train - predictions)

    # Number of samples, used to average the summed gradient
    m = y_train.shape[0]

    # Build a new array instead of using "+=": the in-place form would silently
    # change the caller's array and fails when weights has an integer dtype.
    return weights + learning_rate / float(m) * weights_delta


## Step 3: The function calculating the cost J(w)
$J(w) = \frac{1}{m}\sum^{m}_{i = 1}{-[y^i \log{\hat{y}(x^i)} + (1 - y^i)
\log{(1-\hat{y}(x^i))}]}$

This function takes as input the feature matrix X, the target vector y, and the weight vector of the logistic regression model. It computes the predictions using the current weights and then calculates the cross-entropy cost (also known as the loss) based on the given formula.

In [8]:
def compute_cost(X, y, weights):
    """
    Compute the cross-entropy cost (loss) for a logistic regression model.

    Predictions are clipped away from exactly 0 and 1 before the logarithm is
    taken, so a saturated prediction yields a large finite cost instead of
    inf or nan.

    @param X: (numpy array): The feature matrix of shape (m, n), where m is
    the number of samples and n is the number of features.
    @param y: (numpy array): The target vector of shape (m,).
    @param weights: (numpy array): The weight vector of the logistic
    regression model of shape (n,).

    @return float: The cross-entropy cost.
    """

    # Compute the predictions using the current weights
    predictions = compute_prediction(X, weights)

    # log(0) is -inf and 0 * log(0) is nan; keep every prediction strictly
    # inside (0, 1) so both logarithms below stay finite.
    eps = np.finfo(float).eps
    predictions = np.clip(predictions, eps, 1 - eps)

    # Mean of -[y * log(y_hat) + (1 - y) * log(1 - y_hat)] over all samples
    cost = np.mean(-y * np.log(predictions) - (1 - y) * np.log(1 - predictions))

    return cost

## Step 4: Connecting the functions
* Updating the weights vector in each iteration
* Printing out the current cost every 100 iterations (this can be another value)
to ensure the cost is decreasing and that things are on the right track.

This function trains a logistic regression model using gradient descent. It takes as input the feature matrix X_train, the target vector y_train, the maximum number of iterations for gradient descent, the learning rate, and an optional parameter fit_intercept that indicates whether to fit the intercept term. If fit_intercept is True, an intercept term is added to the feature matrix. The weights are initialized to zeros, and gradient descent is performed for the specified number of iterations. The cost is checked and printed for every 100 iterations. The trained weight vector is returned.

In [9]:
def train_logistic_regression(X_train, y_train, max_iter, learning_rate,
                              fit_intercept=False, cost_interval=100):
    """
    Train a logistic regression model using gradient descent

    @param X_train: (numpy array): The feature matrix of shape (m, n), where m is the number of samples and n is the number of features.
    @param y_train: (numpy array): The target vector of shape (m,).
    @param max_iter: int: The maximum number of iterations for gradient descent.
    @param learning_rate: (float): The learning rate for gradient descent.
    @param fit_intercept:(bool, optional): Whether to fit the intercept term. Defaults to False.
    @param cost_interval: (int, optional): The cost is printed every cost_interval iterations, starting with the first one. Defaults to 100; pass 0 or None to print nothing.
    @return:     numpy array: The trained weight vector, of shape (n,), or (n + 1,) with the intercept weight first when fit_intercept is True.
    """

    # Add an intercept term (a leading column of ones) if fit_intercept is True
    if fit_intercept:
        intercept = np.ones((X_train.shape[0], 1))
        X_train = np.hstack((intercept, X_train))

    # Initialize the weights to zeros, one per column of the feature matrix
    weights = np.zeros(X_train.shape[1])

    # Perform gradient descent for the specified number of iterations
    for iteration in range(max_iter):
        weights = update_weights_gd(X_train, y_train, weights, learning_rate)
        # Report the cost periodically so the decrease can be monitored;
        # a falsy interval switches the report off (and avoids "% 0").
        if cost_interval and iteration % cost_interval == 0:
            print(f"Cost: {compute_cost(X_train, y_train, weights)}")

    # Return the trained weights
    return weights

## Final step: Predicting new inputs.
The predict function is used to predict class probabilities using a logistic regression model. The input is a feature matrix X of shape (n_samples, n_features) and a weight vector weights of shape (n_features + 1,). The function first checks if the input matrix X has one less column than the length of the weight vector, which means it's missing the intercept term. If this is the case, the function adds an intercept column of ones to the input matrix X. Finally, the function calls the compute_prediction function with the updated X and weights to calculate the predicted class probabilities and returns the result.

In [10]:
def predict(X, weights):
    """
    Return the predicted class probabilities of a logistic regression model.

    When X has exactly one column fewer than there are weights, a leading
    column of ones is added to X before the prediction is computed.

    @param X: numpy array of shape (n_samples, n_features), input features.
    @param weights: numpy array of shape (n_features + 1,), model weights (including the intercept term).
    @return: numpy array of shape (n_samples,), predicted class probabilities.
    """
    n_columns = X.shape[1]
    intercept_missing = (n_columns == weights.shape[0] - 1)

    if intercept_missing:
        # Prepend the all-ones intercept column so X lines up with weights.
        ones_column = np.ones((X.shape[0], 1))
        X = np.hstack((ones_column, X))

    return compute_prediction(X, weights)

## Predicting click-through

In [11]:
import pandas as pd
# Read only the first n_rows rows of the training file
n_rows = 300_000
df = pd.read_csv("./dataset/train.csv", nrows=n_rows)

# Features: every column except 'click' (the target) and the 'id', 'hour',
# 'device_id' and 'device_ip' columns. Target: the 'click' column.
X = df.drop(columns=['click', 'id', 'hour', 'device_id', 'device_ip']).values
y = df['click'].values

# The first 10,000 rows train the model; all remaining rows form the test set
n_train = 10000
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [12]:
# Performing one-hot encoding
from sklearn.preprocessing import OneHotEncoder
# Fit the encoder on the training rows only, then apply that same encoding
# to both the training and the testing rows.
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(X_train)
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

In [28]:
# Training the model over 10_000 iterations, learning rate of 0.01 and with bias
# Calculating the time it takes to train the model
import timeit

start_time = timeit.default_timer()
weights = train_logistic_regression(X_train_enc.toarray(), y_train,
                                    max_iter=10_000, learning_rate=0.01,
                                    fit_intercept=True)
# The ":.3f" format spec has to sit inside the braces; placed after them it is
# printed literally and the elapsed time is shown unrounded.
print(f"--- {timeit.default_timer() - start_time:.3f} seconds ---")

Cost: 0.6820019456743648
Cost: 0.4608619713011896
Cost: 0.4503715555130051
Cost: 0.4455503890097847
Cost: 0.4420611414384596
Cost: 0.4393702812833892
Cost: 0.437228041454526
Cost: 0.4354781787758496
Cost: 0.43401801289720104
Cost: 0.4327779028622343
Cost: 0.4317091585700226
Cost: 0.43077673019057455
Cost: 0.42995469288423555
Cost: 0.42922339559221634
Cost: 0.4285676184571522
Cost: 0.42797535312823465
Cost: 0.4274369752561037
Cost: 0.42694466897530536
Cost: 0.42649201676958726
Cost: 0.42607370031421204
Cost: 0.42568527750493995
Cost: 0.42532301300292674
Cost: 0.4249837472238756
Cost: 0.42466479353954484
Cost: 0.4243638565943513
Cost: 0.4240789667070855
Cost: 0.42380842671759145
Cost: 0.42355076859163654
Cost: 0.42330471776471257
Cost: 0.42306916368249065
Cost: 0.4228431353432208
Cost: 0.42262578090532044
Cost: 0.4224163506180466
Cost: 0.42221418248223747
Cost: 0.4220186901637542
Cost: 0.42182935277298567
Cost: 0.42164570619560027
Cost: 0.42146733571705797
Cost: 0.4212938697294381
Cost: 

In [36]:
# Evaluating the model
from sklearn.metrics import roc_auc_score

# Predicted click probabilities for the held-out rows. The encoded test matrix
# is converted to a dense array first; predict() adds the intercept column
# itself when the matrix has one column fewer than there are weights.
pred = predict(X_test_enc.toarray(), weights)
# Report the ROC AUC of the predicted probabilities against the true labels,
# rounded to three decimals.
print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(y_test, pred):.3f}')

Training samples: 10000, AUC on testing set: 0.703


## Implementing Stochastic Gradient Descent
* For each weight update, only one training sample is consumed, instead of
the complete training set.
* We just need to update the update_weights_gd() and the
train_logistic_regression() functions

The function iterates through each data point in the training set, computes the prediction for the current data point using the current model weights, and calculates the gradient (weights_delta) for the current data point. It then updates the model weights using the learning rate and the calculated gradient.

The updated model weights are returned at the end of the function.

In [13]:
def update_weights_sgd(X_train, y_train, weights, learning_rate):
    """
    Performs one pass of stochastic gradient descent over the training set.

    The weights are updated once per sample, in the order the samples appear.
    The array passed in as weights is not modified: the updates are applied to
    a private float copy, which is returned.

    @param X_train: numpy array of shape (n_samples, n_features), input features.
    @param y_train: numpy array of shape (n_samples,), target class labels.
    @param weights: numpy array of shape (n_features,), current model weights (one per column of X_train).
    @param learning_rate: float, the learning rate for gradient descent.
    @return: numpy array of shape (n_features,), updated model weights.
    """
    # Work on a copy so the caller's array is left untouched; the float dtype
    # also keeps the in-place updates below valid for integer-typed input.
    weights = np.array(weights, dtype=float)

    # Iterate through each data point in the training set (X_train, y_train)
    for X_each, y_each in zip(X_train, y_train):
        # Prediction for this single sample under the current weights
        prediction = compute_prediction(X_each, weights)

        # Per-sample gradient step: learning_rate * (y - y_hat) * x
        weights += learning_rate * (y_each - prediction) * X_each

    # Return the updated model weights
    return weights

In [14]:
def train_logistic_regression_sgd(X_train, y_train, max_iter, learning_rate,
                                  fit_intercept=False, cost_interval=2):
    """
    Train a logistic regression model using Stochastic Gradient Descent (SGD).

    @param X_train: numpy array of shape (n_samples, n_features), input features.
    @param y_train: numpy array of shape (n_samples,), target class labels.
    @param max_iter: int, the number of passes over the training set; every pass updates the weights once per sample.
    @param learning_rate: float, the learning rate for gradient descent.
    @param fit_intercept: bool, whether to add an intercept term to the model.
    @param cost_interval: int, the cost over the whole training set is printed every cost_interval passes, starting with the first one. Defaults to 2; pass 0 or None to print nothing.
    @return: numpy array of trained model weights, of shape (n_features,), or (n_features + 1,) with the intercept weight first when fit_intercept is True.

    """
    # If fit_intercept is True, add an intercept term (a leading column of ones)
    if fit_intercept:
        intercept = np.ones((X_train.shape[0], 1))
        X_train = np.hstack((intercept, X_train))

    # Initialize the weights with zeros, one per column of the feature matrix
    weights = np.zeros(X_train.shape[1])
    for iteration in range(max_iter):
        weights = update_weights_sgd(X_train, y_train, weights, learning_rate)
        # Report the cost periodically so the decrease can be monitored;
        # a falsy interval switches the report off (and avoids "% 0").
        if cost_interval and iteration % cost_interval == 0:
            print(f"Cost: {compute_cost(X_train, y_train, weights)}")

    # Return the trained model weights
    return weights

## Using SGD and scaling the model
* We will increase the amount of samples we'll train the model with since SGD
 can take it.

In [18]:
import pandas as pd
# Load the first n_rows rows of the training file
n_rows = 300_000
df = pd.read_csv("./dataset/train.csv", nrows=n_rows)

# Target: the 'click' column. Features: all columns except 'click', 'id',
# 'hour', 'device_id' and 'device_ip'.
y = df['click'].values
X = df.drop(columns=['click', 'id', 'hour', 'device_id', 'device_ip']).values

# The first 100,000 rows train the model; all remaining rows form the test set
n_train = 100_000
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
# Performing one-hot encoding
from sklearn.preprocessing import OneHotEncoder
# Fit the encoder on the training rows only, then encode both splits with it
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(X_train)
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

In [19]:
# Training the model for 50 iterations (each one a full pass over the 100_000
# training samples), learning rate of 0.01 and with bias
# Calculating the time it takes to train the model
import timeit
start_time = timeit.default_timer()
weights = train_logistic_regression_sgd(X_train_enc.toarray(), y_train,
                                        max_iter=50, learning_rate=0.01,
                                        fit_intercept=True)
# The ":.3f" format spec has to sit inside the braces; placed after them it is
# printed literally and the elapsed time is shown unrounded.
print(f"--- {timeit.default_timer() - start_time:.3f} seconds ---")

# Predicting values and getting ROC AUC Score
from sklearn.metrics import roc_auc_score

# Predicted click probabilities for the held-out rows, computed from the dense
# form of the encoded test matrix and the weights trained above.
pred = predict(X_test_enc.toarray(), weights)
# Report the ROC AUC of the predicted probabilities against the true labels,
# rounded to three decimals.
print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(y_test, pred):.3f}')

Cost: 0.4127864859625796
Cost: 0.4078504597223988
Cost: 0.40545733114863264
Cost: 0.403811787845451
Cost: 0.4025431351250833
Cost: 0.4015053950669261
Cost: 0.40062464023567285
Cost: 0.39985799447134973
Cost: 0.3991783043895136
Cost: 0.398567258491007
Cost: 0.39801190940990816
Cost: 0.3975027566890244
Cost: 0.39703261643081
Cost: 0.3965959186801836
Cost: 0.39618825187922974
Cost: 0.39580605722362766
Cost: 0.39544641765210864
Cost: 0.3951069085504533
Cost: 0.3947854898106667
Cost: 0.39448042625798385
Cost: 0.3941902279442516
Cost: 0.39391360461804337
Cost: 0.39364943048841494
Cost: 0.39339671658454667
Cost: 0.3931545888057168
--- 103.14262400000007.3fs seconds ---
Training samples: 100000, AUC on testing set: 0.730
