<a href="https://colab.research.google.com/github/Riddick4-droid/Machine_Learning-Pt/blob/main/Ad_Clicks_Logistic_Regression_Scikit_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LOGISTIC REGRESSION FOR CLICK-THROUGH AD PREDICTIONS

To understand the concept of logistic regression please see my notebook
on building `logistic regression from scratch`.

In [None]:
##make basic imports
import pandas as pd
import numpy as np
from timeit import default_timer as timer
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

In [None]:
#i will use some functions from the scratch notebook

#calculate the outputs as sigmoid values between 0 and 1
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``, element-wise.

    Parameters
    ----------
    x : scalar or array-like of numbers
        Input value(s). Non-floating input is converted to float.

    Returns
    -------
    numpy scalar or numpy.ndarray
        Values in the closed interval [0, 1], with the same shape as ``x``
        (a NumPy scalar when ``x`` is a scalar).

    Notes
    -----
    The naive form ``1 / (1 + exp(-x))`` overflows in ``exp`` for large
    negative ``x``. Here only ``exp(-|x|)`` is evaluated, which always lies in
    (0, 1], and the algebraically equivalent form ``exp(x) / (1 + exp(x))``
    is used for negative inputs.
    """
    z = np.asarray(x)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(float)

    #exp(-|z|) can underflow to 0 but can never overflow
    decay = np.exp(-np.abs(z))

    #z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    #np.where always returns an array; hand scalars back as scalars
    return result[()] if result.ndim == 0 else result

#model output for a batch of inputs-used in the train function
def compute_predictions(x,weights):
    """Return the model's predicted probabilities ``y_hat`` for ``x``.

    The linear score ``np.dot(x, weights)`` is passed through ``sigmoid`` so
    that every output lies between 0 and 1, as logistic regression requires.

    x is the feature array and weights the coefficient vector; their shapes
    must be compatible for ``np.dot``.
    """
    return sigmoid(np.dot(x,weights))

#one gradient step on the weights-used in training
def update_weights(x,y,weights,lr):
    """Apply one full-batch gradient step to ``weights`` and return them.

    The step direction is ``np.dot(x.T, y - y_hat)``, where ``y_hat`` are the
    current predictions from ``compute_predictions``. It is scaled by ``lr``
    divided by the number of samples (``y.shape[0]``) and added to ``weights``.

    Note that ``weights`` is updated in place (``+=``) and the same object is
    returned.
    """
    #difference between the true labels and the current predictions
    residuals = y - compute_predictions(weights=weights,x=x)

    #learning rate averaged over the sample size
    n_samples = y.shape[0]
    step_scale = lr / float(n_samples)

    #move the weights along the averaged gradient direction
    weights += step_scale * np.dot(x.T,residuals)
    return weights

#for computing the cost function
def compute_cost(x,y,weights):
    """Return the mean log loss (binary cross-entropy) of the model on ``(x, y)``.

    The cost measures how wrong the model is: for every sample it is
    ``-y*log(y_hat) - (1-y)*log(1-y_hat)``, averaged over all samples, where
    ``y_hat`` comes from ``compute_predictions(x, weights)``.

    Parameters
    ----------
    x : feature array accepted by ``compute_predictions``
    y : array of target labels (the formula assumes values 0 and 1)
    weights : coefficient vector

    Returns
    -------
    float
        The average log loss. It is always finite for finite inputs.
    """
    predictions = np.asarray(compute_predictions(x,weights),dtype=float)

    #a saturated sigmoid returns exactly 0.0 or 1.0; log(0) would then turn
    #the cost into inf or nan, so keep the probabilities strictly inside (0, 1)
    eps = np.finfo(float).eps
    predictions = np.clip(predictions,eps,1-eps)

    #cost
    cost = np.mean(-y * np.log(predictions) - (1-y)*np.log(1-predictions))

    #return
    return cost

#fit the weights with batch gradient descent
#the whole run is timed with the timer module
def train_logistic_regression(x,y,num_iter,lr,fit_intercept=False):
    """Train logistic regression weights by repeated calls to ``update_weights``.

    Parameters
    ----------
    x : 2-D feature array (``x.shape[0]`` rows, ``x.shape[1]`` columns)
    y : target labels, one per row of ``x``
    num_iter : number of gradient steps to run
    lr : learning rate passed to ``update_weights``
    fit_intercept : when True, a column of ones is prepended to ``x`` so the
        first weight acts as the intercept

    Returns
    -------
    The weight vector after ``num_iter`` updates (starting from all zeros).

    The cost is printed every 50 iterations, and the elapsed time is printed
    at the end.
    """
    start = timer()

    #optionally prepend the bias column
    if fit_intercept:
        x = np.hstack((np.ones((x.shape[0],1)),x))

    #start from an all-zero weight vector, one entry per column
    weights = np.zeros(x.shape[1])

    for iteration in range(num_iter):
        weights = update_weights(x,y,weights,lr)

        #only report the cost on every 50th iteration
        if iteration % 50 != 0:
            continue
        print(f'iter:{iteration}---{round(compute_cost(x,y,weights),3)}')

    end = timer()
    print(f'trained for {(end-start):.5f}s for {num_iter} iterations')
    return weights

#inference with trained weights
def predict(x,weights):
    """Return predicted probabilities for ``x`` using trained ``weights``.

    If ``weights`` has exactly one more entry than ``x`` has columns, the model
    is taken to have been trained with an intercept, and a column of ones is
    prepended to ``x`` so the shapes match. The result is
    ``compute_predictions(x, weights)``.
    """
    n_rows, n_cols = x.shape[0], x.shape[1]

    #one extra weight means an intercept term is expected in front
    if n_cols + 1 == weights.shape[0]:
        bias_column = np.ones((n_rows,1))
        x = np.hstack((bias_column,x))

    return compute_predictions(x,weights)

In [None]:
##get the data
#download the ad-click dataset through kagglehub; the returned value is kept in
#`path` and used by the following cells as the dataset directory
import kagglehub
path = kagglehub.dataset_download("marius2303/ad-click-prediction-dataset")

#print the location returned by kagglehub
print(path)

In [None]:
import os

# List the contents of the downloaded directory
# (used to find the name of the CSV file that is loaded next)
print(os.listdir(path))

In [None]:
import pandas as pd
import os

# Load 'ad_click_dataset.csv' from the downloaded dataset directory
df = pd.read_csv(os.path.join(path, 'ad_click_dataset.csv'))

# Display the first 20 rows of the DataFrame with .head(20)
display(df.head(20))

In [None]:
#number of distinct values in the 'id' column
df['id'].nunique()

In [None]:
#check shape: df.shape[0] is the row count, df.shape[1] the column count
#(the column count still includes 'id' and the 'click' target at this point)
print(f'there are {df.shape[0]} instances and {df.shape[1]} features')

In [None]:
#lets preprocess the data
#split the data into target and features
#features: every column except 'click', 'id', 'full_name' and 'device_type', as a NumPy array
x = df.drop(['click','id','full_name','device_type'],axis=1).values

#target: the 'click' column as a NumPy array
y = df['click'].values

In [None]:
##train test split
from sklearn.model_selection import train_test_split

#hold out 10% of the rows for testing; random_state=42 makes the split repeatable
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=42)

In [None]:
#check shape of the training features and training labels
x_train.shape,y_train.shape

In [None]:
#encode the features
#handle_unknown='ignore': categories not seen during fit do not raise an error at transform time
enc = OneHotEncoder(handle_unknown='ignore')

#learn the categories from the training split only, then encode it
x_train_enc = enc.fit_transform(x_train)

#encode the test split with the categories learned from the training split
x_test_enc = enc.transform(x_test)

In [None]:
##expect more columns than in x_train, because one-hot encoding creates a dummy column per category
x_train_enc.shape

In [None]:
##get weights
#train the from-scratch model for 1000 iterations with learning rate 0.01 and an intercept term
#.toarray() turns the encoder output into a dense array before it is passed to the NumPy-based functions
weights = train_logistic_regression(x_train_enc.toarray(),y_train,num_iter=1000,lr=0.01,fit_intercept=True)

In [None]:
##make predictions
#predict() returns sigmoid outputs and prepends the intercept column itself
#when weights has one more entry than the input has columns
predictions = predict(x_test_enc.toarray(),weights)

In [None]:
#shape of the array of predictions for the test split
predictions.shape

In [None]:
##use the more robust roc_auc_score
from sklearn.metrics import roc_auc_score,roc_curve

#AUC is computed from the predicted scores directly, so no decision threshold is needed
print(f'Training samples: {x_train.shape[0]}, AUC on testing set: {roc_auc_score(y_test,predictions):.3f}')

In [None]:
#now i try a better way of implementing logistic regression that has the ability to apply SGD
from sklearn.linear_model import SGDClassifier

#initialize
#loss='log_loss' gives logistic regression; penalty=None means no regularization
#learning_rate='constant' with eta0=0.01 keeps the step size fixed at 0.01
#random_state is fixed (same seed as the train/test split) so that the data shuffling,
#and therefore the reported AUC, is reproducible from run to run
sgd_lr = SGDClassifier(loss='log_loss',penalty=None,fit_intercept=True,max_iter=20,learning_rate='constant',eta0=0.01,random_state=42)

Here, 'log_loss' for the loss parameter indicates that the cost function is log loss,
penalty is the regularization term used to reduce overfitting (it is set to None here, so no
regularization is applied; we will discuss it further in the next section), max_iter is the
maximum number of passes over the training data, and the remaining two parameters
mean the learning rate is 0.01 and stays unchanged during the course of training. It should
be noted that the default learning_rate is 'optimal', where the learning rate slightly
decreases as more and more updates are made. This can be beneficial for finding the
optimal solution on large datasets.

In [None]:
##lets fit our data to the algorithm
sgd_lr.fit(x_train_enc.toarray(),y_train)

#make predictions
#predict_proba returns one column per class; column 1 is kept
#(presumably the probability of click == 1 - confirm against sgd_lr.classes_)
pred = sgd_lr.predict_proba(x_test_enc.toarray())[:,1]

In [None]:
#check the roc auc score of the SGD model on the test split
print(f'Training samples: {x_train.shape[0]}, AUC on testing set: {roc_auc_score(y_test,pred):.3f}')

In [None]:
#i will add regularization to the model
#this regularizes the weight updates and is implemented with the penalty hyperparameter
#The regularization term is introduced in order to penalize large weights,
#as the weights now become part of the cost to minimize.
#As a result, regularization helps reduce overfitting. Finally, the parameter 'α' provides a
#trade-off between log loss and generalization. If 'α' is too small, it is not able to compress
#large weights and the model may suffer from high variance or overfitting; on the
#other hand, if 'α' is too large, the model may become over-generalized and perform
#poorly in terms of fitting the dataset, which is the symptom of underfitting. α is an
#important parameter to tune in order to obtain the best logistic regression model with
#regularization.

In [None]:
#initialize with alpha and the penalty term set to l1
#alpha=0.0001 is the strength of the l1 regularization term
#random_state is fixed (same seed as the train/test split) so that the data shuffling,
#and therefore the learned coefficients, is reproducible from run to run
sgd_lr_l1 = SGDClassifier(loss='log_loss',penalty='l1',alpha=0.0001,fit_intercept=True,max_iter=20,learning_rate='constant',eta0=0.01,random_state=42)

#fit
sgd_lr_l1.fit(x_train_enc.toarray(),y_train)

In [None]:
##lets obtain the absolute values of coefficients
#the magnitude of a coefficient is what matters here, not its sign
absolute_coefs = np.abs(sgd_lr_l1.coef_)

#print the shape
print(absolute_coefs.shape)

In [None]:
#the model, when combined with penalty='l1' and an alpha value,
##can be used as a feature selector because it tells us which features are redundant
#by pushing their coefficients close to or exactly 0.

In [None]:
##a more robust method for feature selection is the random forest
##a random forest is a bagging strategy over a set of decision trees:
##each tree considers a random subset of the features when searching for the best
##splitting point at each node. In a decision tree, only the significant features are used to form tree nodes.

#lets implement it
from sklearn.ensemble import RandomForestClassifier

#initialize
#random_state is fixed (same seed as the train/test split) so that the random sampling inside the forest,
#and therefore the feature importances, is reproducible from run to run
rf = RandomForestClassifier(n_estimators=100,criterion='gini',min_samples_split=30,n_jobs=-1,random_state=42)

#fit
rf.fit(x_train_enc.toarray(),y_train)

In [None]:
##lets check feature importances
#one importance value per column of the one-hot encoded training matrix
f_imp = rf.feature_importances_

print(f_imp)

In [None]:
##lets get features names from the encoder
f_names = enc.get_feature_names_out()

#attach the feature names to their importances
#argsort gives the column indices ordered from least to most important,
#so the same index selects both the importance value and its feature name
order = np.argsort(f_imp)

#the 10 smallest importances and the features they belong to
bottom_10 = order[:10]
print(f_imp[bottom_10])
print('10 least important features are:\n', f_names[bottom_10])

#the 10 largest importances (most important first) and the features they belong to
top_10 = order[-10:][::-1]
print(f_imp[top_10])
print('10 most important features are:\n', f_names[top_10])