In [1]:
!pip install ucimlrepo
import numpy as np
from tqdm import tqdm
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from sklearn.model_selection import train_test_split



In [93]:
class LogisticRegression:
    """
    Logistic Regression Class

    Attributes
    __________
    iterations : number of iterations used for gradient descent to find optimal weights,
    learning_rate : learning rate used for gradient descent(alpha),
    tolerance : tolerance used for gradient descent convergence of difference in weights,
    regularization_parameter : regularization parameter(lambda), Default = 0,
    weights : weights of the model after fitting,
    errors : errors of the model in each iteration of gradient descent

    Methods
    _______
    fit(X_train, y_train) : train model on training set
    predict(X_test) : predict the test data using fitted model weights,
    evaluate(X_test,y_test) : predict the test data using fitted model weights and get evaluation results Accuracy,
    Recall, F1 Score, Precision
    """
    def __init__(self, iterations, learning_rate, tolerance):
        self.errors = None
        self.prev_cost = None
        self.cost = None
        self.mean = None
        self.std = None
        self.X_train = None
        self.X_test = None
        self.y_test = None
        self.y_train = None
        self.weights = None
        self.iterations = iterations
        self.learning_rate = learning_rate
        self.tolerance = tolerance

    @staticmethod
    def sigmoid(x):
        """
        Sigmoid function
        :param x: input numpy array
        :return: sigmoid(x)
        """
        return 1 / (1 + np.exp(-x))

    def gradient_calculation(self):
        """
        Calculates the gradient of the logistic regression model's loss function with respect to the model's weights.
        The gradient calculation formula is:

        ∇(loss) = X.T * (y_pred - y)
        :return:
        numpy.ndarray: A gradient vector where each element is the partial
                       derivative of the loss function with respect to the
                       corresponding weight.
        """
        y_pred = self.sigmoid(np.matmul(self.X_train, self.weights))
        return self.X_train.T.dot(y_pred - self.y_train)

    def cost_function(self):
        """
        Computes the cost for a logistic regression model using cross-entropy loss
        and adds regularization penalty if applicable.

        The cost function is defined as:

            L = -sum(y * log(y_pred) + (1 - y) * log(1 - y_pred)) 

        where:
        - y is the vector of actual class labels,
        - y_pred is the vector of predicted probabilities, computed as sigmoid(X * w),
       

        Returns:
            float: The computed cost value.
        """
        
        y_pred = self.sigmoid(np.matmul(self.X_train, self.weights))
        cost = self.y_train * np.log(y_pred) + (1 - self.y_train) * np.log(1 - y_pred)
        cost = -np.sum(cost) 
        return cost

    def gradient_descent(self):
        """
        Performs gradient descent optimization to find the optimal weights of the logistic regression model.

        The method iteratively updates the weights by moving in the direction of the negative gradient of the cost function, adjusted by the learning rate.

        Weight update formulas:
        - w = w - learning_rate * ∇(loss)
      
        The process continues for a specified number of iterations or until the improvement in cost is less than a defined tolerance, indicating convergence.

        Attributes updated during optimization:
        - self.weights: The weights vector of the model.
        - self.errors: A list of cost values at each iteration, tracking how the cost changes.

        Side effects:
        - If the change in cost between iterations is less than the tolerance, it prints a message and stops further updates.

        """
        self.errors = []
        self.weights = np.zeros(shape=(self.X_train.shape[1]))
        self.prev_cost = np.inf
        for i in tqdm(range(self.iterations), colour='WHITE'):
            self.weights -= (self.learning_rate * self.gradient_calculation())
            self.cost = self.cost_function()
            self.errors.append(self.cost)
            if self.prev_cost - self.cost <= self.tolerance:
                print('Model has stopped improving')
                break
            self.prev_cost = self.cost

    def fit(self, X_train, y_train):
        """
        Fits the logistic regression model to the training data using gradient descent.

        This method initializes the training process by setting the training data and target labels,
        and then calls the `gradient_descent` method to optimize the model's weights.

        Parameters:
            X_train (array-like): The input features of the training data.
            y_train (array-like): The target labels corresponding to the input features.

        The training process involves:
        - Storing the training data (`X_train`) and labels (`y_train`) in the instance variables.
        - Calling the `gradient_descent` method to adjust the weights based on the loss gradient.

        Note:
        - The shape of `X_train` should match the expected number of features.
        - The shape of `y_train` should correspond to the number of samples in `X_train`.
        """

        self.X_train = X_train
        self.y_train = y_train
        self.gradient_descent()

    def predict(self, X_test):
        """
        Predicts class labels for given input samples using the trained logistic regression model.

        This method computes predictions by applying the sigmoid function to the linear combination
        of input features and the learned weights. The output of the sigmoid function represents
        the probability of the input belonging to the positive class, which is then threshold at 0.5
        to produce binary class labels.

        Parameters:
            X_test (array-like): The input features of the test data.

        Returns:
            numpy.ndarray: An array of predicted class labels (0 or 1) for each input sample.

        The prediction is calculated as follows:
        - Apply the sigmoid function to (X_test * weights) to get the probability of the positive class.
        - Threshold the probabilities at 0.5 to determine the class labels.
        """
        predictions = self.sigmoid(np.matmul(X_test, self.weights))
        return np.round(predictions)

    def evaluate(self, X_test, y_test):
        """
        Evaluates the performance of the logistic regression model on a test dataset.

        This method uses the model's predictions to calculate key classification metrics:
        accuracy, precision, recall, and F1-score. These metrics provide insights into the
        effectiveness of the model in classifying positive and negative classes.

        Parameters:
            X_test (array-like): The input features of the test data.
            y_test (array-like): The actual class labels for the test data.

        The evaluation metrics are calculated as follows:
        - True Positives (TP): Correct positive predictions.
        - False Positives (FP): Incorrect positive predictions.
        - True Negatives (TN): Correct negative predictions.
        - False Negatives (FN): Incorrect negative predictions.

        Formulas:
        - Accuracy = TP / (TP + FP)
        - Recall = TP / (TP + FN)
        - Precision = TP / (TP + FP)
        - F1-score = 2 * (Precision * Recall) / (Precision + Recall)

        Side effects:
        - Prints the calculated metrics: Accuracy, Recall, Precision, and F1-score.
        """
        y_predicted = self.predict(X_test)
        true_positives = np.sum(y_predicted * y_test)
        false_positive = np.sum(y_predicted * (1 - y_test))
        true_negative = np.sum((y_predicted == 0) & (y_test == 0))
        false_negative = np.sum((y_predicted == 0) & (y_test == 1))

        accuracy = true_positives / (true_positives + false_positive)
        recall = true_positives / (true_positives + false_negative)
        precision = true_positives / (true_positives + false_positive)
        f1_score = 2 * (precision * recall) / (precision + recall)

        print('Accuracy: ', accuracy)
        print('Recall: ', recall)
        print('Precision: ', precision)
        print('F1-score: ', f1_score)


In [94]:
# fetch dataset 
room_occupancy_estimation = fetch_ucirepo(id=864) 
  
# data (as pandas dataframes) 
X = room_occupancy_estimation.data.features 
y = room_occupancy_estimation.data.targets


In [95]:
X.loc[:,'Date'] = pd.to_datetime(X.loc[:,'Date'])
X.loc[:,'Time'] = pd.to_timedelta(X.loc[:,'Time'])
Z = pd.DataFrame()

In [96]:
X

Unnamed: 0,Date,Time,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR
0,2017-12-22 00:00:00,0 days 10:49:41,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0
1,2017-12-22 00:00:00,0 days 10:50:12,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0
2,2017-12-22 00:00:00,0 days 10:50:42,25.00,24.75,24.50,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0
3,2017-12-22 00:00:00,0 days 10:51:13,25.00,24.75,24.56,25.44,121,34,53,40,0.41,0.10,0.10,0.09,390,0.388462,0,0
4,2017-12-22 00:00:00,0 days 10:51:44,25.00,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10124,2018-01-11 00:00:00,0 days 08:58:07,25.06,25.13,24.69,25.31,6,7,33,22,0.09,0.04,0.06,0.08,345,0.000000,0,0
10125,2018-01-11 00:00:00,0 days 08:58:37,25.06,25.06,24.69,25.25,6,7,34,22,0.07,0.05,0.05,0.08,345,0.000000,0,0
10126,2018-01-11 00:00:00,0 days 08:59:08,25.13,25.06,24.69,25.25,6,7,34,22,0.11,0.05,0.06,0.08,345,0.000000,0,0
10127,2018-01-11 00:00:00,0 days 08:59:39,25.13,25.06,24.69,25.25,6,7,34,22,0.08,0.08,0.10,0.08,345,0.000000,0,0


In [97]:
y = np.array(y.loc[:,'Room_Occupancy_Count'])
X = X[['S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp', 'S1_Light',
       'S2_Light', 'S3_Light', 'S4_Light', 'S1_Sound', 'S2_Sound', 'S3_Sound',
       'S4_Sound', 'S5_CO2', 'S5_CO2_Slope', 'S6_PIR', 'S7_PIR']]
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
X = (X - mean) / std
X = np.array(X)

In [98]:
X

array([[-1.46303347, -1.35777517, -1.16233429, ...,  0.66446859,
        -0.31474885, -0.29402867],
       [-1.46303347, -1.35777517, -1.16233429, ...,  0.558817  ,
        -0.31474885, -0.29402867],
       [-1.29225542, -1.35777517, -1.30276354, ...,  0.4498638 ,
        -0.31474885, -0.29402867],
       ...,
       [-0.92223631, -0.82903247, -0.85807092, ...,  0.00414617,
        -0.31474885, -0.29402867],
       [-0.92223631, -0.82903247, -0.85807092, ...,  0.00414617,
        -0.31474885, -0.29402867],
       [-0.92223631, -0.82903247, -0.85807092, ...,  0.00414617,
        -0.31474885, -0.29402867]])

In [99]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [105]:
np.sum(y == 3)

694

In [100]:
X

array([[-1.46303347, -1.35777517, -1.16233429, ...,  0.66446859,
        -0.31474885, -0.29402867],
       [-1.46303347, -1.35777517, -1.16233429, ...,  0.558817  ,
        -0.31474885, -0.29402867],
       [-1.29225542, -1.35777517, -1.30276354, ...,  0.4498638 ,
        -0.31474885, -0.29402867],
       ...,
       [-0.92223631, -0.82903247, -0.85807092, ...,  0.00414617,
        -0.31474885, -0.29402867],
       [-0.92223631, -0.82903247, -0.85807092, ...,  0.00414617,
        -0.31474885, -0.29402867],
       [-0.92223631, -0.82903247, -0.85807092, ...,  0.00414617,
        -0.31474885, -0.29402867]])

In [111]:
class OnevAll():
    
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.logistic_regression_weights = {} 
        
    def implement_logistic_get_weights(self):
        logistic_object =  LogisticRegression(iterations=50000, learning_rate=0.0001, tolerance=0.0005)
        logistic_object.fit(self.X_train, self.y_train)
        return logistic_object.weights
   
    def implement_logistic(self):
        self.unique_y = np.unique(self.y)
        for unique in self.unique_y:
            self.y = np.where(self.y == unique, 1, 0)
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, shuffle=True, stratify=self.y)
            self.logistic_regression_weights[unique] = self.implement_logistic_get_weights()
        
        return self.logistic_regression_weights
    

In [112]:
onevall = OnevAll(X, y)
z =onevall.implement_logistic()

 98%|[37m█████████▊[0m| 48800/50000 [00:15<00:00, 3165.86it/s]


Model has stopped improving


 97%|[37m█████████▋[0m| 48568/50000 [00:15<00:00, 3206.71it/s]


Model has stopped improving


  0%|[37m          [0m| 238/50000 [00:00<00:15, 3186.80it/s]


Model has stopped improving


  0%|[37m          [0m| 114/50000 [00:00<00:19, 2619.07it/s]

Model has stopped improving





In [108]:
z

{0: array([-1.26575837,  0.03987283, -4.56872264,  3.37499022, -9.48730324,
        -6.71550117, -3.98725568,  3.23132381,  0.12342328, -0.48462689,
         0.55926653,  0.35975536,  2.43614809, -2.77757915, -0.4321476 ,
        -0.73128358]),
 1: array([-1.18271884,  0.55425309, -4.62523292,  3.48707673, -8.55638781,
        -7.10701794, -4.42643458,  3.40962668,  0.44476197, -0.34644891,
        -0.43819196, -0.16623437,  2.15152104, -3.57682922, -0.26181621,
        -0.62678846]),
 2: array([-0.01698969, -0.02590143, -0.01542664,  0.03859748,  0.00876585,
         0.06827391, -0.06140541, -0.01243823,  0.00762682, -0.01949535,
         0.01396406, -0.01170992,  0.01344386,  0.00637472,  0.0099915 ,
        -0.00058254]),
 3: array([ 2.63321093e-02, -1.19502827e-02, -1.94101205e-02,  1.21190336e-03,
         4.56882095e-02, -2.02937429e-02,  5.50958400e-04, -3.94526426e-03,
        -8.60485413e-05,  2.33772067e-03,  2.30377528e-02, -9.12886443e-03,
        -2.51805473e-02,  2.593408

In [89]:
X

array([[-1.46303347, -1.35777517, -1.16233429, ...,  0.66446859,
        -0.31474885, -0.29402867],
       [-1.46303347, -1.35777517, -1.16233429, ...,  0.558817  ,
        -0.31474885, -0.29402867],
       [-1.29225542, -1.35777517, -1.30276354, ...,  0.4498638 ,
        -0.31474885, -0.29402867],
       ...,
       [-0.92223631, -0.82903247, -0.85807092, ...,  0.00414617,
        -0.31474885, -0.29402867],
       [-0.92223631, -0.82903247, -0.85807092, ...,  0.00414617,
        -0.31474885, -0.29402867],
       [-0.92223631, -0.82903247, -0.85807092, ...,  0.00414617,
        -0.31474885, -0.29402867]])

[0 1]
