In [158]:
import numpy as np
import pandas as pd

In [188]:
class LogisticRegression:
    """
    Binary logistic regression trained with batch gradient descent and an L2 penalty.

    Takes numpy arrays, pandas DataFrames/Series or other array-likes as input and
    returns numpy arrays as output.

    ### Parameters:
        random_state           : Seed used for initialising the weights and bias in fit().
        epochs                 : No of iterations for gradient descent.
        alpha                  : Learning rate.
        regularization_term    : L2 penalty strength applied to the weights (the bias is not penalised).
        decision_boundary      : Probability threshold; label 1 is predicted when the probability is above it.
        show_cost_iteration    : Set True if the user wants to see the iterations and corresponding cost.
        cost_iteration_interval: Print the cost every this many iterations. When left as None (or 0)
                                 while show_cost_iteration is True, about ten evenly spaced reports are printed.

    ### Attributes:
        self.W             : Weight matrix of shape (n_attributes, 1).
        self.b             : Bias value.
        self.n_observations: No of observations in the training data.
        self.n_attributes  : No of attributes in the training data.
        self.cost_history  : Cost recorded after every epoch of the last fit() call.

    ### Methods:
        fit(X, y) -> LogisticRegression:
        Used for training the model.
            X: Input feature matrix for training (1-D input is treated as a single feature).
            y: Input label vector for training (values 0 or 1).
        predict(X) -> ArrayLike:
        Used for predicting labels.
            X: Input feature matrix for predicting labels.
    """

    def __init__(self, random_state=0, epochs=1000, alpha=0.001, regularization_term=100, decision_boundary=0.5, show_cost_iteration=False, cost_iteration_interval=None):
        # The seed is only stored here. Seeding numpy's global generator from a
        # constructor would silently change randomness for unrelated code.
        self.random_state = random_state
        self.epochs = epochs
        self.alpha = alpha
        self.regularization_term = regularization_term
        self.decision_boundary = decision_boundary
        self.show_cost_iteration = show_cost_iteration
        self.cost_iteration_interval = cost_iteration_interval

    @staticmethod
    def _as_feature_matrix(X):
        """Return X as a 2-D float array; 1-D input becomes a single feature column.

        Raises ValueError when X is neither 1-D nor 2-D.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape((-1, 1))
        if X.ndim != 2:
            raise ValueError("X must be a 1-D or 2-D array-like.")
        return X

    def _sigmoid(self, Z):
        """Logistic function 1 / (1 + exp(-Z)), evaluated without overflow."""
        Z = np.asarray(Z, dtype=float)
        # exp(-log(1 + exp(-Z))) is the same quantity, but logaddexp never
        # evaluates exp() of a large positive number.
        return np.exp(-np.logaddexp(0, -Z))

    def _linear(self, X):
        """Return the logits X @ W + b."""
        # The logits are also kept on the instance as `y_pred`, an attribute
        # earlier versions of this class exposed.
        self.y_pred = np.matmul(X, self.W) + self.b
        return self.y_pred

    def _predict_y(self, X):
        """Return the predicted probability of label 1 for every row of X."""
        return self._sigmoid(self._linear(X))

    def _cost_function(self, X, y):
        """Regularised mean cross-entropy of the current parameters on (X, y)."""
        Z = self._linear(X)
        y = np.asarray(y, dtype=float).reshape(Z.shape)

        # Calculating cost part directly from the logits:
        #   log(sigmoid(Z))     = -logaddexp(0, -Z)
        #   log(1 - sigmoid(Z)) = -logaddexp(0,  Z)
        # This avoids log(0) (and the resulting nan) when a probability saturates.
        losses = y * np.logaddexp(0, -Z) + (1 - y) * np.logaddexp(0, Z)
        cost_part = np.sum(losses) / self.n_observations

        # Calculating regularization part.
        regularization_part = np.sum(np.square(self.W)) * (self.regularization_term / (2 * self.n_observations))

        return cost_part + regularization_part

    def fit(self, X, y):
        """Train the model with batch gradient descent and return self.

        Raises ValueError when X is empty or not 1-D/2-D, or when X and y have a
        different number of observations.
        """
        # Converting input X and y to numpy arrays (works for pandas objects and lists too).
        X = self._as_feature_matrix(X)
        y = np.asarray(y, dtype=float)
        self.n_observations, self.n_attributes = X.shape

        if self.n_observations == 0:
            raise ValueError("Cannot fit on an empty dataset.")

        # Raise error if no of observations is not same for X and y.
        if y.ndim == 0 or y.shape[0] != self.n_observations:
            raise ValueError("The number of observations aren't same for X and y.")
        y = y.reshape((self.n_observations, 1))

        # Initialize values for weights and biases from a private generator so
        # that every fit() call with the same random_state starts identically.
        rng = np.random.RandomState(self.random_state)
        self.W = 0.1 * rng.random_sample((self.n_attributes, 1))
        self.b = 0.1 * rng.random_sample()

        # Fall back to roughly ten reports when no interval was given.
        report_every = self.cost_iteration_interval or max(1, self.epochs // 10)

        # Variable to store cost history (preallocated instead of appended to).
        self.cost_history = np.empty(self.epochs)

        # Iterating for training.
        for i in range(self.epochs):

            # Calculating the gradients of weights and bias.
            diff_y = self._predict_y(X) - y
            gradient_cost_part = np.matmul(X.T, diff_y) / self.n_observations
            gradient_regularization_part = self.W * (self.regularization_term / self.n_observations)
            gradient_W = gradient_cost_part + gradient_regularization_part
            gradient_b = np.sum(diff_y, axis=0) / self.n_observations

            # Updating weights and bias.
            self.W = self.W - self.alpha * gradient_W
            self.b = self.b - self.alpha * gradient_b

            # Saving cost function for auditing purposes.
            cost = self._cost_function(X, y)
            self.cost_history[i] = cost

            # Used for displaying iteration and corresponding cost.
            if self.show_cost_iteration and i % report_every == 0:
                print(f"Iteration: {i}\t Cost: {cost}")

        return self

    def _decision_boundary_calculation(self, predictions):
        """Map probabilities (scalar or array) to labels: 1 above decision_boundary, else 0."""
        return (np.asarray(predictions) > self.decision_boundary).astype(int)

    def predict(self, X):
        """Return a 1-D integer array of predicted labels (0 or 1), one per row of X.

        Raises ValueError when X does not have the number of attributes seen in fit().
        """
        # Convert first so that pandas objects, lists and 1-D single-feature
        # input are all validated on the same 2-D representation.
        X = self._as_feature_matrix(X)

        # Check whether input X is in correct form or not.
        if X.shape[1] != self.n_attributes:
            raise ValueError("Number of dimensions in test data is not same as trained data.")

        # Predicting values.
        return self._decision_boundary_calculation(self._predict_y(X)).flatten()



In [172]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer data as a (features, labels) pair.
X, y = load_breast_cancer(return_X_y=True)

# Class balance of the labels; the last expression is what the cell displays.
label_counts = pd.Series(y).value_counts()
label_counts

1    357
0    212
dtype: int64

In [167]:
from sklearn.model_selection import train_test_split

# 75/25 split, stratified on the labels so both parts keep the class ratio;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    stratify=y,
    random_state=100,
)

In [174]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [194]:
# Train the notebook's own LogisticRegression (fit returns the model itself)
# and label the held-out split.
model1 = LogisticRegression(decision_boundary=0.5).fit(X_train, y_train)
predictions1 = model1.predict(X_test)
predictions1

array([1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0])

In [195]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label matches the true label.
accuracy1 = accuracy_score(y_true=y_test, y_pred=predictions1)
print(accuracy1)

0.972027972027972


In [196]:
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predicted labels.
confusion1 = confusion_matrix(y_true=y_test, y_pred=predictions1)
print(confusion1)

[[51  2]
 [ 2 88]]


In [186]:
# Reference model: scikit-learn's implementation, imported under an alias so it
# does not rebind the name of the LogisticRegression class defined in this
# notebook. Without the alias, re-running the model1 cell after this one would
# instantiate scikit-learn's class and fail on the `decision_boundary` argument.
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression

model2 = SklearnLogisticRegression()
model2.fit(X_train, y_train)
predictions2 = model2.predict(X_test)

# Same metrics as for model1, for a side-by-side comparison.
print(accuracy_score(y_test, predictions2))
print(confusion_matrix(y_test, predictions2))

0.972027972027972
[[52  1]
 [ 3 87]]
