<h2 style="color:red">Logistic Regression from scratch using python</h2>

In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

<h2>Explanation of Logistic Regression</h2>

<center><img src="logistic regression.jpg" height = "500" width = "700" ></center>

In [61]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, iterations = 1000):
        self.learning_rate = learning_rate
        self.iterations= iterations
        self.losses = []

    # passing datset to model
    def fit(self, X,y):
        rows, cols = X.shape
        self.weights = np.zeros(cols)
        self.bias = 0
        # gradient descent
        for i in range(self.iterations):
            value = self.update(X)
            self.losses.append(self.binary_cross_entropy(y,value))
            derivative_z = value - y
            derivative_weight = (1/rows) * np.dot(X.T, derivative_z)
            derivative_bias = (1/rows) * np.sum(derivative_z)
            # updating weights
            self.weights -= self.learning_rate * derivative_weight
            self.bias -= self.learning_rate * derivative_bias
                
    

    # Sigmoid funtion
    def sigmoid(self,x):
        return 1/(1 + np.exp(-x))
    
    # Implementing Binary Cross entropy
    def binary_cross_entropy(self, y_true, y_pred):
        epsilon = 1e-9
        y1 = y_true * np.log(y_pred + epsilon)
        y2 = (1-y_true) * np.log(1-y_pred + epsilon)
        return -np.mean(y1 + y2)
    
    # updating weights
    def update(self,X):
        z = np.dot(X, self.weights) + self.bias
        value = self.sigmoid(z)
        return value
    # predict function
    def predict(self, X):
        thresold = 0.5
        y_prediction = np.dot(X,self.weights) + self.bias
        y_predicted = self.sigmoid(y_prediction)
        predicted_class = [1 if i > thresold else 0 for i in y_predicted]
        return np.array(predicted_class)

<h2 style="color:blue">Using Breast Cancer Dataset for testing</h2>

In [62]:
dataset = load_breast_cancer()
df = pd.DataFrame(data= dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [63]:
df.shape

(569, 31)

In [64]:
X = dataset.data
y = dataset.target

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8, random_state=0)

In [66]:
model = LogisticRegression(learning_rate=0.01, iterations=1000)
model.fit(X_train, y_train)

  return 1/(1 + np.exp(-x))


In [67]:
y_preds = model.predict(X_test)
y_preds

  return 1/(1 + np.exp(-x))


array([0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1])

In [68]:
from sklearn.metrics import confusion_matrix, accuracy_score
matrix = confusion_matrix(y_preds, y_test)
matrix

array([[42,  2],
       [ 5, 65]], dtype=int64)

In [69]:
accuracy_score(y_preds, y_test)

0.9385964912280702

## Checking the accurary from inbuilt algorithm.

In [70]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(dataset.data ,dataset.target,train_size=0.8, random_state=0)

lmodel = LogisticRegression().fit(X_train, y_train)

y_predtions = lmodel.predict(X_test)

print(y_predtions)

print(confusion_matrix(y_predtions, y_test))

print(accuracy_score(y_predtions, y_test))


[0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 0 0 1 1
 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1
 0 0 1]
[[45  4]
 [ 2 63]]
0.9473684210526315


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
