In [54]:
# Importing all necessary Libraries

import numpy as np # numpy used for mathematical operation on array
import pandas as pd  # pandas used for data manipulation on dataframe
import seaborn as sns 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings( "ignore" )

In [55]:
# Read the data with pandas

df = pd.read_csv("iris.csv", header=0)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


# EDA Process 

In [56]:
# Checking the shape of the data
df.shape

(150, 5)

In [57]:
# Reading random Rows of the data

df.sample()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
113,5.7,2.5,5.0,2.0,virginica


In [58]:
#Reading the name of the columns

df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [59]:
#finding the datatype of each feature

df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [60]:
 # checking the information of the dataset
    
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [61]:
#for the description 

df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [62]:
#total number of missing value in a table of each column

df.isnull().sum() 

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [63]:
dataset = df

In [64]:
# Converting non numerical column into numerical

from sklearn.preprocessing import LabelEncoder

Label = LabelEncoder() 
dataset['species'] = Label.fit_transform(df[ 'species'])


In [65]:
dataset

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [66]:
# Separating the output from the dataset

X = dataset.loc[:,dataset.columns!="species"]


In [67]:
# Creating output column

y = dataset["species"]


In [68]:
# Checking the five rows of the input columns

X.sample(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
2,4.7,3.2,1.3,0.2
111,6.4,2.7,5.3,1.9
126,6.2,2.8,4.8,1.8
143,6.8,3.2,5.9,2.3
36,5.5,3.5,1.3,0.2


In [83]:
# Checking the five rows of the output columns

y.sample(5)

100    2
96     1
146    2
41     0
112    2
Name: species, dtype: int32

# train - test split

In [70]:
# Importing the train test split 

from sklearn.model_selection import train_test_split

In [71]:
# Separating the Training and testing Data

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.30,random_state=0)

# Model without sklearn

In [72]:
y_train = np.array(y_train).reshape((-1, 1))

In [82]:
class LogisticRegression():
    def __init__(self, learning_rate=0.001, num_iterations=10000, y_pred = None):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.bias = None
        self.weights = None
    
    
    def softmax(self, z):
        exp_scores = np.exp(z)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    
            
    def fit(self, X, y,learning_rate=0.001, num_iterations=10000):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        self.weights = np.zeros((n_features, n_classes))
        self.bias = np.zeros(n_classes)
              
        for i in range(num_iterations):
            Z = np.dot(X, self.weights) + self.bias
            LR = self.softmax(Z)
            loss = -np.mean(np.sum(y * np.log(LR), axis=1))
            dZ = LR - y
            
            dW = 1 / n_samples * np.dot(X.T, dZ)
            db = 1 / n_samples * np.sum(dZ, axis=0)
            
            self.weights -= learning_rate * dW
            self.bias -= learning_rate * db
            if (i+1) % 100 == 0:
                print("Iteration {} - loss: {:.4f}".format(i+1, loss))   
                
                
    
            

    def predict(self, X, threshold = 0.33):
        Z = np.dot(X, self.weights) + self.bias
        LR = self.softmax(Z)
        class_pred = []
        for i in range(LR.shape[0]):
            above_threshold = np.where(LR[i] >= threshold)[0]
            if len(above_threshold) > 0:
                class_pred.append(above_threshold[np.argmax(LR[i, above_threshold])])
            else:
                class_pred.append(np.argmax(LR[i]))
        return np.array(class_pred)

    


In [74]:

model = LogisticRegression(learning_rate=0.001, num_iterations=10000, y_pred = None)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Iteration 100 - loss: 3.4528
Iteration 200 - loss: 3.4528
Iteration 300 - loss: 3.4528
Iteration 400 - loss: 3.4528
Iteration 500 - loss: 3.4528
Iteration 600 - loss: 3.4528
Iteration 700 - loss: 3.4528
Iteration 800 - loss: 3.4528
Iteration 900 - loss: 3.4528
Iteration 1000 - loss: 3.4528
Iteration 1100 - loss: 3.4528
Iteration 1200 - loss: 3.4528
Iteration 1300 - loss: 3.4528
Iteration 1400 - loss: 3.4528
Iteration 1500 - loss: 3.4528
Iteration 1600 - loss: 3.4528
Iteration 1700 - loss: 3.4528
Iteration 1800 - loss: 3.4528
Iteration 1900 - loss: 3.4528
Iteration 2000 - loss: 3.4528
Iteration 2100 - loss: 3.4528
Iteration 2200 - loss: 3.4528
Iteration 2300 - loss: 3.4528
Iteration 2400 - loss: 3.4528
Iteration 2500 - loss: 3.4528
Iteration 2600 - loss: 3.4528
Iteration 2700 - loss: 3.4528
Iteration 2800 - loss: 3.4528
Iteration 2900 - loss: 3.4528
Iteration 3000 - loss: 3.4528
Iteration 3100 - loss: 3.4528
Iteration 3200 - loss: 3.4528
Iteration 3300 - loss: 3.4528
Iteration 3400 - lo

In [75]:
# Importing accuracy-score and confusion_matrix package

from sklearn.metrics import accuracy_score

In [84]:
import time

# Accuracy score of Logistic Regression model without using sklearn package

In [76]:
# Checking the accuracy Score

accuracy_score(y_test, y_pred)

0.35555555555555557

In [85]:
m = LogisticRegression(learning_rate=0.001, num_iterations=10000, y_pred = None)

start_time = time.time()
m.fit(X_train, y_train)
end_time = time.time()
elapsed_time = end_time - start_time

Iteration 100 - loss: 3.4528
Iteration 200 - loss: 3.4528
Iteration 300 - loss: 3.4528
Iteration 400 - loss: 3.4528
Iteration 500 - loss: 3.4528
Iteration 600 - loss: 3.4528
Iteration 700 - loss: 3.4528
Iteration 800 - loss: 3.4528
Iteration 900 - loss: 3.4528
Iteration 1000 - loss: 3.4528
Iteration 1100 - loss: 3.4528
Iteration 1200 - loss: 3.4528
Iteration 1300 - loss: 3.4528
Iteration 1400 - loss: 3.4528
Iteration 1500 - loss: 3.4528
Iteration 1600 - loss: 3.4528
Iteration 1700 - loss: 3.4528
Iteration 1800 - loss: 3.4528
Iteration 1900 - loss: 3.4528
Iteration 2000 - loss: 3.4528
Iteration 2100 - loss: 3.4528
Iteration 2200 - loss: 3.4528
Iteration 2300 - loss: 3.4528
Iteration 2400 - loss: 3.4528
Iteration 2500 - loss: 3.4528
Iteration 2600 - loss: 3.4528
Iteration 2700 - loss: 3.4528
Iteration 2800 - loss: 3.4528
Iteration 2900 - loss: 3.4528
Iteration 3000 - loss: 3.4528
Iteration 3100 - loss: 3.4528
Iteration 3200 - loss: 3.4528
Iteration 3300 - loss: 3.4528
Iteration 3400 - lo

In [86]:
N =  X_train.shape[0] #number of samples in the training set
d = X_train.shape[1] # number of features
time_complexity = elapsed_time / (N**2 * d)

In [87]:
print("time complexity of fitting a logistic regression model without using sklearn is {:.10f}".format(time_complexity))

time complexity of fitting a logistic regression model is 0.0000810991


# Model with sklearn

In [77]:
#importing LogisticRegression package

from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

In [78]:
# Importing accuracy-score and confusion_matrix package

from sklearn.metrics import accuracy_score

# Accuracy score of Logistic Regression model using sklearn package

In [79]:
# Checking the accuracy Score

accuracy_score(y_test, y_pred)

0.9777777777777777

In [88]:
m1 = LogisticRegression()

start_time = time.time()
m1.fit(X_train, y_train)
end_time = time.time()
elapsed_time = end_time - start_time

Iteration 100 - loss: 3.4528
Iteration 200 - loss: 3.4528
Iteration 300 - loss: 3.4528
Iteration 400 - loss: 3.4528
Iteration 500 - loss: 3.4528
Iteration 600 - loss: 3.4528
Iteration 700 - loss: 3.4528
Iteration 800 - loss: 3.4528
Iteration 900 - loss: 3.4528
Iteration 1000 - loss: 3.4528
Iteration 1100 - loss: 3.4528
Iteration 1200 - loss: 3.4528
Iteration 1300 - loss: 3.4528
Iteration 1400 - loss: 3.4528
Iteration 1500 - loss: 3.4528
Iteration 1600 - loss: 3.4528
Iteration 1700 - loss: 3.4528
Iteration 1800 - loss: 3.4528
Iteration 1900 - loss: 3.4528
Iteration 2000 - loss: 3.4528
Iteration 2100 - loss: 3.4528
Iteration 2200 - loss: 3.4528
Iteration 2300 - loss: 3.4528
Iteration 2400 - loss: 3.4528
Iteration 2500 - loss: 3.4528
Iteration 2600 - loss: 3.4528
Iteration 2700 - loss: 3.4528
Iteration 2800 - loss: 3.4528
Iteration 2900 - loss: 3.4528
Iteration 3000 - loss: 3.4528
Iteration 3100 - loss: 3.4528
Iteration 3200 - loss: 3.4528
Iteration 3300 - loss: 3.4528
Iteration 3400 - lo

In [89]:
N1 =  X_train.shape[0] #number of samples in the training set
d1 = X_train.shape[1] # number of features
time_complexity1 = elapsed_time / (N1**2 * d1)

In [90]:
print("time complexity of fitting a logistic regression model using sklearn is {:.10f}".format(time_complexity1))

time complexity of fitting a logistic regression model using sklearn is 0.0000841134
