# Logistic Regression Implementation

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import make_classification

In [4]:
#Create the data
x, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

In [7]:
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.964799,-0.066449,0.986768,-0.358079,0.997266,1.181890,-1.615679,-1.210161,-0.628077,1.227274
1,-0.916511,-0.566395,-1.008614,0.831617,-1.176962,1.820544,1.752375,-0.984534,0.363896,0.209470
2,-0.109484,-0.432774,-0.457649,0.793818,-0.268646,-1.836360,1.239086,-0.246383,-1.058145,-0.297376
3,1.750412,2.023606,1.688159,0.006800,-1.607661,0.184741,-2.619427,-0.357445,-1.473127,-0.190039
4,-0.224726,-0.711303,-0.220778,0.117124,1.536061,0.597538,0.348645,-0.939156,0.175915,0.236224
...,...,...,...,...,...,...,...,...,...,...
995,-1.367638,1.462255,-1.154918,-0.290454,-0.413424,0.032396,1.545490,1.428760,1.687092,1.072542
996,-1.514876,-3.221016,-1.300744,0.395599,-0.527994,1.353069,1.777506,-1.680870,1.798510,0.034272
997,1.674633,1.754933,1.586154,0.018402,-1.514470,0.321593,-2.417694,0.692723,-1.503850,0.225264
998,-0.778609,-0.835689,-0.194842,1.097220,0.180071,-0.272933,-0.533188,-0.497354,2.472138,0.867187


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [10]:
#Model training 

In [11]:
from sklearn.linear_model import LogisticRegression
regression = LogisticRegression()

In [12]:
regression.fit(x_train, y_train)

In [13]:
y_pred = regression.predict(x_test)

In [15]:
#For probability

# regression.predict_proba(x_test)

In [16]:
#Performance metrics 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [17]:
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(f"Score : {score}")
print(f"Confusion matrix : {cm}")
print(f"Classification Report : {cr}")





Score : 0.8466666666666667
Confusion matrix : [[118  17]
 [ 29 136]]
Classification Report :               precision    recall  f1-score   support

           0       0.80      0.87      0.84       135
           1       0.89      0.82      0.86       165

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300



# Hyperparameter tuning and cross validation

In [18]:
model = LogisticRegression()

In [19]:
penalty = ['l1', 'l2', 'elasticnet']
c_values=[100, 10, 1.0, 0.01]
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] 

In [21]:
params = dict(penalty=penalty, C=c_values, solver=solver)

In [22]:
#Grid Search CV

In [26]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
cv = StratifiedKFold()

In [27]:
grid = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=cv, n_jobs=-1)

In [28]:
grid

In [29]:
grid.fit(x_train, y_train)

160 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Asus\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Asus\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Asus\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Asus\anaconda3\Lib\site-packages\sklear

In [30]:
grid.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}

In [31]:
grid.best_score_

np.float64(0.8785714285714287)

In [32]:
y_pred = grid.predict(x_test)

In [33]:
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(f"Score : {score}")
print(f"Confusion matrix : {cm}")
print(f"Classification Report : {cr}")

Score : 0.8533333333333334
Confusion matrix : [[124  11]
 [ 33 132]]
Classification Report :               precision    recall  f1-score   support

           0       0.79      0.92      0.85       135
           1       0.92      0.80      0.86       165

    accuracy                           0.85       300
   macro avg       0.86      0.86      0.85       300
weighted avg       0.86      0.85      0.85       300



In [34]:
# Randamised Search CV

In [35]:
from sklearn.model_selection import RandomizedSearchCV


TypeError: RandomizedSearchCV.__init__() missing 2 required positional arguments: 'estimator' and 'param_distributions'

In [36]:
model = LogisticRegression()
randomcv = RandomizedSearchCV(estimator=model, param_distributions=params, cv=5, scoring='accuracy')

In [37]:
randomcv.fit(x_train, y_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Asus\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Asus\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Asus\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Asus\anaconda3\Lib\site-packages\sklearn\l

In [38]:
randomcv.best_params_

{'solver': 'sag', 'penalty': 'l2', 'C': 0.01}

In [39]:
randomcv.best_score_

np.float64(0.8771428571428572)

In [41]:
y_pred = randomcv.predict(x_test)

In [42]:
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(f"Score : {score}")
print(f"Confusion matrix : {cm}")
print(f"Classification Report : {cr}")

Score : 0.8533333333333334
Confusion matrix : [[124  11]
 [ 33 132]]
Classification Report :               precision    recall  f1-score   support

           0       0.79      0.92      0.85       135
           1       0.92      0.80      0.86       165

    accuracy                           0.85       300
   macro avg       0.86      0.86      0.85       300
weighted avg       0.86      0.85      0.85       300

