In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.datasets import make_classification

In [3]:
# creake the Dataset 
X,y=make_classification(n_samples =1000,n_features=10,n_classes=2,random_state=42)

In [4]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.964799,-0.066449,0.986768,-0.358079,0.997266,1.181890,-1.615679,-1.210161,-0.628077,1.227274
1,-0.916511,-0.566395,-1.008614,0.831617,-1.176962,1.820544,1.752375,-0.984534,0.363896,0.209470
2,-0.109484,-0.432774,-0.457649,0.793818,-0.268646,-1.836360,1.239086,-0.246383,-1.058145,-0.297376
3,1.750412,2.023606,1.688159,0.006800,-1.607661,0.184741,-2.619427,-0.357445,-1.473127,-0.190039
4,-0.224726,-0.711303,-0.220778,0.117124,1.536061,0.597538,0.348645,-0.939156,0.175915,0.236224
...,...,...,...,...,...,...,...,...,...,...
995,-1.367638,1.462255,-1.154918,-0.290454,-0.413424,0.032396,1.545490,1.428760,1.687092,1.072542
996,-1.514876,-3.221016,-1.300744,0.395599,-0.527994,1.353069,1.777506,-1.680870,1.798510,0.034272
997,1.674633,1.754933,1.586154,0.018402,-1.514470,0.321593,-2.417694,0.692723,-1.503850,0.225264
998,-0.778609,-0.835689,-0.194842,1.097220,0.180071,-0.272933,-0.533188,-0.497354,2.472138,0.867187


In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.25 , random_state=42)

In [6]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()

In [7]:
logistic.fit(X_train,y_train)

In [8]:
y_pred = logistic.predict(X_test)

In [9]:
# logistic.predict_proba(X_test)

In [10]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [11]:
score = accuracy_score(y_test,y_pred)
print(score)

0.84


In [12]:
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[ 98  15]
 [ 25 112]]


In [13]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       113
           1       0.88      0.82      0.85       137

    accuracy                           0.84       250
   macro avg       0.84      0.84      0.84       250
weighted avg       0.84      0.84      0.84       250



# Hyperparameter Tuning and Cross-Validation

In [14]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

penalty = ['l1','l2','elasticnet']
c_value = [100,10,1.0,0.01,0.1]
solver = ['newton-cg','lbfgs','liblinear','sag','saga']

In [15]:
params=dict(penalty=penalty,C=c_value,solver=solver)

In [16]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold()

## GridSearchCV 

In [17]:
# find out the best parameter for this particular data set using GridSeachCV techinque
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=model,param_grid=params,scoring='accuracy',cv=cv,n_jobs=-1 )

In [18]:
grid

In [19]:
grid.fit(X_train,y_train)

In [20]:
grid.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}

In [21]:
grid.best_score_

0.8773333333333333

In [22]:
y_pred=grid.predict(X_test)

In [23]:
score=accuracy_score(y_pred,y_test)
print(score)
report = classification_report(y_pred,y_test)
print(report)
cm = confusion_matrix(y_pred,y_test)
print(cm)

0.84
              precision    recall  f1-score   support

           0       0.91      0.77      0.84       133
           1       0.78      0.91      0.84       117

    accuracy                           0.84       250
   macro avg       0.85      0.84      0.84       250
weighted avg       0.85      0.84      0.84       250

[[103  30]
 [ 10 107]]


# Randomized SearchCV

In [24]:
# This will pick up the random parameter that is suitable for this data set 

from sklearn.model_selection import RandomizedSearchCV

model = LogisticRegression()
randomcv=RandomizedSearchCV(estimator=model,param_distributions=params,cv=5)


In [25]:
randomcv.fit(X_train,y_train)

In [26]:
randomcv.best_score_

0.876

In [27]:
randomcv.best_params_

{'solver': 'liblinear', 'penalty': 'l2', 'C': 0.01}

In [28]:
y_pred=randomcv.predict(X_test)

In [29]:
score=accuracy_score(y_pred,y_test)
print(score)
report = classification_report(y_pred,y_test)
print(report)
cm = confusion_matrix(y_pred,y_test)
print(cm)

0.84
              precision    recall  f1-score   support

           0       0.91      0.77      0.84       133
           1       0.78      0.91      0.84       117

    accuracy                           0.84       250
   macro avg       0.85      0.84      0.84       250
weighted avg       0.85      0.84      0.84       250

[[103  30]
 [ 10 107]]


# Logistic Regression for Multi Class Classification Problem

In [30]:
# creake the Dataset 
X,y=make_classification(n_samples =1000,n_features=10,n_informative=3,n_classes=3,random_state=42)

In [31]:
y

array([1, 2, 1, 1, 2, 1, 1, 0, 2, 2, 2, 0, 1, 0, 2, 1, 2, 1, 0, 1, 1, 1,
       2, 2, 1, 2, 2, 0, 1, 2, 0, 1, 0, 2, 0, 1, 0, 0, 2, 0, 1, 0, 2, 1,
       2, 0, 2, 0, 2, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 2, 1, 0, 0, 1, 2,
       0, 2, 2, 1, 0, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0,
       0, 1, 1, 2, 1, 0, 1, 2, 0, 2, 0, 0, 2, 1, 0, 2, 0, 2, 2, 0, 2, 0,
       0, 0, 0, 2, 1, 2, 1, 0, 2, 0, 0, 2, 1, 0, 2, 1, 0, 0, 2, 2, 0, 0,
       0, 2, 1, 2, 2, 0, 2, 0, 1, 2, 1, 2, 1, 1, 2, 0, 0, 1, 0, 2, 0, 0,
       0, 1, 1, 2, 1, 2, 2, 0, 0, 0, 1, 1, 0, 2, 1, 2, 2, 2, 1, 1, 0, 2,
       1, 0, 2, 1, 2, 1, 2, 0, 1, 1, 0, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2,
       1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 0, 2, 2, 2, 0,
       2, 0, 1, 1, 2, 1, 2, 2, 2, 2, 0, 1, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1,
       0, 2, 0, 0, 1, 0, 2, 0, 1, 2, 2, 2, 1, 2, 1, 0, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 0, 0, 0, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 0, 0, 2, 2,
       2, 1, 1, 1, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 1,

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.25 , random_state=42)

In [33]:
from sklearn.linear_model import LogisticRegression

# Initialize a Logistic Regression model
model = LogisticRegression()

# Create a logistic regression model with all hyperparameters
logistic = LogisticRegression(
    penalty='l2',                   # Specifies the norm used in the penalization (regularization)
    dual=False,                     # Dual formulation is only for 'l2' penalty with liblinear solver
    tol=1e-4,                       # Tolerance for stopping criteria
    C=1.0,                          # Inverse of regularization strength; must be a positive float
    fit_intercept=True,             # Specifies if a constant (bias or intercept) should be added to the decision function
    intercept_scaling=1,            # Useful only when the solver is 'liblinear'
    class_weight=None,              # Weights associated with classes in the form {class_label: weight}
    random_state=42,                # Seed of the pseudo random number generator
    solver='lbfgs',                 # Algorithm to use in the optimization problem
    max_iter=1000,                   # Maximum number of iterations taken for the solvers to converge
    multi_class='ovr',              # If the option chosen is 'ovr', a binary problem is fit for each label
    verbose=0,                      # For the liblinear and lbfgs solvers set verbose to any positive number for verbosity
    warm_start=False,               # When set to True, reuse the solution of the previous call to fit as initialization
    n_jobs=None,                    # Number of CPU cores used when parallelizing over classes if multi_class='ovr'
    l1_ratio=None                   # The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1
)

# Fit the model on the training data
logistic.fit(X_train, y_train)

# Predict the labels for the test data
y_pred = logistic.predict(X_test)


In [34]:
score=accuracy_score(y_pred,y_test)
print(score)
report = classification_report(y_pred,y_test)
print(report)
cm = confusion_matrix(y_pred,y_test)
print(cm)

0.668
              precision    recall  f1-score   support

           0       0.74      0.68      0.71        94
           1       0.41      0.58      0.48        59
           2       0.84      0.71      0.77        97

    accuracy                           0.67       250
   macro avg       0.67      0.66      0.65       250
weighted avg       0.70      0.67      0.68       250

[[64 27  3]
 [15 34 10]
 [ 7 21 69]]


## RandomizedSearchCV 


In [35]:
from sklearn.model_selection import RandomizedSearchCV

model = LogisticRegression()
randomcv=RandomizedSearchCV(estimator=model,param_distributions=params,cv=5)
randomcv.fit(X_train,y_train)
y_pred=randomcv.predict(X_test)
randomcv.best_params_

{'solver': 'saga', 'penalty': 'l1', 'C': 100}

In [36]:
score=accuracy_score(y_pred,y_test)
print(score)
report = classification_report(y_pred,y_test)
print(report)
cm = confusion_matrix(y_pred,y_test)
print(cm)

0.696
              precision    recall  f1-score   support

           0       0.71      0.73      0.72        83
           1       0.54      0.59      0.56        74
           2       0.84      0.74      0.79        93

    accuracy                           0.70       250
   macro avg       0.70      0.69      0.69       250
weighted avg       0.71      0.70      0.70       250

[[61 20  2]
 [19 44 11]
 [ 6 18 69]]


# Hyperparameter tuning and Cross-Validation

In [37]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [38]:
params=dict(penalty=penalty,C=c_value,solver=solver)

In [39]:
penalty = ['l1','l2','elasticnet']
c_value = [100,10,1.0,0.01,0.1]
solver = ['newton-cg','lbfgs','liblinear','sag','saga']

In [40]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold()

# GridSearchCV

In [41]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=model,param_grid=params,scoring='accuracy',cv=cv,n_jobs=-1 )

In [42]:
grid

In [43]:
grid.fit(X_train,y_train)

In [44]:
y_pred=grid.predict(X_test)

In [45]:
grid.best_params_

{'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}

In [46]:
grid.best_score_

0.6586666666666666

In [47]:
score=accuracy_score(y_pred,y_test)
print(score)
report = classification_report(y_pred,y_test)
print(report)
cm = confusion_matrix(y_pred,y_test)
print(cm)

0.684
              precision    recall  f1-score   support

           0       0.74      0.69      0.72        93
           1       0.43      0.61      0.50        57
           2       0.88      0.72      0.79       100

    accuracy                           0.68       250
   macro avg       0.68      0.67      0.67       250
weighted avg       0.73      0.68      0.70       250

[[64 26  3]
 [15 35  7]
 [ 7 21 72]]
