In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import make_classification


In [4]:
# Create a synthetic dataset

X,y =make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

In [5]:
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.964799,-0.066449,0.986768,-0.358079,0.997266,1.18189,-1.615679,-1.210161,-0.628077,1.227274
1,-0.916511,-0.566395,-1.008614,0.831617,-1.176962,1.820544,1.752375,-0.984534,0.363896,0.20947
2,-0.109484,-0.432774,-0.457649,0.793818,-0.268646,-1.83636,1.239086,-0.246383,-1.058145,-0.297376
3,1.750412,2.023606,1.688159,0.0068,-1.607661,0.184741,-2.619427,-0.357445,-1.473127,-0.190039
4,-0.224726,-0.711303,-0.220778,0.117124,1.536061,0.597538,0.348645,-0.939156,0.175915,0.236224


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)


In [13]:

from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()



In [14]:
logistic.fit(X_train, y_train)

In [15]:
y_pred = logistic.predict(X_test)
print(y_pred)

[0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 1
 1 1 1 0 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 0
 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 1 0 1 0 1 0 0 1 0 1 1 1 1
 1 1 1 1 0 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 1 1
 0 0 0 0 1 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 1 1 1 0 0 0 0 0 1 0 1 0 1 1 0
 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1
 0 1 0 1 1 0 0 0 1 1 0 1 1 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1 0 0 1 0 1 1 0 1 1
 1 1 1 0]


In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [20]:
score=accuracy_score(y_test, y_pred)
print(score)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8466666666666667
[[118  17]
 [ 29 136]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       135
           1       0.89      0.82      0.86       165

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300



#Hyperparameter Tuning and cross validation
    

#GRid Search CV, considers all the possible combinations of the hyperparameters and selects the best one


In [21]:
model = LogisticRegression()
penalty = ['l1', 'l2', 'elasticnet']
c_values=[100,10,1.0,0.1,0.01] #c_values shows the regularization strength. 100 means no regularization and 0.01 means a lot of regularization.
solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] #solver is the algorithm used to optimize the cost function



In [23]:
params=dict(penalty=penalty, C=c_values, solver=solver)

In [25]:
from sklearn.model_selection import StratifiedKFold

cv= StratifiedKFold() #stratified kfold is used to split the data into k folds. fold is a subset of the data

In [27]:
##gridsearchCV

from sklearn.model_selection import GridSearchCV

grid= GridSearchCV(estimator=model, param_grid=params, cv=cv, scoring='accuracy', n_jobs=-1) ##cv means cross validation
grid 


In [29]:
grid.fit(X_train, y_train)

200 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sriramsohan/Documents/projects/testing_models/pipeline_inf_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/sriramsohan/Documents/projects/testing_models/pipeline_inf_env/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/sriramsohan/Documents/projects/testing_models/pipeline_inf_env/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit


In [30]:
grid.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}

In [31]:
grid.best_score_

np.float64(0.8785714285714287)

In [33]:
score=accuracy_score(y_test, y_pred)
print(score)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8466666666666667
[[118  17]
 [ 29 136]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       135
           1       0.89      0.82      0.86       165

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300



##Randomized Search CV

In [34]:
from sklearn.model_selection import RandomizedSearchCV
randomcv = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

In [35]:
randomcv.fit(X_train, y_train)

30 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sriramsohan/Documents/projects/testing_models/pipeline_inf_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/sriramsohan/Documents/projects/testing_models/pipeline_inf_env/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/sriramsohan/Documents/projects/testing_models/pipeline_inf_env/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
   

In [37]:
randomcv.best_params_

{'solver': 'saga', 'penalty': 'l2', 'C': 0.01}

In [38]:
randomcv.best_score_

np.float64(0.8785714285714287)

In [39]:
ypred = randomcv.predict(X_test)

In [40]:
score=accuracy_score(y_test, ypred)
print(score)
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

0.8533333333333334
[[124  11]
 [ 33 132]]
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       135
           1       0.92      0.80      0.86       165

    accuracy                           0.85       300
   macro avg       0.86      0.86      0.85       300
weighted avg       0.86      0.85      0.85       300



In [42]:
##logistic regression with multiple classes
X,y =make_classification(n_samples=1000, n_features=10, n_informative=5, n_classes=3, random_state=42)

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)


In [44]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(multi_class='ovr')
logistic.fit(X_train, y_train)

y_pred = logistic.predict(X_test)



In [45]:
y_pred

array([0, 2, 1, 2, 2, 2, 0, 0, 1, 2, 1, 2, 2, 1, 1, 0, 2, 2, 1, 2, 0, 1,
       2, 1, 0, 0, 0, 1, 2, 2, 0, 2, 2, 2, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0,
       0, 2, 2, 0, 1, 1, 2, 0, 1, 2, 2, 2, 1, 2, 2, 1, 2, 0, 1, 0, 1, 0,
       0, 2, 0, 1, 2, 0, 2, 1, 2, 2, 1, 2, 2, 0, 0, 0, 0, 2, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1,
       2, 2, 0, 1, 2, 1, 1, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0,
       1, 1, 1, 2, 0, 0, 0, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2,
       2, 0, 0, 1, 0, 2, 1, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 2, 0, 0, 1,
       2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 0, 1, 2, 1, 0, 0, 2, 1, 1, 0, 2,
       2, 0, 1, 2, 0, 0, 0, 0, 1, 2, 0, 1, 1, 2, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 2, 1, 0, 1, 0, 1, 2, 2, 0, 1, 2, 2, 2, 1, 1, 0, 2, 1, 2, 0,
       0, 1, 0, 1, 0, 2, 2, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0])

In [46]:
score = accuracy_score(y_test, y_pred)
print(score)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6966666666666667
[[74  6 10]
 [10 74 12]
 [26 27 61]]
              precision    recall  f1-score   support

           0       0.67      0.82      0.74        90
           1       0.69      0.77      0.73        96
           2       0.73      0.54      0.62       114

    accuracy                           0.70       300
   macro avg       0.70      0.71      0.70       300
weighted avg       0.70      0.70      0.69       300

