In [4]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Import the dataset

In [6]:
pima = pd.read_csv("pima_indian_diabetes.csv")
pima.head()

Unnamed: 0,No_Times_Pregnant,Plasma_Glucose,Diastolic_BP,Triceps,Insulin,BMI,Age,Diabetes
0,1,89,66,23,94,28.1,21,0
1,0,137,40,35,168,43.1,33,1
2,3,78,50,32,88,31.0,26,1
3,2,197,70,45,543,30.5,53,1
4,1,189,60,23,846,30.1,59,1


### Check the columns present in the dataset

In [7]:
pima.columns

Index(['No_Times_Pregnant', 'Plasma_Glucose', 'Diastolic_BP', 'Triceps',
       'Insulin', 'BMI', 'Age', 'Diabetes'],
      dtype='object')

### Check the shape of the dataset

In [8]:
pima.shape

(784, 8)

### Check the descriptive statistics of the given dataset

In [9]:
pima.describe()

Unnamed: 0,No_Times_Pregnant,Plasma_Glucose,Diastolic_BP,Triceps,Insulin,BMI,Age,Diabetes
count,784.0,784.0,784.0,784.0,784.0,784.0,784.0,784.0
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,30.864796,0.331633
std,3.209373,30.841068,12.488109,10.509706,118.765777,7.02317,10.194261,0.4711
min,0.0,56.0,24.0,7.0,14.0,18.2,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,23.0,0.0
50%,2.0,119.0,70.0,29.0,125.5,33.2,27.0,0.0
75%,5.0,143.0,78.0,37.0,190.0,37.1,36.0,1.0
max,17.0,198.0,110.0,63.0,846.0,67.1,81.0,1.0


### Check the info of the dataset

In [10]:
pima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 784 entries, 0 to 783
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   No_Times_Pregnant  784 non-null    int64  
 1   Plasma_Glucose     784 non-null    int64  
 2   Diastolic_BP       784 non-null    int64  
 3   Triceps            784 non-null    int64  
 4   Insulin            784 non-null    int64  
 5   BMI                784 non-null    float64
 6   Age                784 non-null    int64  
 7   Diabetes           784 non-null    int64  
dtypes: float64(1), int64(7)
memory usage: 49.1 KB


### Check for the missing value present in the dataset

In [11]:
pima.isnull().sum()

No_Times_Pregnant    0
Plasma_Glucose       0
Diastolic_BP         0
Triceps              0
Insulin              0
BMI                  0
Age                  0
Diabetes             0
dtype: int64

In [12]:
### Checking the Diabetes Rate
diabetes = (sum(pima['Diabetes'])/len(pima['Diabetes'].index))*100
diabetes

33.16326530612245

### Test-Train Split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Putting feature variable to X
X = pima.drop(['Diabetes'], axis=1)

X.head()

Unnamed: 0,No_Times_Pregnant,Plasma_Glucose,Diastolic_BP,Triceps,Insulin,BMI,Age
0,1,89,66,23,94,28.1,21
1,0,137,40,35,168,43.1,33
2,3,78,50,32,88,31.0,26
3,2,197,70,45,543,30.5,53
4,1,189,60,23,846,30.1,59


In [15]:
# Putting response variable to y
y = pima['Diabetes']

y.head()

0    0
1    1
2    1
3    1
4    1
Name: Diabetes, dtype: int64

In [16]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 548 entries, 193 to 520
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   No_Times_Pregnant  548 non-null    int64  
 1   Plasma_Glucose     548 non-null    int64  
 2   Diastolic_BP       548 non-null    int64  
 3   Triceps            548 non-null    int64  
 4   Insulin            548 non-null    int64  
 5   BMI                548 non-null    float64
 6   Age                548 non-null    int64  
dtypes: float64(1), int64(6)
memory usage: 34.2 KB


###  Feature Scaling

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
X_train.head()

Unnamed: 0,No_Times_Pregnant,Plasma_Glucose,Diastolic_BP,Triceps,Insulin,BMI,Age
193,1,109,60,8,182,25.4,21
543,0,137,68,14,148,24.8,21
537,0,126,84,29,215,30.7,24
330,1,167,74,17,144,23.4,33
265,6,129,90,7,326,19.6,60


In [20]:
y_train.head()

193    0
543    0
537    0
330    1
265    0
Name: Diabetes, dtype: int64

In [21]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_train.head()

Unnamed: 0,No_Times_Pregnant,Plasma_Glucose,Diastolic_BP,Triceps,Insulin,BMI,Age
0,-0.688122,-0.421477,-0.853262,-2.055113,0.240151,-1.106727,-0.979643
1,-1.004739,0.492349,-0.218384,-1.475045,-0.058427,-1.19099,-0.979643
2,-1.004739,0.133346,1.051371,-0.024875,0.529948,-0.3624,-0.668248
3,-0.688122,1.471449,0.257774,-1.185011,-0.093554,-1.387605,0.265935
4,0.894963,0.231256,1.52753,-2.151791,1.504719,-1.921274,3.068486


In [22]:
X_test = pd.DataFrame(scaler.transform(X_test),columns = X_test.columns)
X_test.head()

Unnamed: 0,No_Times_Pregnant,Plasma_Glucose,Diastolic_BP,Triceps,Insulin,BMI,Age
0,-0.688122,1.014535,0.892652,1.231939,2.901013,1.027948,-0.772046
1,0.261729,-0.845754,-1.170701,-1.185011,-0.927818,-1.752747,-0.460652
2,1.844815,1.601995,3.114724,-0.508265,0.749491,1.702055,2.445697
3,-0.688122,-1.36794,0.257774,-1.765079,-0.831219,-0.460707,-0.875845
4,-0.371505,-0.747844,-0.853262,-1.185011,0.046953,0.466191,-0.979643


<a id=section602></a>
### Model Fit and Predict

<b>In sklearn.linear_model.LogisticRegression, C actually refers to the inverse regularization strength, 𝟏/λ 
- Interpretation: 
- C is high, lambda is low, less regularisation. Model is complex and will overfit
- C is low, lambda is high, high regularisation. Model is simple and will underfit</b>


<a id=section602></a>
### L2 (Ridge) Regularisation

In [23]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [34]:
degree = 1
poly = PolynomialFeatures(degree = degree, include_bias=False)

THRESHOLD = 0.65

#C_values = [10000, 1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
#lamdba = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

C_values = [10, 1, 0.5, 0.1, 0.05, 0.01, 0.001]
#lamdba = [0.1, 1, 2, 10, 20, 100, 1000]

df = pd.DataFrame()
for c in C_values:
    ridgeclf = LogisticRegression(C=c) # By default using L2 Regularisation (Ridge)
    
    pipe = Pipeline([('polynomial_features',poly), ('logistic_regression',ridgeclf)])
    pipe.fit(X_train, y_train)
    
    s = pd.Series(pipe.named_steps['logistic_regression'].coef_[0]).round(6)
    
    # predict using train data
    train_pred_prob = pipe.predict_proba(X_train)
    train_pred = np.where(train_pred_prob[:,1] > THRESHOLD, 1, 0)
    s["Train Accuracy"] = str(round(accuracy_score(y_train,train_pred) *100,2))+"%"
    
    # predict using test data
    test_pred_prob = pipe.predict_proba(X_test)
    test_pred = np.where(test_pred_prob[:,1] > THRESHOLD, 1, 0)
    s["Test Accuracy"] = str(round(accuracy_score(y_test,test_pred) *100,2))+"%"
    
    col = "   C="+str(c)
    df[col] = s
    
feature = pd.Series(poly.get_feature_names())
df.insert(0,'feature',feature)
print("\033[1m            L2 Regularisation(Ridge)    ", "Threshold:",THRESHOLD, "     Degree:",degree)
print()
print("              Over fitting----------------> Going towards------------> Under fitting \033[0m")
df

[1m            L2 Regularisation(Ridge)     Threshold: 0.65      Degree: 1

              Over fitting----------------> Going towards------------> Under fitting [0m


Unnamed: 0,feature,C=10,C=1,C=0.5,C=0.1,C=0.05,C=0.01,C=0.001
0,x0,0.188533,0.188399,0.188188,0.185295,0.180582,0.146442,0.047464
1,x1,1.20973,1.18616,1.16165,1.01003,0.884097,0.500661,0.11023
2,x2,-0.010757,-0.005585,-0.000288,0.030411,0.052472,0.088779,0.03881
3,x3,0.197832,0.19823,0.19847,0.196177,0.189622,0.144763,0.045015
4,x4,-0.065673,-0.054627,-0.043321,0.021837,0.067952,0.139311,0.056342
5,x5,0.370728,0.36249,0.354109,0.306072,0.270475,0.173623,0.049883
6,x6,0.43867,0.432984,0.427088,0.390483,0.359222,0.248404,0.071063
Train Accuracy,,78.1%,78.28%,78.28%,77.01%,76.28%,71.53%,66.79%
Test Accuracy,,79.66%,80.08%,80.08%,80.51%,79.66%,75.42%,66.95%


#### Observation
- <b>As C = 10, no weightage to regularisation(0.1). Model is very complex(overfitting). As C decreases, we are giving more weightage to regularisation. Model becomes simpler and fit better on test data. But then it starts underfitting.
- Coefficients become simpler. 
- C = 0.5 seems a sweet spot, Lambda is 20(with threshold as 0.65)
- You can play around with different thresholds and different degrees also</b>


<a id=section602></a>
### L1 (Lasso) Regularisation

In [35]:
degree = 1
poly = PolynomialFeatures(degree = degree, include_bias=False)

THRESHOLD = 0.65


C_values = [10, 1, 0.5, 0.1, 0.05, 0.01, 0.001]
#lamdba = [0.1, 1, 2, 10, 20, 100, 1000]

df = pd.DataFrame()
for c in C_values:
    lassoclf = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    
    pipe = Pipeline([('polynomial_features',poly), ('logistic_regression',lassoclf)])
    pipe.fit(X_train, y_train)
    feature_names = ['x']
    s = pd.Series(pipe.named_steps['logistic_regression'].coef_[0]).round(6)
    
    # predict using train data
    train_pred_prob = pipe.predict_proba(X_train)
    train_pred = np.where(train_pred_prob[:,1] > THRESHOLD, 1, 0)
    s["Train accuracy"] = str(round(accuracy_score(y_train,train_pred) *100,2))+"%"
    
    # predict using test data
    test_pred_prob = pipe.predict_proba(X_test)
    test_pred = np.where(test_pred_prob[:,1] > THRESHOLD, 1, 0)
    s["Test accuracy"] = str(round(accuracy_score(y_test,test_pred) *100,2))+"%"
    
    col = "   C="+str(c)
    df[col] = s
    
feature = pd.Series(poly.get_feature_names())
df.insert(0,'feature',feature)
print("\033[1m            L1 Regularisation(Lasso)    ", "Threshold:",THRESHOLD, "     Degree:",degree)
print()
print("              Over fitting----------------> Going towards------------> Under fitting \033[0m")
df

[1m            L1 Regularisation(Lasso)     Threshold: 0.65      Degree: 1

              Over fitting----------------> Going towards------------> Under fitting [0m


Unnamed: 0,feature,C=10,C=1,C=0.5,C=0.1,C=0.05,C=0.01,C=0.001
0,x0,0.187339,0.177221,0.166488,0.093294,0.016138,0,0
1,x1,1.20851,1.17631,1.14312,1.00483,0.879324,0.253773,0
2,x2,-0.008651,0,0,0,0,0,0
3,x3,0.197,0.189587,0.181636,0.120837,0.062743,0,0
4,x4,-0.064143,-0.039812,-0.013487,0,0,0,0
5,x5,0.368983,0.351109,0.335056,0.254081,0.176256,0,0
6,x6,0.438024,0.430329,0.424086,0.382501,0.34477,0,0
Train accuracy,,78.1%,78.1%,77.92%,77.19%,75.18%,66.79%,66.79%
Test accuracy,,79.66%,79.66%,79.24%,79.24%,78.81%,67.8%,66.95%


#### Observation
<b>As C = 10, no weightage to regularisation(0.1). Model is very complex(overfitting). As C decreases, we are giving more weightage to regularisation. Model becomes simpler and fit better on test data. But then it starts underfitting.
- Coefficients become simpler. 
- C = 0.5 seems a sweet spot, Lambda is 20(with threshold as 0.65)
- Also a few feature coefficients become zero.
- Lasso regression not only helps in reducing over-fitting but it can help us in feature selection</b>

### Summary
#### Regularization is really important!
#### It can make a big difference for getting good performance. You usually will want to tune the regularization strength when you build a classifier.
#### L2 and Ridge regularizations are equivalent. The same for L1 and Lasso.
#### Different Penalty options impact the coefficients differently. Where L2(Ridge) generally leads to smaller coefficients, L1(Lasso) results in sparse coefficient vectors with just a few higher value coefficients.