# Feature Selection on the Diabetes Dataset

In [6]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import (VarianceThreshold, SelectKBest, f_regression, mutual_info_regression, 
    RFE, RFECV)
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures

# Read the tab-separated diabetes file into a DataFrame and preview the first rows
DATA_FILE = 'diabetes.tab.txt'
df = pd.read_csv(DATA_FILE, sep='\t')
df.head()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [7]:
# One-hot encode SEX; drop_first=True leaves a single 0/1 indicator column (SEX_2).
# NOTE: this overwrites `df`, so the cell is meant to run once per data load —
# on a second run the SEX column no longer exists.
df = pd.get_dummies(
    df,
    columns=["SEX"],
    drop_first=True,
    dtype=int,
)
df.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,Y,SEX_2
0,59,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151,1
1,48,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75,0
2,72,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141,1
3,24,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206,0
4,50,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135,0


In [8]:
# Separate the response column (Y) from the predictor columns
features = df.drop(columns='Y')
target = df['Y']
features.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_2
0,59,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,1
1,48,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,0
2,72,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,1
3,24,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,0
4,50,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,0


In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=22,
)

In [10]:
# Initialize the scaler
scaler = StandardScaler()

# Scale every feature except the binary SEX_2 dummy column (female, per the
# original note). The column is excluded by name rather than by position so
# the cell does not silently depend on SEX_2 being the last column.
x_train_numeric = x_train.drop(columns='SEX_2')
x_test_numeric = x_test.drop(columns='SEX_2')

# Fit on the training split only, then apply the same statistics to the test
# split so no information from the test rows leaks into the scaling.
transformed_training_features = scaler.fit_transform(x_train_numeric)
transformed_testing_features = scaler.transform(x_test_numeric)

# Convert the scaled arrays into DataFrames, reusing the arrays computed above
# instead of running the transform a second time. The original row labels are
# kept so the frames stay aligned with y_train / y_test.
X_train_transformed = pd.DataFrame(transformed_training_features,
                                   columns=x_train_numeric.columns,
                                   index=x_train.index)
X_test_transformed = pd.DataFrame(transformed_testing_features,
                                  columns=x_test_numeric.columns,
                                  index=x_test.index)

# Add the binary column back in, taken from the matching split
X_train_transformed['SEX_2'] = x_train['SEX_2']
X_test_transformed['SEX_2'] = x_test['SEX_2']

X_train_transformed.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_2
400,-0.5172,0.958061,1.936805,-0.372416,-0.73506,1.605548,-0.837408,-0.727075,-0.220574,0
409,-1.126026,-0.25239,1.199326,1.209708,1.569708,-0.894306,1.551319,0.627438,-0.132593,1
266,-1.126026,-1.32581,0.240604,-0.167326,-0.761705,2.772146,-1.63365,-1.815618,0.043367,1
193,0.167729,-0.115358,-1.381849,1.502694,1.802849,-0.894306,1.551319,0.8143,0.395288,1
130,-1.354335,2.031481,2.231797,-0.050131,-0.082265,-0.144349,-0.041166,0.463531,1.53903,1


In [11]:
# Expand the scaled features with all degree-2 terms (squares and pairwise
# interactions); no constant bias column is added.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# Fit the expansion on the training split and apply it unchanged to the test split
poly_train_values = poly.fit_transform(X_train_transformed)
poly_test_values = poly.transform(X_test_transformed)

# Wrap the arrays in DataFrames labelled with the generated feature names.
# No index is passed, so both frames get a fresh 0..n-1 index instead of the
# original row labels.
X_poly_train = pd.DataFrame(
    poly_train_values,
    columns=poly.get_feature_names_out(X_train_transformed.columns),
)
X_poly_test = pd.DataFrame(
    poly_test_values,
    columns=poly.get_feature_names_out(X_test_transformed.columns),
)
X_poly_train.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_2,...,S4^2,S4 S5,S4 S6,S4 SEX_2,S5^2,S5 S6,S5 SEX_2,S6^2,S6 SEX_2,SEX_2^2
0,-0.5172,0.958061,1.936805,-0.372416,-0.73506,1.605548,-0.837408,-0.727075,-0.220574,0.0,...,0.701252,0.608858,0.18471,-0.0,0.528638,0.160373,-0.0,0.048653,-0.0,0.0
1,-1.126026,-0.25239,1.199326,1.209708,1.569708,-0.894306,1.551319,0.627438,-0.132593,1.0,...,2.406591,0.973357,-0.205695,1.551319,0.393679,-0.083194,0.627438,0.017581,-0.132593,1.0
2,-1.126026,-1.32581,0.240604,-0.167326,-0.761705,2.772146,-1.63365,-1.815618,0.043367,1.0,...,2.668813,2.966084,-0.070846,-1.63365,3.296468,-0.078738,-1.815618,0.001881,0.043367,1.0
3,0.167729,-0.115358,-1.381849,1.502694,1.802849,-0.894306,1.551319,0.8143,0.395288,1.0,...,2.406591,1.263239,0.613217,1.551319,0.663085,0.321883,0.8143,0.156252,0.395288,1.0
4,-1.354335,2.031481,2.231797,-0.050131,-0.082265,-0.144349,-0.041166,0.463531,1.53903,1.0,...,0.001695,-0.019081,-0.063355,-0.041166,0.214861,0.713387,0.463531,2.368613,1.53903,1.0


In [12]:
def run_model(model, X_train, X_test, y_train, y_test, display=True):
    """Evaluate an already-fitted model on the training and testing splits.

    For each split the model's ``score`` (reported as R^2) and the root mean
    squared error of its predictions are computed.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    X_train, X_test : feature data for the training / testing split.
    y_train, y_test : target values for the training / testing split.
    display : bool, default True
        When truthy, print the four metrics.

    Returns
    -------
    tuple
        ``(test_r2, test_rmse)`` — only the testing metrics are returned.
    """

    def _r2_and_rmse(X, y):
        # Score first, then predict, mirroring the per-split call order.
        r2 = model.score(X, y)
        predictions = model.predict(X)
        rmse = np.sqrt(mean_squared_error(y, predictions))
        return r2, rmse

    train_r2, train_rmse = _r2_and_rmse(X_train, y_train)
    test_r2, test_rmse = _r2_and_rmse(X_test, y_test)

    if display:
        print('Training R^2:', train_r2)
        print('Training Root Mean Squared Error:', train_rmse)
        print('\n----------------\n')
        print('Testing R^2:', test_r2)
        print('Testing Root Mean Squared Error:', test_rmse)

    return test_r2, test_rmse

In [13]:
# Baseline: ordinary least squares on the full set of polynomial features
lr_poly = LinearRegression().fit(X_poly_train, y_train)

# Report train/test metrics and keep the test scores for later comparisons
poly_r2, poly_rmse = run_model(
    model=lr_poly,
    X_train=X_poly_train,
    X_test=X_poly_test,
    y_train=y_train,
    y_test=y_test,
)

Training R^2: 0.6227142853140841
Training Root Mean Squared Error: 47.09711676445004

----------------

Testing R^2: 0.24472529845463808
Testing Root Mean Squared Error: 68.04661040353594


In [14]:
# Filter method: variance threshold
# With the default threshold, only features whose variance is zero are removed.
selector = VarianceThreshold()
reduced_train_values = selector.fit_transform(X_poly_train)
reduced_test_values = selector.transform(X_poly_test)

# Label the reduced arrays with the columns the selector actually kept.
# Reusing every original column name only works when nothing is dropped and
# raises a shape mismatch as soon as a feature is removed.
kept_columns = X_poly_train.columns[selector.get_support()]
reduced_feature_train = pd.DataFrame(reduced_train_values, columns=kept_columns, index=X_poly_train.index)
reduced_feature_test = pd.DataFrame(reduced_test_values, columns=kept_columns, index=X_poly_test.index)

# Refit a linear model on the reduced feature set and compare with the baseline
lr = LinearRegression()
lr.fit(reduced_feature_train, y_train)
reduced_r2, reduced_rmse = run_model(lr, reduced_feature_train, reduced_feature_test, y_train, y_test)

print('\n----------------\n')
print(f"{reduced_feature_train.shape[1]} out of {X_poly_train.shape[1]} features used")
print('Baseline R-Squared:', round(poly_r2, 2))
print('Reduced R-Squared: ', round(reduced_r2, 2))

Training R^2: 0.6227142853140812
Training Root Mean Squared Error: 47.09711676445023

----------------

Testing R^2: 0.24472529845467272
Testing Root Mean Squared Error: 68.04661040353437

----------------

65 out of 65 features used
Baseline R-Squared: 0.24
Reduced R-Squared:  0.24


In [15]:
# Preview ten evenly spaced candidate thresholds spanning the 10th to 90th
# percentile of the per-feature variances recorded by the fitted selector
variance_p10 = np.percentile(selector.variances_, 10)
variance_p90 = np.percentile(selector.variances_, 90)
np.linspace(variance_p10, variance_p90, num=10)

array([0.4583632 , 0.62566803, 0.79297287, 0.9602777 , 1.12758253,
       1.29488736, 1.46219219, 1.62949702, 1.79680185, 1.96410668])

In [16]:
# Candidate thresholds: ten evenly spaced values between the 10th and 90th
# percentile of the per-feature variances. The grid is built before the loop
# because `selector` is rebound on every iteration below.
feature_variances = selector.variances_
threshold_ranges = np.linspace(
    np.percentile(feature_variances, 10),
    np.percentile(feature_variances, 90),
    10,
)

# Test-set R^2 obtained at each threshold, in the same order as threshold_ranges
reduced_r2s = []
for thresh in threshold_ranges:
    # Keep only the polynomial features whose training variance exceeds the threshold
    selector = VarianceThreshold(threshold=thresh)
    reduced_feature_train = selector.fit_transform(X_poly_train)
    reduced_feature_test = selector.transform(X_poly_test)

    # Refit a plain linear model on the surviving features and score it quietly
    lr = LinearRegression().fit(reduced_feature_train, y_train)
    reduced_r2, reduced_rmse = run_model(
        lr, reduced_feature_train, reduced_feature_test, y_train, y_test, display=False
    )
    reduced_r2s.append(reduced_r2)

    # Per-threshold report, compared against the full polynomial baseline
    print('Variance threshold:', thresh)
    print(f"{reduced_feature_train.shape[1]} out of {X_poly_train.shape[1]} features used")
    print('Baseline R-Squared:', round(poly_r2, 2))
    print('Reduced R-Squared: ', round(reduced_r2, 2))
    print('\n--------------------------------------------------------------------\n')

Variance threshold: 0.45836320383520807
58 out of 65 features used
Baseline R-Squared: 0.24
Reduced R-Squared:  0.21

--------------------------------------------------------------------

Variance threshold: 0.6256680347792201
54 out of 65 features used
Baseline R-Squared: 0.24
Reduced R-Squared:  0.25

--------------------------------------------------------------------

Variance threshold: 0.7929728657232322
53 out of 65 features used
Baseline R-Squared: 0.24
Reduced R-Squared:  0.27

--------------------------------------------------------------------

Variance threshold: 0.9602776966672442
38 out of 65 features used
Baseline R-Squared: 0.24
Reduced R-Squared:  0.42

--------------------------------------------------------------------

Variance threshold: 1.1275825276112563
15 out of 65 features used
Baseline R-Squared: 0.24
Reduced R-Squared:  0.07

--------------------------------------------------------------------

Variance threshold: 1.2948873585552683
11 out of 65 features use