In [311]:
# Importing libraries
from sklearn.datasets import load_diabetes
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Load the diabetes dataset
diabetes=load_diabetes()

In [313]:
# Print the keys of the dataset to understand its structure
print(diabetes.keys())

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])


In [315]:
# Create a DataFrame from the diabetes dataset for easier handling and exploration
#  Display the first few rows
df=pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df.head().T

Unnamed: 0,0,1,2,3,4
age,0.038076,-0.001882,0.085299,-0.089063,0.005383
sex,0.05068,-0.044642,0.05068,-0.044642,-0.044642
bmi,0.061696,-0.051474,0.044451,-0.011595,-0.036385
bp,0.021872,-0.026328,-0.00567,-0.036656,0.021872
s1,-0.044223,-0.008449,-0.045599,0.012191,0.003935
s2,-0.034821,-0.019163,-0.034194,0.024991,0.015596
s3,-0.043401,0.074412,-0.032356,-0.036038,0.008142
s4,-0.002592,-0.039493,-0.002592,0.034309,-0.002592
s5,0.019907,-0.068332,0.002861,0.022688,-0.031988
s6,-0.017646,-0.092204,-0.02593,-0.009362,-0.046641


In [317]:
# Define a list of regressors (models) to be evaluated
# Note: LogisticRegression is typically used for classification, not regression
regressors = [LinearRegression, Lasso, Ridge, LogisticRegression]
models = [regressor() for regressor in regressors]

# Print the models to confirm their instantiation
models

[LinearRegression(), Lasso(), Ridge(), LogisticRegression()]

In [319]:
# Split the dataset into features (X) and target variable (y)
X, y = diabetes.data, diabetes.target

In [320]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.3)
print('We are using', len(X_train),'samples for training the', len(models),'models and',len(y_test),'samples for testing them later.')

We are using 309 samples for training the 4 models and 133 samples for testing them later.


In [323]:
# Normalize the feature data
X_train_scaled = preprocessing.normalize(X_train)
X_test_scaled = preprocessing.normalize(X_test)

In [325]:
mae = []
mse = []
r_sq = []

# Iterate over each model, fit it to the training data, and evaluate its performance
for m in models:
    reg = m.fit(X_train_scaled, y_train)
    y_pred_2 = reg.predict(X_test_scaled)
    mse.append(mean_squared_error(y_test, y_pred_2))
    mae.append(mean_absolute_error(y_test,y_pred_2))
    r_sq.append(r2_score(y_test,y_pred_2))

In [302]:
# Print the performance metrics for each model
for i,m in enumerate(models):
    print(f"{str(m)}") 
    print(f"MAE: {mae[i]}, MSE: {mse[i]}, R_SQUARED: {r_sq[i]}\n")

LinearRegression()
MAE: 40.736898019933435, MSE: 2720.386759603146, R_SQUARED: 0.49606675284241863

Lasso()
MAE: 40.900624425802896, MSE: 2687.6607682381223, R_SQUARED: 0.5021290213918486

Ridge()
MAE: 40.667764498676306, MSE: 2712.643203305392, R_SQUARED: 0.4975011942709764

LogisticRegression()
MAE: 56.954887218045116, MSE: 5291.736842105263, R_SQUARED: 0.019741542068624884

