In [2]:
# Below as comments in the next cell results are explained
# Import necessary libraries
from sklearn.datasets import load_diabetes
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


# Load the Dataset and Convert to DataFrame
data = load_diabetes()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['progression'] = data.target
df.to_csv("diabetes.csv", index=False)  # Save to CSV for completeness

# Preprocess the Data
# Load the data from CSV
df = pd.read_csv("diabetes.csv")

# Check for missing values
print("Missing values in dataset:\n", df.isnull().sum())

# Standardize the features
X = df.drop('progression', axis=1)  # Features
y = df['progression']  # Target variable

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Model Selection
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=5)
}

# Train and Evaluate Each Model on the Test Set
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=5)

# Train and evaluate each model
test_results = {}

for name, model in models.items():
    # Train the model on the training set
    model.fit(X_train, y_train)
    # Predict on the test set
    y_pred = model.predict(X_test)
    # Evaluate Mean Squared Error
    test_mse = mean_squared_error(y_test, y_pred)
    test_results[name] = test_mse
    print(f"{name} - Test set Mean Squared Error: {test_mse}")

# Select the best model based on test set performance
best_model_name = min(test_results, key=test_results.get)
best_model = models[best_model_name]
print(f"\nBest model based on test set: {best_model_name}")


Missing values in dataset:
 age            0
sex            0
bmi            0
bp             0
s1             0
s2             0
s3             0
s4             0
s5             0
s6             0
progression    0
dtype: int64
Linear Regression - Test set Mean Squared Error: 2981.5873043126116
Ridge Regression - Test set Mean Squared Error: 2980.976193653905
Random Forest Regressor - Test set Mean Squared Error: 2991.539735955056

Best model based on test set: Ridge Regression


In [None]:
# Missing values in dataset:
#  age            0
# sex            0
# bmi            0
# bp             0
# s1             0
# s2             0
# s3             0
# s4             0
# s5             0
# s6             0
# progression    0
# dtype: int64
# As everything is 0 means no emepty or incomplete data field

# Performance of each model as Mean Squared Error meaning sum of (predicted value - actual value)^2 divided by no of times predicted:
# Linear Regression - Test set Mean Squared Error: 2981.5873043126107
# Ridge Regression - Test set Mean Squared Error: 2980.9761936539035
# Random Forest Regressor - Test set Mean Squared Error: 2991.539735955056

# Best model based on test set: Ridge Regression

# The best model to use is (based on this test) is Ridge Regression