In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load the diabetes dataset as the pair (diabetes_X, diabetes_y).
# `datasets` is imported together with the other dependencies in the first cell.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

In [3]:
# Column labels for the ten feature columns of diabetes_X.
FEATURE_NAMES = ["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"]

# Build one frame holding the features plus the target as the last column.
diabetes = pd.DataFrame(diabetes_X, columns=FEATURE_NAMES)
# Assign the target array directly; wrapping it in a one-column DataFrame is
# unnecessary and makes the assignment depend on index alignment.
diabetes["dis_prog"] = diabetes_y

In [4]:
# Split the frame into the feature matrix (every column except the target,
# which is the last one) and the target series.
y = diabetes["dis_prog"]
X = diabetes.drop(columns="dis_prog")

In [14]:
# Cross-validate a linear regression on polynomial expansions of X, one
# expansion per degree from 0 to 9, and collect the mean R2, MAE and MAPE
# over the folds for each degree.
degrees = []
r2_scores = []
mae_scores = []
mape_scores = []

# scikit-learn scorers follow a "greater is better" convention, so the error
# metrics are only available as negated scores ('neg_*'); they are sign-flipped
# below so that lower MAE / MAPE means a better model.
scoring = ['r2', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error']

for degree in range(10):
    # Creating the polynomial feature expansion for this degree
    poly_features = PolynomialFeatures(degree=degree)
    X_poly = poly_features.fit_transform(X)

    # One 10-fold cross-validation pass scores all three metrics on the same
    # fitted models instead of refitting the model once per metric.
    model = LinearRegression()
    cv_results = cross_validate(model, X_poly, y, cv=10, scoring=scoring)

    # Named `r2` (not `r2_score`) so the function imported from
    # sklearn.metrics is not shadowed by an array of fold scores.
    r2 = cv_results['test_r2']
    mae = -cv_results['test_neg_mean_absolute_error']
    mape = -cv_results['test_neg_mean_absolute_percentage_error']

    degrees.append(degree)
    r2_scores.append(np.mean(r2))
    mae_scores.append(np.mean(mae))
    mape_scores.append(np.mean(mape))

# One row per degree; the 'R' column holds the mean R2 score.
results_df = pd.DataFrame({'Degree': degrees,'R' : r2_scores,  'MAE' : mae_scores,'MAPE': mape_scores})

#Printing the table of cross validation
print(results_df)

   Degree          R         MAE      MAPE
0       0  -0.039767   65.948459  0.622416
1       1   0.461960   44.223084  0.394551
2       2   0.379861   45.873376  0.392139
3       3 -37.884452  208.731680  1.706457
4       4 -72.819234  329.077072  2.704894
5       5 -65.606865  312.433209  2.567480
6       6 -65.480725  312.239540  2.565924
7       7 -65.478739  312.235990  2.565909
8       8 -65.478613  312.235823  2.565908
9       9 -65.477435  312.235857  2.565900


# My inferences on Cross-validation results

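The R2 values are positive only for degrees 1 and 2 (0.46 and 0.38), which indicates a moderate fit; at degree 0 the R2 is slightly negative (-0.04) because a constant model explains none of the variance. From degree 3 onwards the R2 becomes strongly negative (roughly -38 to -73), which means the models overfit the training folds and do not generalise to the held-out data.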
The MAE drops from 65.95 at degree 0 to its minimum of 44.22 at degree 1, rises slightly at degree 2 (45.87), and then increases sharply from degree 3 onwards, which indicates overfitting.
The MAPE values decrease across degrees 0, 1 and 2 (0.62, 0.39 and 0.39) but increase significantly from degree 3 onwards, which shows a much larger percentage difference between the predicted and actual values.

# Calculating Mean and Standard Deviation of MAE, MAPE, R2 

In [19]:
# Mean and sample standard deviation of each metric column of results_df,
# taken over the rows of the table (one row per polynomial degree).
r2_column = results_df['R']
mae_column = results_df['MAE']
mape_column = results_df['MAPE']

mean_r_squared, standard_dev_r2_score = r2_column.mean(), r2_column.std()
mean_mae, standard_deviation_mae = mae_column.mean(), mae_column.std()
mean_mape, standard_mape = mape_column.mean(), mape_column.std()

# Printing the Mean and Std of R2, MAE and MAPE

In [17]:
# Emit the six summary statistics, one per line, rounded to four decimals.
print(
    f"Mean R-Squared: {mean_r_squared :.4f}",
    f"Standard Deviation of R-Squared: {standard_dev_r2_score:.4f}",
    f"Mean MAE: {mean_mae:.4f}",
    f"Standard Deviation of MAE: {standard_deviation_mae:.4f}",
    f"Mean MAPE: {mean_mape:.4f}",
    f"Standard Deviation of MAPE: {standard_mape:.4f}",
    sep="\n",
)

Mean R-Squared: -43.7424
Standard Deviation of R-Squared: 31.7253
Mean MAE: 225.5234
Standard Deviation of MAE: 124.3845
Mean MAPE: 1.8652
Standard Deviation of MAPE: 1.0036


In [18]:
# Row label of the best degree under each metric: the highest R2 and the
# lowest MAE / MAPE.
r2_best_model = results_df['R'].idxmax()
mae_best_model = results_df['MAE'].idxmin()
mape_best_model = results_df['MAPE'].idxmin()

# Print the full results row of each winner under its own heading.
best_rows = (
    ("\nBest Model based on R2:", r2_best_model),
    ("\nBest Model based on MAE:", mae_best_model),
    ("\nBest Model based on MAPE:", mape_best_model),
)
for heading, row_label in best_rows:
    print(heading)
    print(results_df.loc[row_label])


Best Model based on R2:
Degree     1.000000
R          0.461960
MAE       44.223084
MAPE       0.394551
Name: 1, dtype: float64

Best Model based on MAE:
Degree     1.000000
R          0.461960
MAE       44.223084
MAPE       0.394551
Name: 1, dtype: float64

Best Model based on MAPE:
Degree     2.000000
R          0.379861
MAE       45.873376
MAPE       0.392139
Name: 2, dtype: float64


# Inferences 

## 1) The best polynomial degree is 1 because it gives the highest cross-validated R2 (0.462) and the lowest MAE (44.22).
## 2) The average difference between the predicted and actual values is shown by the Mean Absolute Error, which is 44.22.
## 3) The average percentage difference between the predicted and actual values is shown by the Mean Absolute Percentage Error, which is 0.394551, i.e. about 39%.
## 4) The MAPE of the degree 1 model (0.3946) and the degree 2 model (0.3921) differ by only about 0.0024, which we can neglect, so overall the model with polynomial degree 1 is the best compared to the other models.