In [1]:
import numpy as np
from sklearn import datasets
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm
from sklearn.svm import SVC, SVR
from sklearn.dummy import DummyRegressor

Helper functions for training and scoring models

In [2]:
def score_model(model,X_test,Y_test):
    """Score a fitted model on held-out data and print the metrics.

    Parameters
    ----------
    model
        Any fitted estimator exposing ``predict(X)``.
    X_test
        Feature data passed unchanged to ``model.predict``.
    Y_test
        True target values to compare the predictions against.

    Returns
    -------
    tuple
        ``(R2_score, RMSE)`` where ``RMSE`` is the square root of the mean
        squared error, i.e. the error expressed in the units of the target.
    """
    predictions = model.predict(X_test)

    r2 = metrics.r2_score(Y_test, predictions)
    print("R2_score:", r2)

    # The reported number is sqrt(MSE), so it is labeled RMSE; the previous
    # "MSE:" label understated what the value actually is.
    rmse = np.sqrt(metrics.mean_squared_error(Y_test, predictions))
    print("RMSE:", rmse)

    return r2, rmse

def train_model(model,X_train,Y_train):
    """Fit ``model`` on the training data and hand the same object back.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)``; it is fitted in place.
    X_train
        Training features, forwarded to ``model.fit``.
    Y_train
        Training targets, forwarded to ``model.fit``.

    Returns
    -------
    The ``model`` object that was passed in (whatever ``fit`` itself returns
    is ignored).
    """
    fit = model.fit
    fit(X_train, Y_train)
    return model

In [3]:
# Load the diabetes dataset as a (features, target) pair of arrays.
X, Y = datasets.load_diabetes(return_X_y=True)

# Column labels for the ten feature columns, in the order they appear in X.
feature_columns = ['age','sex','bmi','bp','s1_tc','s2_ldl','s3_hdl','s4_tch','s5_ltg','s6_glu']
full_database = pd.DataFrame(X, columns=feature_columns)
full_database['disease_progression'] = Y

# Separate the predictors from the target column.
x = full_database.drop(columns='disease_progression')
y = full_database['disease_progression']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=100,
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))
print("Total:", len(X))

Train size: 309
Test size: 133
Total: 442


In [4]:
print("Linear Regression:")

# Fit an ordinary linear regression on all features, then score it on the test split.
model_LR = train_model(LinearRegression(), X_train, Y_train)
linear_scores = score_model(model_LR, X_test, Y_test)
R2_LR, MSE_LR = linear_scores  # second value is sqrt(MSE), as returned by score_model

Linear Regression:
R2_score: 0.4929329680070459
MSE: 51.83774341914194


In [5]:
print("Linear Regression without s1 and s4:")

# Same model as the full linear regression, but with two feature columns removed
# from both the training and the test frames.
dropped_features = ['s1_tc', 's4_tch']
X_train_reduced = X_train.drop(columns=dropped_features)
X_test_reduced = X_test.drop(columns=dropped_features)

model_LR_2 = train_model(LinearRegression(), X_train_reduced, Y_train)
R2_LR_2, MSE_LR_2 = score_model(model_LR_2, X_test_reduced, Y_test)

Linear Regression without s1 and s4:
R2_score: 0.49419934639731344
MSE: 51.772971670702425


In [7]:
print("Ordinary Least Squares (statsmodels):")

# Add a constant column to the design matrix and tell OLS it is there
# (hasconst=True); it shows up as the `const` row of the summary.
X_train_plus_constant = sm.add_constant(X_train)
ols_spec = sm.OLS(endog=Y_train, exog=X_train_plus_constant, hasconst=True)
model_OLS = ols_spec.fit()

# Last expression of the cell: rendered as the regression summary table.
model_OLS.summary()

Ordinary Least Squares (statsmodels):


0,1,2,3
Dep. Variable:,disease_progression,R-squared:,0.518
Model:,OLS,Adj. R-squared:,0.502
Method:,Least Squares,F-statistic:,32.01
Date:,"Sun, 13 Mar 2022",Prob (F-statistic):,9.909999999999999e-42
Time:,15:50:00,Log-Likelihood:,-1673.8
No. Observations:,309,AIC:,3370.0
Df Residuals:,298,BIC:,3411.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,153.7556,3.166,48.569,0.000,147.526,159.986
age,-25.0226,74.035,-0.338,0.736,-170.719,120.674
sex,-205.0057,75.173,-2.727,0.007,-352.943,-57.069
bmi,572.7009,80.247,7.137,0.000,414.778,730.624
bp,308.4990,76.720,4.021,0.000,157.517,459.481
s1_tc,-1000.9232,514.684,-1.945,0.053,-2013.799,11.952
s2_ldl,606.5425,428.025,1.417,0.158,-235.791,1448.876
s3_hdl,207.3958,251.698,0.824,0.411,-287.934,702.726
s4_tch,135.2240,192.534,0.702,0.483,-243.675,514.123

0,1,2,3
Omnibus:,1.316,Durbin-Watson:,2.019
Prob(Omnibus):,0.518,Jarque-Bera (JB):,1.287
Skew:,0.059,Prob(JB):,0.525
Kurtosis:,2.707,Cond. No.,230.0


In [8]:
print("SVC model:")

# NOTE(review): SVC is scikit-learn's support-vector *classifier*, so every
# distinct target value is treated as its own class. Confirm it is kept on
# purpose as a comparison point; SVR is the regression counterpart.
#
# Use the default constructor so this run differs from the
# "without s1 and s4" SVC run only in the feature set, not in solver settings
# (this cell previously capped the solver with max_iter=1000).
model_SVC = SVC()
model_SVC = train_model(model_SVC,X_train,Y_train)
R2_SVC,MSE_SVC = score_model(model_SVC,X_test,Y_test)

SVC model:
R2_score: 0.09173594363744875
MSE: 69.37763068235859


In [9]:
print("SVC model without s1 and s4:")

# NOTE(review): SVC is a classifier applied here to the disease_progression
# target; confirm this is an intentional comparison point.
svc_dropped_features = ['s1_tc', 's4_tch']
model_SVC_2 = train_model(
    SVC(),
    X_train.drop(columns=svc_dropped_features),
    Y_train,
)
R2_SVC_2, MSE_SVC_2 = score_model(
    model_SVC_2,
    X_test.drop(columns=svc_dropped_features),
    Y_test,
)

SVC model without s1 and s4:
R2_score: 0.20325372087558424
MSE: 64.97905966916359


In [10]:
print("SVR model:")

# Support-vector regression with default hyperparameters on all features.
model_SVR = train_model(SVR(), X_train, Y_train)
svr_scores = score_model(model_SVR, X_test, Y_test)
R2_SVR, MSE_SVR = svr_scores

SVR model:
R2_score: 0.18511210145620693
MSE: 65.7146704730941


In [11]:
print("DummyRegressor:")

# Baseline built with the "median" strategy; it gives the other models a
# reference score to beat.
dummy = train_model(DummyRegressor(strategy="median"), X_train, Y_train)
dummy_scores = score_model(dummy, X_test, Y_test)
R2_dummy, MSE_dummy = dummy_scores

DummyRegressor:
R2_score: -1.7932367515793857e-05
MSE: 72.79763897683429


In [12]:
print("Decision Tree Regressor:")

# Decision tree with a fixed random_state so repeated runs give the same tree.
model_tree = train_model(DecisionTreeRegressor(random_state=100), X_train, Y_train)
tree_scores = score_model(model_tree, X_test, Y_test)
R2_tree, MSE_tree = tree_scores

Decision Tree Regressor:
R2_score: -0.0004520855460765638
MSE: 72.81343964187657


In [13]:
print("Random Forest Regressor:")

# Random forest with a fixed random_state so repeated runs are reproducible.
model_forest = train_model(RandomForestRegressor(random_state=100), X_train, Y_train)
forest_scores = score_model(model_forest, X_test, Y_test)
R2_forest, MSE_forest = forest_scores

Random Forest Regressor:
R2_score: 0.4451330652545733
MSE: 54.22603162009145
