Loading Libraries

In [62]:
import numpy as np
from sklearn import datasets
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm
from sklearn.svm import SVC, SVR
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import RFE
import pandas_profiling


Loading Dataset

In [36]:
# Fetch the diabetes data as raw arrays: X holds the predictors, Y the target.
X, Y = datasets.load_diabetes(return_X_y=True)

# Wrap everything in one labelled frame; the target is appended as the last column.
full_database = pd.DataFrame(
    X,
    columns=[
        'age', 'sex', 'bmi', 'bp',
        's1_tc', 's2_ldl', 's3_hdl', 's4_tch', 's5_ltg', 's6_glu',
    ],
).assign(disease_progression=Y)

# Separate the target from the predictors again for modelling.
y = full_database['disease_progression']
x = full_database.drop(columns='disease_progression')

# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=100
)


In [37]:
def score_model(model,X_test,Y_test):
    """Score a fitted model on held-out data and print the metrics.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(X_test)``.
    X_test : array-like
        Feature matrix handed to ``model.predict``.
    Y_test : array-like
        True target values to compare the predictions against.

    Returns
    -------
    tuple
        ``(R2_score, RMSE)``. The second element is the square root of the
        mean squared error (RMSE), not the raw MSE.
    """
    Y_predict = model.predict(X_test)

    R2_score = metrics.r2_score(Y_test, Y_predict)
    print("R2_score:", R2_score)

    # The value reported here is sqrt(MSE), so it is labelled RMSE rather
    # than MSE to avoid readers misjudging the error scale.
    RMSE = np.sqrt(metrics.mean_squared_error(Y_test, Y_predict))
    print("RMSE:", RMSE)

    return R2_score, RMSE

def train_model(model,X_train,Y_train):
    """Fit ``model`` on the given training data and hand the same object back.

    The model's ``fit`` method is called for its side effect; whatever it
    returns is ignored and the ``model`` argument itself is returned.
    """
    fitted = model
    fitted.fit(X_train, Y_train)
    return fitted

Linear Regression

In [64]:
# Recursive feature elimination driven by a linear regression: keep 7 features,
# removing one per iteration. The selector is fitted on the training split only.
select = RFE(
    estimator=LinearRegression(),
    n_features_to_select=7,
    step=1,
).fit(X_train, Y_train)

# Apply the same fitted column mask to both splits.
X_train_transform, X_test_transform = (
    select.transform(X_train),
    select.transform(X_test),
)

# Boolean mask over the input columns: True marks a retained feature.
print(select.support_)

[False  True  True  True  True  True False  True  True False]


In [65]:
# Build a pandas_profiling report over the full frame (predictors + target).
# As the last expression of the cell, the report object is what gets displayed.
# NOTE(review): the recorded run below was interrupted (KeyboardInterrupt) at 45%,
# so the saved notebook has no rendered report for this cell.
pandas_profiling.ProfileReport(full_database)

Summarize dataset:  45%|████▌     | 55/122 [00:06<00:08,  8.11it/s, scatter s6_glu, s1_tc]              


KeyboardInterrupt: 

In [66]:
# Fit a plain linear regression on the RFE-selected columns and score it on
# the matching test columns. MSE_LR receives the second value returned by
# score_model, which is the square root of the mean squared error.
model_LR = train_model(LinearRegression(), X_train_transform, Y_train)
R2_LR, MSE_LR = score_model(model_LR, X_test_transform, Y_test)

R2_score: 0.509217541737952
MSE: 51.0688896531946


Ordinary Least Squares (OLS)

In [82]:
# sm.OLS does not add an intercept on its own, so a constant column is added
# to the design matrices explicitly.
# has_constant='add' forces the column onto both splits; the default ('skip')
# silently omits it when a split already contains a constant column, which
# would leave the train and test matrices with different layouts.
X_train_plus_constant = sm.add_constant(X_train, has_constant='add')
X_test_plus_constant = sm.add_constant(X_test, has_constant='add')

# hasconst=True tells statsmodels the intercept is already part of the matrix.
model_OLS = sm.OLS(Y_train, X_train_plus_constant, hasconst=True).fit()

# Score on the held-out split; MSE_OLS receives the second value returned by
# score_model, which is the square root of the mean squared error.
R2_OLS, MSE_OLS = score_model(model_OLS, X_test_plus_constant, Y_test)

R2_score: 0.5041759376283341
MSE: 51.33052418788047


In [84]:
# Same OLS fit, this time on the RFE-selected columns only.
# NOTE: this cell reuses the variable names of the all-features OLS cell, so
# the earlier design matrices, model and scores are overwritten.
# has_constant='add' forces the intercept column onto both splits; the default
# ('skip') silently omits it when a split already contains a constant column.
X_train_plus_constant = sm.add_constant(X_train_transform, has_constant='add')
X_test_plus_constant = sm.add_constant(X_test_transform, has_constant='add')

# hasconst=True tells statsmodels the intercept is already part of the matrix.
model_OLS = sm.OLS(Y_train, X_train_plus_constant, hasconst=True).fit()

# Score on the held-out split; MSE_OLS receives the second value returned by
# score_model, which is the square root of the mean squared error.
R2_OLS, MSE_OLS = score_model(model_OLS, X_test_plus_constant, Y_test)

R2_score: 0.5092175417379519
MSE: 51.0688896531946


Random Forest Regressor

In [85]:
# Random forest on the full (unselected) feature set; random_state fixes the
# estimator's seed. MSE_forest receives the second value returned by
# score_model, which is the square root of the mean squared error.
model_forest = train_model(
    RandomForestRegressor(random_state=100),
    X_train,
    Y_train,
)
R2_forest, MSE_forest = score_model(model_forest, X_test, Y_test)

R2_score: 0.4525668507687324
MSE: 53.935839859487814
