In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn import ensemble
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
data=pd.read_csv('../../_data/8_features_c2db.csv')

In [7]:
df=data.values
df=np.array(df)

for i in range(8):
  df[:,i]=(df[:,i]-df[:,i].min())/(df[:,i].max()- df[:,i].min())
  
data_train, data_test = train_test_split(df, test_size=0.1, random_state=0)
x_train=data_train[:,:8]
x_test=data_test[:,:8]
y_train=data_train[:,8]
y_test=data_test[:,8]

In [8]:
rf = RandomForestRegressor(criterion = 'squared_error', bootstrap=False, max_features=0.8, max_depth=20,
                         min_samples_split=5, n_estimators=15000, min_samples_leaf=3, random_state=0) 
rf.fit(x_train, y_train)
rf_train_pred=rf.predict(x_train)
rf_test_pred=rf.predict(x_test)

In [9]:
print("Model evaluation - Test Set:")
print('r^2:',r2_score(y_test, rf_test_pred))
print('RSE', mean_squared_error(y_test, rf_test_pred)) 
print('MAE', mean_absolute_error(y_test, rf_test_pred)) 
print('RMSE:',np.sqrt(mean_squared_error(y_test,rf_test_pred)))

Model evaluation - Test Set:
r^2: 0.9001968969881061
RSE 0.06611689644625561
MAE 0.10156060422036761
RMSE: 0.2571320603235925


In [10]:
params = {'n_estimators':21000, 'max_depth': 21, 'min_samples_split': 5,
          'max_features':0.8, 'learning_rate': 0.001, 'loss': 'squared_error',
          'random_state':0, 'subsample': 0.85}

gbr = ensemble.GradientBoostingRegressor(**params) 
gbr.fit(x_train, y_train)
gbr_train_pred = gbr.predict(x_train)
gbr_test_pred = gbr.predict(x_test)

In [11]:
print("Model evaluation - Test Set:")
print('r^2:',r2_score(y_test, gbr_test_pred))
print('RSE', mean_squared_error(y_test, gbr_test_pred)) 
print('MAE', mean_absolute_error(y_test, gbr_test_pred)) 
print('RMSE:',np.sqrt(mean_squared_error(y_test,gbr_test_pred)))

Model evaluation - Test Set:
r^2: 0.9113951921677259
RSE 0.058698324273431586
MAE 0.0959922214107397
RMSE: 0.24227737053516077


In [12]:
kernel = 'rbf'
C = [50]
gamma = [50] 
epsilon = [0.2]

params_dict = {
    'C': C,
    'gamma': gamma, 
    'epsilon': epsilon
}

svr = SVR()

gsCV = GridSearchCV(
    estimator=svr, 
    param_grid=params_dict, 
    n_jobs=2,
    scoring='r2',
    cv=6 
)

gsCV.fit(x_train, y_train)
svr = SVR(C=gsCV.best_params_['C'], kernel=kernel, 
          gamma=gsCV.best_params_['gamma'],
          epsilon=gsCV.best_params_['epsilon'])
svr.fit(x_train, y_train)
svr_train_pred = svr.predict(x_train)
svr_test_pred = svr.predict(x_test)

In [13]:
print("Model evaluation - Test Set:")
print('r^2:',r2_score(y_test, svr_test_pred))
print('RSE', mean_squared_error(y_test, svr_test_pred)) 
print('RAE', mean_absolute_error(y_test, svr_test_pred)) 
print('RMSE:',np.sqrt(mean_squared_error(y_test,svr_test_pred)))

Model evaluation - Test Set:
r^2: 0.7379921025275658
RSE 0.17357325075575677
RAE 0.24362127145230222
RMSE: 0.4166212317630449
