In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import quantile_transform
from modeling import *
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error 
import pickle






In [17]:
df_E10 = pd.read_csv('./output/E10.csv')
df_SP98 = pd.read_csv('./output/SP98.csv')


## E10:

In [18]:
X_train_E10, X_test_E10, y_train_E10, y_test_E10 = tr_test_split(df_E10)

In [19]:
model = GradientBoostingRegressor()

param_grid = {
    'max_depth': [20, 30, 40, 50, 60],
    'min_samples_split': [4,5,6,7,8,9],
    'n_estimators': [600, 700, 750, 760, 800, 1000],
    'max_features': ['sqrt']
}


grid_search_E10 = GridSearchCV(estimator = model, param_grid = param_grid, cv = 8, n_jobs = -1, verbose = 2)

model_grid = grid_search_E10.fit(X_train_E10, y_train_E10)

Fitting 8 folds for each of 180 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  1.2min finished


In [26]:
best_model_E10 = model_grid.best_params_
best_model_E10

{'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_split': 9,
 'n_estimators': 760}

In [21]:
y_pred_E10 = model_grid.predict(X_test_E10)
print("MSE", round(mean_squared_error(y_test_E10,y_pred_E10)),2)


MSE 0.0 2


In [22]:
filename = './models/E10_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(best_model_E10, file)

## SP98:

In [23]:
X_train_SP98, X_test_SP98, y_train_SP98, y_test_SP98 = tr_test_split(df_SP98)

In [24]:
model = GradientBoostingRegressor()

param_grid = {
    'max_depth': [20, 30, 40, 50, 60],
    'min_samples_split': [4,5,6,7,8,9],
    'n_estimators': [600, 700, 750, 760, 800, 1000],
    'max_features': ['sqrt']
}


grid_search_SP98 = GridSearchCV(estimator = model, param_grid = param_grid, cv = 8, n_jobs = -1, verbose = 2)

model_grid = grid_search_SP98.fit(X_train_SP98, y_train_SP98)

Fitting 8 folds for each of 180 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   57.3s
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  1.4min finished


In [25]:
best_model_SP98 = model_grid.best_params_
best_model_SP98

{'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_split': 9,
 'n_estimators': 760}

In [10]:
y_pred_SP98 = model_grid.predict(X_test_SP98)
print("MSE", round(mean_squared_error(y_test_SP98,y_pred_SP98)),2)



MSE 0.0 2


In [15]:
filename = './models/SP98_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(best_model_SP98, file)

#### We save the models with the best params in order to use them for the prediction

In [28]:
#filename = './models/E10_model.pkl'
#with open(filename, 'rb') as file:  
 #   E10_model = pickle.load(file)

### E10 prediction:

In [36]:
model_predict = GradientBoostingRegressor(max_depth=20,
 max_features='sqrt',
 min_samples_split=9,
 n_estimators=760) 
model_predict = model_predict.fit(X_train_E10, y_train_E10)
y_pred = model_predict.predict(X_test_E10)

y_test_E10['consume'] = y_pred
#sub = predict[['id','price']]
#sub.to_csv('subm_.csv', index=False, header=True)

In [37]:
y_test_E10['consume']

array([5.30057923, 4.92650416, 4.50699638, 4.3270897 , 5.14236764,
       5.47659444, 4.2269705 , 4.55306345, 4.92291412, 4.25141756,
       5.1121963 , 5.19937972, 5.42054901, 4.86942359, 5.84915296,
       5.08719628, 4.42451097, 4.04382378, 4.87221601, 5.00261942,
       4.65742433, 4.92494097, 5.59166205, 4.80207511, 4.62014879,
       3.79953873, 5.60437236, 4.72519006, 4.09676133, 4.77162672,
       3.97247058, 4.5461982 ])

In [48]:
y_test_E10['consume'].mean()

4.800874202715468

In [44]:
pr_E10 = y_test_E10[['consume']]
pr_E10.to_csv('./output/consume_prediction_E10.csv', index=False, header=True)

### SP98 prediction:

In [40]:
model_predict = GradientBoostingRegressor(max_depth=20,
 max_features='sqrt',
 min_samples_split=9,
 n_estimators=760) 
model_predict = model_predict.fit(X_train_SP98, y_train_SP98)
y_pred = model_predict.predict(X_test_SP98)

y_test_SP98['consume'] = y_pred
#sub = predict[['id','price']]
#sub.to_csv('subm_.csv', index=False, header=True)

In [41]:
y_test_SP98['consume']

array([4.73033728, 5.03738541, 4.65037619, 5.47124293, 5.29255253,
       5.25280918, 4.55849099, 4.08929633, 4.41631015, 5.28279613,
       4.21469447, 5.11937726, 5.55811743, 4.47273541, 4.77781213,
       4.2354028 , 4.37310262, 4.92908122, 4.00667748, 3.76641921,
       4.90971144, 4.1949122 , 4.29412465, 4.31708964, 5.52157228,
       4.91146141, 4.3667349 , 4.606166  , 4.3813723 , 5.26972757,
       6.10464969, 5.04680708, 4.35291083, 6.65051021, 8.99298606,
       4.58135871, 3.85025751, 5.68602804, 5.21711466, 4.54538085,
       4.81236989, 4.35452116, 4.76988342, 4.99805729, 4.11501941,
       4.25233096])

In [49]:
y_test_SP98['consume'].mean()

4.855175550633847

In [46]:
pr_SP98 = y_test_SP98[['consume']]
pr_SP98.to_csv('./output/consume_prediction_E10.csv', index=False, header=True)

As we can see, the mean of the consume predicted is a little bit smaller in the E10 gas type. If we refresh our memory, we can remember that this gas type was also 0,34 cents per liter cheaper