![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
### 1. Import libreries

In [40]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
### 2. Load data

In [4]:
df = pd.read_csv("../data/data_clean.csv")

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
### 3.Feature engineering

In [7]:
df['gasoline'] = df.gas_type.apply(lambda x: 0 if x =='E10' else 1)

In [8]:
df_gas = df.copy()
X = df_gas[['distance','speed','temp_outside','AC','rain','sun','gasoline']]
y = df_gas['consume']
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                y, 
                                                test_size=0.15, 
                                                random_state=46)

In [17]:
gb = GradientBoostingRegressor()

parameter_space = {'learning_rate': [0.01, 0.001],
                   'n_estimators': [3000,5000, 7000],
               'max_features': [1,0.5],
               'max_depth': [2,3,5],
               'min_samples_leaf': [10,15]}

grid_search = GridSearchCV(gb,
                   param_grid=parameter_space,
                   verbose=1,
                    n_jobs= -1,
                   cv=5)

grid_search.fit(X_train, y_train)

gb_best = grid_search.best_estimator_
y_predict_test = gb_best.predict(X_test) 

score = mean_squared_error(y_test, y_predict_test)

params = (grid_search.best_params_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  2.7min finished


In [18]:
score

0.4827371631118899

In [19]:
params

{'learning_rate': 0.001,
 'max_depth': 5,
 'max_features': 0.5,
 'min_samples_leaf': 15,
 'n_estimators': 3000}

In [20]:
gb_best.fit(X,y)

GradientBoostingRegressor(learning_rate=0.001, max_depth=5, max_features=0.5,
                          min_samples_leaf=15, n_estimators=3000)

In [26]:
df.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun,E10,SP98,gasoline
0,28.0,5.0,26,21.5,12,E10,0,0,0,1,0,0
1,12.0,4.2,30,21.5,13,E10,0,0,0,1,0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0,1,0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0,1,0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0,1,0,0


In [21]:
l = LinearRegression()

In [31]:
x = df.drop(['consume','gas_type'],axis=1)

In [32]:
x

Unnamed: 0,distance,speed,temp_inside,temp_outside,AC,rain,sun,E10,SP98,gasoline
0,28.0,26,21.5,12,0,0,0,1,0,0
1,12.0,30,21.5,13,0,0,0,1,0,0
2,11.2,38,21.5,15,0,0,0,1,0,0
3,12.9,36,21.5,14,0,0,0,1,0,0
4,18.5,46,21.5,15,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
383,16.0,39,24.5,18,0,0,0,0,1,1
384,16.1,38,25.0,31,1,0,0,0,1,1
385,16.0,45,25.0,19,0,0,0,0,1,1
386,15.4,42,25.0,31,1,0,0,0,1,1


In [33]:
y = df['consume']

In [34]:
l.fit(x,y)

LinearRegression()

In [35]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [36]:
l.fit(x_train,y_train)

LinearRegression()

In [37]:
y_pred=l.predict(x_test)

In [38]:
l.coef_,l.intercept_

(array([ 0.00515061, -0.02411683, -0.15047909, -0.03631087,  0.42096231,
         0.62794039, -0.06115135,  0.02794376, -0.02794376, -0.02794376]),
 9.490279765921446)

In [42]:
print(metrics.mean_squared_error(y_test,y_pred))
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

0.7476946880297476
0.8646934069540183
