In [1]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

In [2]:
#import the clean ds
%store -r ds_clean

In [3]:
ds_clean_1 = ds_clean

In [4]:
ds_clean_1.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun
0,28.0,5.0,26,21.5,12,E10,0,0,0
1,12.0,4.2,30,21.5,13,E10,0,0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0


In [9]:
ds_clean_1.to_pickle("./dummy.pkl")

In order to use gas type it must be transformed into numerical

In [10]:
ds_clean_1["gas_type"].replace({"E10": 0, "SP98": 1}, inplace=True)

In [11]:
ds_clean_1['temp_inside'] = ds_clean['temp_inside'].astype(float)
ds_clean_1['temp_inside'].head(3)

0    21.5
1    21.5
2    21.5
Name: temp_inside, dtype: float64

In [12]:
ds_clean_1.dtypes

distance        float64
consume         float64
speed             int64
temp_inside     float64
temp_outside      int64
gas_type          int64
AC                int64
rain              int64
sun               int64
dtype: object

# Lets predict!

## Divide the dataset into train/test

As we know, we want to predict the column 'consume', hence why we do the following

In [13]:
X = ds_clean_1.drop(columns = 'consume')
y = ds_clean_1['consume']

Once its separated, we train the model using several algorithms and cheching wich one is more accurate

In [14]:
#I choose using the 80% of the data to train the model and 20% to test it
X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.8)

Before we start with the models, many machine learning algorithms perform better when numerical input variables are scaled to a standard range. The two most popular techniques for scaling numerical data prior to modeling are normalization and standardization. 

In [15]:
##-------- Normalization ----------
norm = MinMaxScaler()

X_train = norm.fit_transform(X_train)
X_test = norm.transform(X_test)

In [16]:
##-------- Standarization ----------
stan = StandardScaler()
stan.fit(X_train)

X_train = stan.transform(X_train)
X_test = stan.transform(X_test)

Due to the fact that we are trying to predict a numerical value, the algorithms we will be using are regression ones

## Model 1: Linear Regression

In [17]:
#Train the model
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [18]:
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)

In [19]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_lr ))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_lr ))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_lr ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_lr )))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_lr ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_lr ))
print('TEST : R2 Score:', r2_score(y_test, y_pred_test_lr))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_lr )))

TRAIN : Mean Absolute Error: 0.635119937340884
TRAIN : Mean Squared Error: 0.8875523406451414
TRAIN : R2 Score: 0.2132941045974729
TRAIN : Root Mean Squared Error: 0.9420999631913491
----------------------------------------------------------
TEST : Mean Absolute Error: 0.5895310327901375
TEST : Mean Squared Error: 0.6876987748696028
TEST : R2 Score: 0.14951359870501546
TEST : Root Mean Squared Error: 0.8292760546823976


In [15]:
#Cross validation
scores_lr = cross_val_score(lr, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_lr

array([-0.96749131, -1.13898933, -0.90572254])

## Model 2: Random Forest

In [16]:
randforest = RandomForestRegressor(n_estimators=20, random_state=10)
randforest.fit(X_train, y_train)

RandomForestRegressor(n_estimators=20, random_state=10)

In [17]:
y_pred_train_randforest = randforest.predict(X_train)
y_pred_test_randforest = randforest.predict(X_test)

In [18]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_randforest))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_randforest))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_randforest ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_randforest)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_randforest ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_randforest ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_randforest ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_randforest )))

TRAIN : Mean Absolute Error: 0.1698602150537635
TRAIN : Mean Squared Error: 0.0660295788530466
TRAIN : R2 Score: 0.943783654119695
TRAIN : Root Mean Squared Error: 0.256962212889457
----------------------------------------------------------
TEST : Mean Absolute Error: 0.4524893162393164
TEST : Mean Squared Error: 0.44582490206552716
TRAIN : R2 Score: 0.25395830697450117
TEST : Root Mean Squared Error: 0.6677012071769282


In [19]:
scores_randforest = cross_val_score(randforest, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_randforest

array([-0.69507876, -0.7534212 , -0.66103093])

## Model 3 : Lasso

In [20]:
lasso = Lasso()
lasso.fit(X_train, y_train)

Lasso()

In [21]:
y_pred_train_lasso = lasso.predict(X_train)
y_pred_test_lasso = lasso.predict(X_test)

In [22]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_lasso ))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_lasso ))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_lasso ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_lasso )))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_lasso ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_lasso ))
print('TEST : R2 Score:', r2_score(y_test, y_pred_test_lasso))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_lasso )))

TRAIN : Mean Absolute Error: 0.6947970863683662
TRAIN : Mean Squared Error: 1.1745619146722164
TRAIN : R2 Score: 0.0
TRAIN : Root Mean Squared Error: 1.0837720769018808
----------------------------------------------------------
TEST : Mean Absolute Error: 0.640198511166253
TEST : Mean Squared Error: 0.6356911870647561
TEST : R2 Score: -0.0637632111664197
TEST : Root Mean Squared Error: 0.7973024439099357


In [23]:
scores_lasso = cross_val_score(lasso, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_lasso

array([-1.02657808, -1.29138892, -0.92456082])

## Model 4: Elastic Net

In [24]:
elnet = ElasticNet(random_state=0)
elnet.fit(X_train, y_train)

ElasticNet(random_state=0)

In [25]:
y_pred_train_elnet = elnet.predict(X_train)
y_pred_test_elnet = elnet.predict(X_test)

In [26]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_elnet))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_elnet))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_elnet ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_elnet)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_elnet))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_elnet))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_elnet ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_elnet)))

TRAIN : Mean Absolute Error: 0.6947970863683662
TRAIN : Mean Squared Error: 1.1745619146722164
TRAIN : R2 Score: 0.0
TRAIN : Root Mean Squared Error: 1.0837720769018808
----------------------------------------------------------
TEST : Mean Absolute Error: 0.640198511166253
TEST : Mean Squared Error: 0.6356911870647561
TRAIN : R2 Score: -0.0637632111664197
TEST : Root Mean Squared Error: 0.7973024439099357


In [27]:
scores_elnet = cross_val_score(elnet, X_train, y_train, cv=3, scoring='neg_root_mean_squared_error')
scores_elnet

array([-1.02657808, -1.29138892, -0.92456082])

## Model 5 : Decission tree

In [33]:
from sklearn import preprocessing
lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(y_train)

In [34]:
tree = DecisionTreeClassifier(max_depth = 3)
tree.fit(X_train, training_scores_encoded)

DecisionTreeClassifier(max_depth=3)

In [35]:
y_pred_train_tree = tree.predict(X_train)
y_pred_test_tree = tree.predict(X_test)

In [36]:
#TRAIN:
print('TRAIN : Mean Absolute Error:', metrics.mean_absolute_error(y_train, y_pred_train_tree))
print('TRAIN : Mean Squared Error:', metrics.mean_squared_error(y_train, y_pred_train_tree))
print('TRAIN : R2 Score:', r2_score(y_train, y_pred_train_tree ))
print('TRAIN : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_tree)))
print ('----------------------------------------------------------')
#TEST:
print('TEST : Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test_tree ))
print('TEST : Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test_tree ))
print('TRAIN : R2 Score:', r2_score(y_test, y_pred_test_tree ))
print('TEST : Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test_tree )))

TRAIN : Mean Absolute Error: 6.598064516129033
TRAIN : Mean Squared Error: 62.015612903225794
TRAIN : R2 Score: -51.79893050212888
TRAIN : Root Mean Squared Error: 7.874999231950807
----------------------------------------------------------
TEST : Mean Absolute Error: 5.9230769230769225
TEST : Mean Squared Error: 48.02871794871795
TRAIN : R2 Score: -79.37107368004135
TEST : Root Mean Squared Error: 6.930275459800854


In [38]:
scores_tree = cross_val_score(tree, X_train, training_scores_encoded, cv=3, scoring='neg_root_mean_squared_error')
scores_tree



array([-7.44466768, -7.59087342, -8.29083012])

The results of these models are not very good, the R2 should be close to 1 and the mse to 0, and as we can see with the previouse examples it is not

In [39]:
%store ds_clean_1

Stored 'ds_clean_1' (DataFrame)
