# 1. Import libraries

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
### 2. Load data

In [8]:
# decimal=',' makes pandas parse "," as the decimal mark; the field separator is
# pandas' default comma, so it is not spelled out.
data = pd.read_csv('../data/measurements.csv', decimal=',')

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
### 3. Exploring data

In [9]:
# Preview the first rows of the raw measurements (head() defaults to 5 rows).
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,


In [6]:
# Column dtypes and non-null counts, to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   distance       388 non-null    float64
 1   consume        388 non-null    float64
 2   speed          388 non-null    int64  
 3   temp_inside    376 non-null    float64
 4   temp_outside   388 non-null    int64  
 5   specials       93 non-null     object 
 6   gas_type       388 non-null    object 
 7   AC             388 non-null    int64  
 8   rain           388 non-null    int64  
 9   sun            388 non-null    int64  
 10  refill liters  13 non-null     float64
 11  refill gas     13 non-null     object 
dtypes: float64(4), int64(5), object(3)
memory usage: 36.5+ KB


In [7]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,AC,rain,sun,refill liters
count,388.0,388.0,388.0,376.0,388.0,388.0,388.0,388.0,13.0
mean,19.652835,4.912371,41.927835,21.929521,11.358247,0.07732,0.123711,0.082474,37.115385
std,22.667837,1.033172,13.598524,1.010455,6.991542,0.267443,0.329677,0.275441,8.587282
min,1.3,3.3,14.0,19.0,-5.0,0.0,0.0,0.0,10.0
25%,11.8,4.3,32.75,21.5,7.0,0.0,0.0,0.0,37.6
50%,14.6,4.7,40.5,22.0,10.0,0.0,0.0,0.0,38.0
75%,19.0,5.3,50.0,22.5,16.0,0.0,0.0,0.0,39.0
max,216.1,12.2,90.0,25.5,31.0,1.0,1.0,1.0,45.0


In [10]:
# Distinct values recorded in the free-text specials column (NaN included).
data['specials'].unique()

array([nan, 'AC rain', 'AC', 'rain', 'snow', 'AC snow',
       'half rain half sun', 'sun', 'AC sun', 'sun ac', 'ac', 'AC Sun',
       'ac rain'], dtype=object)

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
### 4. Cleaning data

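We have decided to remove the *specials* column because its information is already captured in the *AC*, *rain* and *sun* columns. We also remove *refill gas* and *refill liters*: they are filled in for only 13 of the 388 rows, and the fuel type is already recorded in *gas_type*.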

In [11]:
# Drop the free-text / sparsely filled columns. `columns=` already selects the
# axis, so `axis=1` is redundant, and rebinding instead of `inplace=True` avoids
# mutating the frame that earlier cells displayed.
data = data.drop(columns=['specials', 'refill gas', 'refill liters'])

In [12]:
# Check the frame after dropping the columns (head() defaults to 5 rows).
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun
0,28.0,5.0,26,21.5,12,E10,0,0,0
1,12.0,4.2,30,21.5,13,E10,0,0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0


We show the number of nulls per column in descending order

In [13]:
# Missing values per column, largest count first.
data.isna().sum().sort_values(ascending=False)

temp_inside     12
sun              0
rain             0
AC               0
gas_type         0
temp_outside     0
speed            0
consume          0
distance         0
dtype: int64

As we have 12 nulls in *temp_inside*, we are going to fill them with the average inside temperature.

In [14]:
# Average of the recorded inside temperatures (Series.mean skips NaN by default).
mean_temp_inside = data['temp_inside'].mean()

In [15]:
# Impute only temp_inside. A bare data.fillna(value) would write the temperature
# mean into every column that has a NaN, not just this one.
data = data.fillna({'temp_inside': mean_temp_inside})

In [17]:
# Check the frame after imputation.
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun
0,28.0,5.0,26,21.5,12,E10,0,0,0
1,12.0,4.2,30,21.5,13,E10,0,0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0


In [18]:
# Confirm that no missing values remain.
data.isna().sum().sort_values(ascending=False)

sun             0
rain            0
AC              0
gas_type        0
temp_outside    0
temp_inside     0
speed           0
consume         0
distance        0
dtype: int64

In [19]:
# One-hot encode the fuel type. dtype=int keeps the indicators as 0/1 on every
# pandas version (pandas >= 2.0 would otherwise return booleans).
gas_type_dum = pd.get_dummies(data['gas_type'], dtype=int)

In [20]:
# Inspect the indicator columns produced by the encoding.
gas_type_dum

Unnamed: 0,E10,SP98
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
383,0,1
384,0,1
385,0,1
386,0,1


In [21]:
# Attach the indicator columns to the right of the existing ones.
data = pd.concat([data, gas_type_dum], axis='columns')

In [22]:
# Check the frame with the new indicator columns (head() defaults to 5 rows).
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun,E10,SP98
0,28.0,5.0,26,21.5,12,E10,0,0,0,1,0
1,12.0,4.2,30,21.5,13,E10,0,0,0,1,0
2,11.2,5.5,38,21.5,15,E10,0,0,0,1,0
3,12.9,3.9,36,21.5,14,E10,0,0,0,1,0
4,18.5,4.5,46,21.5,15,E10,0,0,0,1,0


In [23]:
# Persist the cleaned frame without the index (the header row is written by default).
data.to_csv('../data/data_clean.csv', index=False)

In [146]:
# Linear regression estimator with scikit-learn's default settings.
l = LinearRegression()

In [147]:
# Features: every column except the target. gas_type is dropped as well because
# it holds the raw strings ('E10'/'SP98'), which LinearRegression cannot convert
# to floats; the fuel type is already present as the E10/SP98 indicator columns.
x = data.drop(columns=['consume', 'gas_type'])

In [148]:
# Target: the consume column.
y = data['consume']

In [149]:
# NOTE(review): this fits on the full data set, test rows included; the model is
# refit on the training split a few cells below, which replaces this fit.
l.fit(x,y)

LinearRegression()

In [150]:
# 70/30 train/test split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [151]:
# Fit on the training split only.
l.fit(x_train,y_train)

LinearRegression()

In [152]:
# Predictions for the held-out test rows.
y_pred=l.predict(x_test)

In [156]:
# Fitted coefficients (one per column of x, in column order) and the intercept.
l.coef_,l.intercept_

(array([ 0.00515061, -0.02411683, -0.15047909, -0.03631087,  0.42096231,
         0.62794039, -0.06115135,  0.04191564, -0.04191564]),
 9.476307887416771)

In [155]:
# Test-set error metrics, printed in order: MSE, MAE, RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
print(mse)
print(mae)
print(np.sqrt(mse))

0.747694688029747
0.6689496242764844
0.864693406954018
