In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from cleaning import * 


In [2]:
df_gas = pd.read_csv("./data/measurements.csv", sep=",", decimal=",")


In [3]:
df_gas

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39,24.5,18,,SP98,0,0,0,,
384,16.1,4.3,38,25.0,31,AC,SP98,1,0,0,,
385,16.0,3.8,45,25.0,19,,SP98,0,0,0,,
386,15.4,4.6,42,25.0,31,AC,SP98,1,0,0,,


In [4]:
df_gas.columns = df_gas.columns.str.strip().str.lower()
df_gas.head(5)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,ac,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,


In [5]:
df_gas.isnull().sum()


distance           0
consume            0
speed              0
temp_inside       12
temp_outside       0
specials         295
gas_type           0
ac                 0
rain               0
sun                0
refill liters    375
refill gas       375
dtype: int64

In [6]:
df_gas.shape

(388, 12)

In [7]:
 df_gas = df_gas.fillna(0)

As we can see, the columns that follow 'specials' contain all the information that 'specials' contains, so that, in order to avoid duplicate information, we proceed to eliminate this 'specials' column.

In [8]:
df_gas.drop('specials',
  axis='columns', inplace=True)

In [9]:
df_gas.shape

(388, 11)

Now with fill the NaN values of temp_inside with the mean of the column data

In [10]:
df_gas.dtypes

distance         float64
consume          float64
speed              int64
temp_inside      float64
temp_outside       int64
gas_type          object
ac                 int64
rain               int64
sun                int64
refill liters    float64
refill gas        object
dtype: object

In [11]:
df_gas['temp_inside'].fillna(value = df_gas['temp_inside'].mean().round(3), inplace=True)

In [12]:
df_gas

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,E10,0,0,0,0.0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0,0.0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0,0.0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39,24.5,18,SP98,0,0,0,0.0,0
384,16.1,4.3,38,25.0,31,SP98,1,0,0,0.0,0
385,16.0,3.8,45,25.0,19,SP98,0,0,0,0.0,0
386,15.4,4.6,42,25.0,31,SP98,1,0,0,0.0,0


Now let's see what's up with the NaN values of the two last columns. The first step is to rename them so they don't have any blank spaces.

In [13]:
df_gas = df_gas.rename(columns = {"refill liters":"refill_liters", "refill gas":"refill_gas"})

In [14]:
df_gas

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,rain,sun,refill_liters,refill_gas
0,28.0,5.0,26,21.5,12,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,E10,0,0,0,0.0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0,0.0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0,0.0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39,24.5,18,SP98,0,0,0,0.0,0
384,16.1,4.3,38,25.0,31,SP98,1,0,0,0.0,0
385,16.0,3.8,45,25.0,19,SP98,0,0,0,0.0,0
386,15.4,4.6,42,25.0,31,SP98,1,0,0,0.0,0


In [15]:
del df_gas['refill_liters']
del df_gas['refill_gas']

In [16]:
df_gas

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,rain,sun
0,28.0,5.0,26,21.5,12,E10,0,0,0
1,12.0,4.2,30,21.5,13,E10,0,0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0
...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39,24.5,18,SP98,0,0,0
384,16.1,4.3,38,25.0,31,SP98,1,0,0
385,16.0,3.8,45,25.0,19,SP98,0,0,0
386,15.4,4.6,42,25.0,31,SP98,1,0,0


We add two new columns formed by the cummulative number of km of distance and the cummulative number of liters consumed

In [17]:
df_gas['km_total'] = df_gas['distance'].cumsum()
df_gas['consume_liter_total'] = df_gas['consume']/100*df_gas['distance']

In [18]:
df_gas.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,rain,sun,km_total,consume_liter_total
0,28.0,5.0,26,21.5,12,E10,0,0,0,28.0,1.4
1,12.0,4.2,30,21.5,13,E10,0,0,0,40.0,0.504
2,11.2,5.5,38,21.5,15,E10,0,0,0,51.2,0.616
3,12.9,3.9,36,21.5,14,E10,0,0,0,64.1,0.5031
4,18.5,4.5,46,21.5,15,E10,0,0,0,82.6,0.8325


In [19]:
print('DISTANCE:')
gas_type_sum(df_gas, 'E10','distance')
gas_type_std(df_gas, 'E10','distance')
gas_type_mean(df_gas, 'E10','distance')
gas_type_sum(df_gas, 'SP98','distance')
gas_type_std(df_gas, 'SP98','distance')
gas_type_mean(df_gas, 'SP98','distance')
print('\n')
print(100*'*')
print('\n')
print('SPEED:')
gas_type_std(df_gas, 'E10','speed')
gas_type_mean(df_gas, 'E10','speed')
gas_type_std(df_gas, 'SP98','speed')
gas_type_mean(df_gas, 'SP98','speed')
print('\n')
print(100*'*')
print('\n')
print('CONSUME:')
gas_type_mean(df_gas, 'E10','consume')
gas_type_mean(df_gas, 'SP98','consume')

DISTANCE:
Total distance with E10: 3375.4
Std of distance with E10: 20.307234165044786
Mean of distance with E10: 21.09625
Total distance with SP98: 4249.9
Std of distance with SP98: 24.17959845593116
Mean of distance with SP98: 18.639912280701754


****************************************************************************************************


SPEED:
Std of speed with E10: 14.077948711978848
Mean of speed with E10: 43.50625
Std of speed with SP98: 13.170121570119575
Mean of speed with SP98: 40.82017543859649


****************************************************************************************************


CONSUME:
Mean of consume with E10: 4.93125
Mean of consume with SP98: 4.899122807017544


4.899122807017544

Now, in order to change all categorical variables to numerical, we'll change the gas_type column to boolean:

In [20]:
df_gas['gas_type'] = df_gas['gas_type'].map({'SP98': 1, 'E10': 0}) 
#We've changed string to integer

In [22]:
sp98_price = 1.46
E10_price = 1.38
df_gas['price_per_travel'] = df_gas.consume * df_gas.gas_type.apply(lambda x: E10_price if x == 0 else sp98_price)

In [23]:
df_gas.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,rain,sun,km_total,consume_liter_total,price_per_travel
0,28.0,5.0,26,21.5,12,0,0,0,0,28.0,1.4,6.9
1,12.0,4.2,30,21.5,13,0,0,0,0,40.0,0.504,5.796
2,11.2,5.5,38,21.5,15,0,0,0,0,51.2,0.616,7.59
3,12.9,3.9,36,21.5,14,0,0,0,0,64.1,0.5031,5.382
4,18.5,4.5,46,21.5,15,0,0,0,0,82.6,0.8325,6.21


In [24]:
sorted_columns = ['distance', 'consume', 'speed','temp_inside', 'temp_outside', 'gas_type', 'ac', 'rain', 'sun', 'km_total','consume_liter_total', 'price_per_travel']
df_fas = df_gas.reindex(sorted_columns, axis=1)

In [25]:
df_gas.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,rain,sun,km_total,consume_liter_total,price_per_travel
0,28.0,5.0,26,21.5,12,0,0,0,0,28.0,1.4,6.9
1,12.0,4.2,30,21.5,13,0,0,0,0,40.0,0.504,5.796
2,11.2,5.5,38,21.5,15,0,0,0,0,51.2,0.616,7.59
3,12.9,3.9,36,21.5,14,0,0,0,0,64.1,0.5031,5.382
4,18.5,4.5,46,21.5,15,0,0,0,0,82.6,0.8325,6.21


In [26]:
df_gas.to_csv('./data/clean_measure.csv', index=False)