In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [11]:
# Load the cleaned measurements; the first CSV column holds the row index.
df = pd.read_csv(
    filepath_or_buffer="input/clean_measures.csv",
    index_col=0,
)
df.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,external_temp
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,COLD
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,COLD
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,COLD
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,COLD
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,COLD


The information in `specials` is already encoded in the `AC`, `rain` and `sun` columns, and `refill liters` has only 13 non-null values out of 388, so both columns are dropped.

In [13]:
# Drop the redundant / mostly-empty columns.
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
df = df.drop(columns=['specials', 'refill liters'], errors="ignore")

In [15]:
# Re-display the first rows to check the frame after the column drop above.
df.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun,external_temp
0,28.0,5.0,26,21.5,12,E10,0,0,0,COLD
1,12.0,4.2,30,21.5,13,E10,0,0,0,COLD
2,11.2,5.5,38,21.5,15,E10,0,0,0,COLD
3,12.9,3.9,36,21.5,14,E10,0,0,0,COLD
4,18.5,4.5,46,21.5,15,E10,0,0,0,COLD


In [27]:
# Split the features by how they will be encoded:
#   - `standard`: numeric columns that get standardized
#   - `dummies`:  categorical columns that get one-hot encoded
standard = df.loc[:, [
    'distance',
    'speed',
    'temp_inside',
    'temp_outside',
]]
dummies = df.loc[:, [
    'gas_type',
    'external_temp',
]]

# Standardization

Machine learning algorithms don't perform well when the input numerical attributes have very different scales.

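Standardization rescales each column by subtracting its mean and dividing by its standard deviation, so every feature ends up with zero mean and unit variance. (This differs from min-max scaling, which maps each column onto a fixed range.)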

In [34]:
# Standardize the numeric features (zero mean, unit variance per column).
scaler = StandardScaler()

# fit_transform returns a bare ndarray, so rebuild the DataFrame with the
# original column names AND the original index. Keeping the index matters
# because the final pd.concat(..., axis=1) aligns rows on index labels; a fresh
# RangeIndex would only line up with `df` if df's index is exactly 0..n-1.
# Missing values are passed through unchanged by StandardScaler, so any NaN in
# `standard` is still NaN in `scaled`.
scaled = pd.DataFrame(
    scaler.fit_transform(standard),
    columns=standard.columns,
    index=standard.index,
)
scaled.head()

Unnamed: 0,distance,speed,temp_inside,temp_outside
0,0.368714,-1.172804,-0.425643,0.091908
1,-0.338044,-0.878274,-0.425643,0.235123
2,-0.373381,-0.289216,-0.425643,0.521552
3,-0.298288,-0.43648,-0.425643,0.378338
4,-0.050923,0.299843,-0.425643,0.521552


In [35]:
# Inspect dtypes and non-null counts of the scaled features.
scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   distance      388 non-null    float64
 1   speed         388 non-null    float64
 2   temp_inside   376 non-null    float64
 3   temp_outside  388 non-null    float64
dtypes: float64(4)
memory usage: 12.2 KB


# Get Dummies + Standard

In [37]:
# One-hot encode the categorical features.
# dtype=int pins the indicator columns to 0/1 integers; without it the result
# dtype depends on the installed pandas version (booleans from pandas 2.0 on),
# which changes what this cell displays.
dum = pd.get_dummies(dummies, dtype=int)
dum.head()

Unnamed: 0,gas_type_E10,gas_type_SP98,external_temp_COLD,external_temp_HOT
0,1,0,1,0
1,1,0,1,0
2,1,0,1,0
3,1,0,1,0
4,1,0,1,0


In [45]:
# Standardize the one-hot columns as well.
# A dedicated scaler is used so that `scaler` keeps the statistics it learned
# on the numeric features; calling fit_transform on it again would overwrite
# them and make it unusable for transforming or inverse-transforming those
# numeric columns later.
dummy_scaler = StandardScaler()

# Preserve dum's column names and index so the later column-wise concat aligns
# rows by the same labels as the source frame.
scaled_dum = pd.DataFrame(
    dummy_scaler.fit_transform(dum),
    columns=dum.columns,
    index=dum.index,
)
scaled_dum.head()

Unnamed: 0,gas_type_E10,gas_type_SP98,external_temp_COLD,external_temp_HOT
0,1.193734,-1.193734,0.362209,-0.362209
1,1.193734,-1.193734,0.362209,-0.362209
2,1.193734,-1.193734,0.362209,-0.362209
3,1.193734,-1.193734,0.362209,-0.362209
4,1.193734,-1.193734,0.362209,-0.362209


# Final dataset

In [46]:
# Assemble the final table column-wise: scaled numeric features, scaled
# one-hot columns, and the untouched target `consume`. Rows are matched on
# index labels.
# NOTE(review): the AC, rain and sun columns of `df` are not part of any of
# these pieces, so they do not appear in `result` — confirm this is intended.
result = pd.concat(
    objs=[scaled, scaled_dum, df['consume']],
    axis='columns',
)
result.head()

Unnamed: 0,distance,speed,temp_inside,temp_outside,gas_type_E10,gas_type_SP98,external_temp_COLD,external_temp_HOT,consume
0,0.368714,-1.172804,-0.425643,0.091908,1.193734,-1.193734,0.362209,-0.362209,5.0
1,-0.338044,-0.878274,-0.425643,0.235123,1.193734,-1.193734,0.362209,-0.362209,4.2
2,-0.373381,-0.289216,-0.425643,0.521552,1.193734,-1.193734,0.362209,-0.362209,5.5
3,-0.298288,-0.43648,-0.425643,0.378338,1.193734,-1.193734,0.362209,-0.362209,3.9
4,-0.050923,0.299843,-0.425643,0.521552,1.193734,-1.193734,0.362209,-0.362209,4.5


In [48]:
# Persist the prepared dataset; the index is written as the first column
# (to_csv default), matching how the input file is read with index_col=0.
result.to_csv(path_or_buf="input/normal_measures.csv")