# Exploring and cleaning

In [18]:
import pandas as pd
import os


In [19]:
# Load the dataset we want to analyze and preview its first rows
data = pd.read_csv("./data/measurements.csv")
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45.0,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,


In [20]:
# What type of data do we have in each column?
# Columns reported as `object` were not parsed as numbers by read_csv.
data.dtypes

distance         object
consume          object
speed             int64
temp_inside      object
temp_outside      int64
specials         object
gas_type         object
AC                int64
rain              int64
sun               int64
refill liters    object
refill gas       object
dtype: object

In [21]:
# Transform object to number
def comma_to_float(df, column_name):
    """Convert a decimal-comma text column of ``df`` to floats, in place.

    Values such as ``"11,2"`` are parsed as ``11.2``; values without a comma
    (``"28"``) are parsed as-is. Missing values are kept as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that owns the column. It is modified in place.
    column_name : str
        Label of the column to convert.

    Returns
    -------
    None
        The result is written back to ``df[column_name]`` with float dtype.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    ValueError
        If a value cannot be parsed as a number after comma replacement.
    """
    def _parse(value):
        # Keep missing entries missing instead of failing on them.
        if pd.isna(value):
            return float("nan")
        if isinstance(value, str):
            # Treat the comma as the decimal separator, whatever the number
            # of decimal digits ("11,2" -> 11.2, "11,25" -> 11.25).
            return float(value.strip().replace(",", "."))
        # Already numeric (int/float): just normalise the type.
        return float(value)

    # Write to the frame that was passed in, not to a notebook-level global.
    df[column_name] = df[column_name].map(_parse).astype(float)

In [22]:
# Parse the decimal-comma text columns and make sure they end up as float
for text_col in ('distance', 'consume'):
    comma_to_float(data, text_col)
    data[text_col] = data[text_col].astype(float)

# speed was read as an integer column; cast it so all three measures are float
data['speed'] = data['speed'].astype(float)
# NOTE(review): temp_inside is still object dtype after this cell - confirm
# whether it should be converted as well.


In [23]:
# Display the frame after the conversion (pandas truncates the middle rows)
data

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28.0,5.0,26.0,215,12,,E10,0,0,0,45,E10
1,12.0,4.2,30.0,215,13,,E10,0,0,0,,
2,11.2,5.5,38.0,215,15,,E10,0,0,0,,
3,12.9,3.9,36.0,215,14,,E10,0,0,0,,
4,18.5,4.5,46.0,215,15,,E10,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39.0,245,18,,SP98,0,0,0,,
384,16.1,4.3,38.0,25,31,AC,SP98,1,0,0,,
385,16.0,3.8,45.0,25,19,,SP98,0,0,0,,
386,15.4,4.6,42.0,25,31,AC,SP98,1,0,0,,


In [24]:
# Re-check the dtypes after the conversion:
# distance, consume and speed should now be float64.
data.dtypes

distance         float64
consume          float64
speed            float64
temp_inside       object
temp_outside       int64
specials          object
gas_type          object
AC                 int64
rain               int64
sun                int64
refill liters     object
refill gas        object
dtype: object

In [25]:
# Null ratio per column, highest first.
# Three columns (refill gas, refill liters, specials) are more than 70% null;
# temp_inside has about 3% nulls and every other column has none.
perc_null_col = (
    data.isnull()
    .sum()
    .div(data.shape[0])
    .sort_values(ascending=False)
)
perc_null_col

refill gas       0.966495
refill liters    0.966495
specials         0.760309
temp_inside      0.030928
sun              0.000000
rain             0.000000
AC               0.000000
gas_type         0.000000
temp_outside     0.000000
speed            0.000000
consume          0.000000
distance         0.000000
dtype: float64

In [26]:
# Look for unique values per column (NaN is counted too via dropna=False)
data["specials"].value_counts(dropna=False)

NaN                   295
rain                   32
sun                    27
AC rain                 9
ac                      8
AC                      6
snow                    3
sun ac                  3
half rain half sun      1
ac rain                 1
AC snow                 1
AC Sun                  1
AC sun                  1
Name: specials, dtype: int64

In [27]:
# Count rows per gas type, NaN included
data["gas_type"].value_counts(dropna=False)

SP98    228
E10     160
Name: gas_type, dtype: int64

In [28]:
# Count rows per value of the AC column, NaN included
data["AC"].value_counts(dropna=False)

0    358
1     30
Name: AC, dtype: int64

In [29]:
# Count rows per refill gas value, NaN included
data["refill gas"].value_counts(dropna=False)

NaN     375
SP98      8
E10       5
Name: refill gas, dtype: int64

In [30]:
# Value counts of "specials" again (same call as the earlier specials cell)
data["specials"].value_counts(dropna=False)

NaN                   295
rain                   32
sun                    27
AC rain                 9
ac                      8
AC                      6
snow                    3
sun ac                  3
half rain half sun      1
ac rain                 1
AC snow                 1
AC Sun                  1
AC sun                  1
Name: specials, dtype: int64

In [31]:
# Drop the columns that are not useful for the analysis
drop_cols = ["refill gas", "refill liters", "specials"]
data = data.drop(columns=drop_cols)
data

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun
0,28.0,5.0,26.0,215,12,E10,0,0,0
1,12.0,4.2,30.0,215,13,E10,0,0,0
2,11.2,5.5,38.0,215,15,E10,0,0,0
3,12.9,3.9,36.0,215,14,E10,0,0,0
4,18.5,4.5,46.0,215,15,E10,0,0,0
...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39.0,245,18,SP98,0,0,0
384,16.1,4.3,38.0,25,31,SP98,1,0,0
385,16.0,3.8,45.0,25,19,SP98,0,0,0
386,15.4,4.6,42.0,25,31,SP98,1,0,0


In [57]:
# Exporting our cleaned dataset
# Create the target folder first so the export also works when "output"
# does not exist yet (e.g. on a fresh checkout).
os.makedirs("output", exist_ok=True)
data.to_csv("output/data.csv", index = False)