In [1]:
import pandas as pd
import sys

In [2]:

# Add the parent directory (relative to the working directory) to the module
# search path. NOTE(review): presumably so project modules one level up can be
# imported; no such import is visible in this part of the notebook.
sys.path.append("..")


# Dataset Loading

In [3]:
# Read the raw measurements file into a DataFrame with pandas' default parsing.
raw_csv_path = '../data/measurements.csv'
df = pd.read_csv(raw_csv_path)

# Dataset Exploration

In [4]:
# Peek at five randomly chosen rows (no seed is set, so rows differ per run).
df.sample(n=5)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
42,117,46,37,215,6,,E10,0,0,0,,
186,386,46,44,225,10,,E10,0,0,0,,
257,124,47,55,225,14,sun,E10,0,0,1,,
219,19,44,58,225,17,sun,SP98,0,0,1,,
14,124,47,46,215,11,,E10,0,0,0,,


In [5]:
# Summarize the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   distance       388 non-null    object
 1   consume        388 non-null    object
 2   speed          388 non-null    int64 
 3   temp_inside    376 non-null    object
 4   temp_outside   388 non-null    int64 
 5   specials       93 non-null     object
 6   gas_type       388 non-null    object
 7   AC             388 non-null    int64 
 8   rain           388 non-null    int64 
 9   sun            388 non-null    int64 
 10  refill liters  13 non-null     object
 11  refill gas     13 non-null     object
dtypes: int64(5), object(7)
memory usage: 36.5+ KB


In [6]:
# List the distinct values found in the free-text "specials" column.
df['specials'].unique()

array([nan, 'AC rain', 'AC', 'rain', 'snow', 'AC snow',
       'half rain half sun', 'sun', 'AC sun', 'sun ac', 'ac', 'AC Sun',
       'ac rain'], dtype=object)

In [7]:
# Descriptive statistics; only columns that are already numeric are included.
df.describe()

Unnamed: 0,speed,temp_outside,AC,rain,sun
count,388.0,388.0,388.0,388.0,388.0
mean,41.927835,11.358247,0.07732,0.123711,0.082474
std,13.598524,6.991542,0.267443,0.329677,0.275441
min,14.0,-5.0,0.0,0.0,0.0
25%,32.75,7.0,0.0,0.0,0.0
50%,40.5,10.0,0.0,0.0,0.0
75%,50.0,16.0,0.0,0.0,0.0
max,90.0,31.0,1.0,1.0,1.0


# Dataset Cleaning

First, let's convert columns such as distance and consume, which were loaded as text because they use a decimal comma, into real numeric columns.

In [8]:
# Turn decimal commas into decimal points. With regex=True the comma is
# substituted inside string values (not only whole-cell matches), and the
# replacement is applied across the whole frame.
df = df.replace(to_replace=',', value='.', regex=True)

In [9]:
# Cast the columns that were cleaned of decimal commas to 32-bit floats.
float_columns = ['distance', 'consume', 'temp_inside']
df = df.astype(dict.fromkeys(float_columns, 'float32'))

The specials column does not add any value: it is mostly NaN, the use of AC and sunny days is already captured in other columns, and snowy days are quite rare in Spain, so we drop this column.

In [10]:
# Remove the "specials" column from the frame.
df = df.drop(columns=['specials'])

Refill liters and refill gas do not add any value either, based on the dataset description, so we also drop them.

In [11]:
# Remove both refill-related columns from the frame.
df = df.drop(columns=['refill liters', 'refill gas'])

Finally, we drop the 12 rows with NaN values in temp_inside, because they are few and that column seems to matter only when the AC is activated.

In [12]:
# Discard every row that still contains a missing value in any column ...
df = df.dropna()
# ... then renumber the rows from 0 without keeping the old index as a column.
df = df.reset_index(drop=True)

In [13]:
# Re-check the cleaned frame: remaining columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376 entries, 0 to 375
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   distance      376 non-null    float32
 1   consume       376 non-null    float32
 2   speed         376 non-null    int64  
 3   temp_inside   376 non-null    float32
 4   temp_outside  376 non-null    int64  
 5   gas_type      376 non-null    object 
 6   AC            376 non-null    int64  
 7   rain          376 non-null    int64  
 8   sun           376 non-null    int64  
dtypes: float32(3), int64(5), object(1)
memory usage: 22.2+ KB


In [14]:
# Persist the cleaned frame as CSV.
# NOTE(review): the row index is written as an unnamed first column (pandas'
# default, made explicit here). If no downstream reader relies on it, consider
# index=False so the file does not gain an extra "Unnamed: 0" column on reload.
df.to_csv('../data/clean_data.csv', index=True)