In [1]:
import pandas as pd
import re

In [2]:
# Load the raw measurements; the relative path is resolved against the
# current working directory.
consumos = pd.read_csv("measurements.csv")

# 1. Data cleaning 

First we are going to review the data

In [3]:
# Preview the first rows to see what the raw values look like.
consumos.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45.0,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,


In [4]:
# Inspect the column dtypes and non-null counts of the raw frame.
consumos.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   distance       388 non-null    object
 1   consume        388 non-null    object
 2   speed          388 non-null    int64 
 3   temp_inside    376 non-null    object
 4   temp_outside   388 non-null    int64 
 5   specials       93 non-null     object
 6   gas_type       388 non-null    object
 7   AC             388 non-null    int64 
 8   rain           388 non-null    int64 
 9   sun            388 non-null    int64 
 10  refill liters  13 non-null     object
 11  refill gas     13 non-null     object
dtypes: int64(5), object(7)
memory usage: 36.5+ KB


In [5]:
# Summary statistics; only the columns already parsed as numeric show up here.
consumos.describe()

Unnamed: 0,speed,temp_outside,AC,rain,sun
count,388.0,388.0,388.0,388.0,388.0
mean,41.927835,11.358247,0.07732,0.123711,0.082474
std,13.598524,6.991542,0.267443,0.329677,0.275441
min,14.0,-5.0,0.0,0.0,0.0
25%,32.75,7.0,0.0,0.0,0.0
50%,40.5,10.0,0.0,0.0,0.0
75%,50.0,16.0,0.0,0.0,0.0
max,90.0,31.0,1.0,1.0,1.0


Several numeric columns are typed as `object` because they use a comma as the decimal separator. Let's replace the comma with a point and convert them to float.

In [6]:
# Swap the decimal comma for a point using the vectorised string accessor.
# A literal (non-regex) replacement is all that is needed, and unlike a
# per-element re.sub it leaves missing values untouched instead of raising
# a TypeError on them.
consumos['distance'] = consumos['distance'].str.replace(',', '.', regex=False)

In [7]:
# Cast the distance column to a numeric dtype.
consumos['distance'] = pd.to_numeric(consumos['distance'])

In [8]:
# Replace the decimal comma with a point and convert to a numeric dtype.
# The literal, vectorised replacement propagates missing values instead of
# raising a TypeError the way a per-element re.sub would.
consumos['consume'] = pd.to_numeric(
    consumos['consume'].str.replace(',', '.', regex=False)
)

In [9]:
# temp_inside contains missing entries, so cast every value to text first,
# swap the decimal comma for a point, and let to_numeric turn anything that
# still cannot be parsed into NaN.
consumos['temp_inside'] = pd.to_numeric(
    consumos['temp_inside'].astype('str').str.replace(',', '.', regex=False),
    errors='coerce',
)

In [10]:
# Impute the missing inside temperatures with the column median.
consumos['temp_inside'] = consumos['temp_inside'].fillna(
    consumos['temp_inside'].median()
)

In [11]:
# Verify the dtypes and non-null counts after the numeric conversions.
consumos.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   distance       388 non-null    float64
 1   consume        388 non-null    float64
 2   speed          388 non-null    int64  
 3   temp_inside    388 non-null    float64
 4   temp_outside   388 non-null    int64  
 5   specials       93 non-null     object 
 6   gas_type       388 non-null    object 
 7   AC             388 non-null    int64  
 8   rain           388 non-null    int64  
 9   sun            388 non-null    int64  
 10  refill liters  13 non-null     object 
 11  refill gas     13 non-null     object 
dtypes: float64(3), int64(5), object(4)
memory usage: 36.5+ KB


In [16]:
# Give the two refill columns underscore names so they are valid Python
# identifiers; the frame is modified in place.
refill_column_names = {
    'refill liters': 'refill_liters',
    'refill gas': 'refill_gas',
}
consumos.rename(columns=refill_column_names, inplace=True)

In [17]:
# Check the column names after the rename.
consumos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   distance       388 non-null    float64
 1   consume        388 non-null    float64
 2   speed          388 non-null    int64  
 3   temp_inside    388 non-null    float64
 4   temp_outside   388 non-null    int64  
 5   specials       93 non-null     object 
 6   gas_type       388 non-null    object 
 7   AC             388 non-null    int64  
 8   rain           388 non-null    int64  
 9   sun            388 non-null    int64  
 10  refill_liters  13 non-null     object 
 11  refill_gas     13 non-null     object 
dtypes: float64(3), int64(5), object(4)
memory usage: 39.4+ KB


In [18]:
# refill_liters is stored as text. Normalise a decimal comma to a point
# before converting; otherwise errors='coerce' would silently turn every
# comma-formatted amount into NaN. Casting to str first keeps missing entries
# harmless ('nan' parses back to NaN) and lets the cell be re-run safely on an
# already numeric column.
consumos['refill_liters'] = pd.to_numeric(
    consumos['refill_liters'].astype(str).str.replace(',', '.', regex=False),
    errors='coerce',
)

Next, let's deal with the columns that still contain null values (`specials`, `refill_liters` and `refill_gas`).

In [19]:
# List the distinct free-text notes recorded in the specials column.
consumos['specials'].unique()

array([nan, 'AC rain', 'AC', 'rain', 'snow', 'AC snow',
       'half rain half sun', 'sun', 'AC sun', 'sun ac', 'ac', 'AC Sun',
       'ac rain'], dtype=object)

In [28]:
# Flag every trip whose `specials` note mentions snow (1) or not (0).
# A substring search is required: re.match only matches at the start of the
# string, so an entry such as 'AC snow' would otherwise be flagged as 0.
# The comparison ignores case because the notes are capitalised
# inconsistently, and missing notes (NaN) count as "no snow".
consumos['snow'] = (
    consumos['specials']
    .str.contains('snow', case=False, na=False)
    .astype('int64')
)

In [33]:
# Discard the free-text specials column and the two refill columns,
# modifying the frame in place.
consumos.drop(columns=['specials', 'refill_liters', 'refill_gas'], inplace=True)

In [34]:
# Check the remaining columns and their dtypes after the drop.
consumos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 388 entries, 0 to 387
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   distance      388 non-null    float64
 1   consume       388 non-null    float64
 2   speed         388 non-null    int64  
 3   temp_inside   388 non-null    float64
 4   temp_outside  388 non-null    int64  
 5   gas_type      388 non-null    object 
 6   AC            388 non-null    int64  
 7   rain          388 non-null    int64  
 8   sun           388 non-null    int64  
 9   snow          388 non-null    int64  
dtypes: float64(3), int64(6), object(1)
memory usage: 41.4+ KB
