In [2]:
# import libraries
import pandas as pd

In [3]:
# Load the raw trip measurements.
# decimal=',' makes pandas read a comma as the decimal mark in numeric fields;
# the field delimiter is left at its default.
df = pd.read_csv("dataset/measurements.csv", decimal=",")
# Preview the first rows
df.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,


In [4]:
# Column names, dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   distance       388 non-null    float64
 1   consume        388 non-null    float64
 2   speed          388 non-null    int64  
 3   temp_inside    376 non-null    float64
 4   temp_outside   388 non-null    int64  
 5   specials       93 non-null     object 
 6   gas_type       388 non-null    object 
 7   AC             388 non-null    int64  
 8   rain           388 non-null    int64  
 9   sun            388 non-null    int64  
 10  refill liters  13 non-null     float64
 11  refill gas     13 non-null     object 
dtypes: float64(4), int64(5), object(3)
memory usage: 36.5+ KB


**The dataset has 12 columns and 388 rows. Each row describes one trip: distance, consumption, average speed, temperature inside the car, temperature outside, special notes (weather and AC use), the gas type used, whether the air conditioner was on, whether it was raining or sunny, and any gas refill (liters and gas type).**

**The "refill liters", "refill gas" and "specials" columns are mostly NaN.**


## Distance

In [5]:
# Unique distance values, ascending
distance_sorted = sorted(df["distance"].unique())

In [6]:
# Coerce the distance column to a numeric dtype
df["distance"] = pd.to_numeric(df["distance"])
# Count the missing values left after the conversion
distance_nan_count = df["distance"].isna().sum()
print('nans = ', distance_nan_count)

nans =  0


## Consume

In [7]:
# Unique consume values, ascending
consume_sorted = sorted(df["consume"].unique())

## Average speed in km/h

In [8]:
# Unique speed values, ascending
speed_sorted = sorted(df["speed"].unique())

## Temperature inside the car

In [9]:
# Unique inside-temperature readings in ascending order.
# NaN is dropped first: every comparison against NaN is False, so sorted()
# cannot reliably order a list that contains it. Missing values are counted
# separately on the next line.
temp_inside_sorted = sorted(df["temp_inside"].dropna().unique())
print('Have found', df["temp_inside"].isna().sum(), 'empty values in this serie')

Have found 12 empty values in this serie


In [10]:
# Frequency of each inside-temperature reading, most common first
print(df["temp_inside"].value_counts())

21.5    133
22.0    102
22.5     59
20.0     25
21.0     13
23.0     13
25.0     12
24.5      7
20.5      4
24.0      3
25.5      2
23.5      2
19.0      1
Name: temp_inside, dtype: int64


In [11]:
# Share of rows with no temp_inside reading. It comes out at about 3%,
# so the gaps are filled with the column mean in the next step.
missing_temp_share = df["temp_inside"].isna().sum() / len(df)
print(round(missing_temp_share, 2))

0.03


In [12]:
# Impute missing inside temperatures with the column mean (rounded to 2 decimals).
mean_temp = round(df["temp_inside"].mean(), 2)
# Assign the filled Series back to the frame. Calling fillna(..., inplace=True)
# on `df.temp_inside` operates on an intermediate object: under pandas
# Copy-on-Write the frame is left unchanged, and pandas 2.x warns about it.
df["temp_inside"] = df["temp_inside"].fillna(mean_temp)

In [13]:
# Confirm that no missing temp_inside values remain
df["temp_inside"].isna().sum()

0

## Temperature outside

In [14]:
# Distinct outside-temperature values, in order of first appearance
df["temp_outside"].unique()

array([12, 13, 15, 14, 10, 11,  6,  4,  9,  0,  5,  3,  8,  1,  2,  7, -3,
       17, 18, -5, 16, 19, 21, 20, 25, 23, 27, 24, 26, 22, 30, 31, 28])

## Specials

In [15]:
# Distinct free-text labels found in the specials column (NaN included)
df["specials"].unique()

array([nan, 'AC rain', 'AC', 'rain', 'snow', 'AC snow',
       'half rain half sun', 'sun', 'AC sun', 'sun ac', 'ac', 'AC Sun',
       'ac rain'], dtype=object)

In [16]:
# Fraction of rows where specials is missing
specials_missing_share = df["specials"].isna().sum() / len(df)
print(round(specials_missing_share, 2))

0.76


**The "specials" column doesn't give us relevant information, and 76% of its values are null, so we decided to drop it.**

## Gas Type

In [17]:
# Distinct gas types present in the data
df["gas_type"].unique()

array(['E10', 'SP98'], dtype=object)

In [18]:
# Store gas_type as a categorical and keep its integer codes in a new column.
# pandas orders inferred categories lexically, so E10 -> 0 and SP98 -> 1.
gas_type_cat = df["gas_type"].astype("category")
df["gas_type"] = gas_type_cat
df["gas_type_coded"] = gas_type_cat.cat.codes

## Air Conditioner

In [19]:
# Distinct values of the AC flag
df["AC"].unique()

array([0, 1])

## Rain

In [20]:
# Distinct values of the rain flag
df["rain"].unique()

array([0, 1])

## Sun

In [21]:
# Distinct values of the sun flag
df["sun"].unique()

array([0, 1])

## Refill liters

In [22]:
# Distinct refill volumes recorded (NaN where no refill was logged)
df.loc[:, "refill liters"].unique()

array([45. ,  nan, 37.6, 37.7, 38. , 38.3, 10. , 39. , 41. , 37. , 37.2])

In [23]:
# Fraction of rows with no refill volume
refill_liters_missing_share = df["refill liters"].isna().sum() / len(df)
print(round(refill_liters_missing_share, 2))

0.97


## Refill gas 

In [24]:
# Distinct gas types recorded at refill (NaN where no refill was logged)
df.loc[:, "refill gas"].unique()

array(['E10', nan, 'SP98'], dtype=object)

In [25]:
# Fraction of rows with no refill gas type
refill_gas_missing_share = df["refill gas"].isna().sum() / len(df)
print(round(refill_gas_missing_share, 2))

0.97


**97% of the values in "refill liters" and "refill gas" are missing.**

**We decided to drop "refill liters" and "refill gas" columns, as well as "specials"**

In [26]:
# Remove the three mostly-empty columns identified above
sparse_columns = ["specials", "refill gas", "refill liters"]
df = df.drop(sparse_columns, axis="columns")

## Resultant dataset after cleaning

In [27]:
# Column names, dtypes and non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   distance        388 non-null    float64 
 1   consume         388 non-null    float64 
 2   speed           388 non-null    int64   
 3   temp_inside     388 non-null    float64 
 4   temp_outside    388 non-null    int64   
 5   gas_type        388 non-null    category
 6   AC              388 non-null    int64   
 7   rain            388 non-null    int64   
 8   sun             388 non-null    int64   
 9   gas_type_coded  388 non-null    int8    
dtypes: category(1), float64(3), int64(5), int8(1)
memory usage: 25.3 KB


In [28]:
# Summary statistics for the numeric columns of the cleaned frame
df.describe()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,AC,rain,sun,gas_type_coded
count,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0
mean,19.652835,4.912371,41.927835,21.929536,11.358247,0.07732,0.123711,0.082474,0.587629
std,22.667837,1.033172,13.598524,0.994666,6.991542,0.267443,0.329677,0.275441,0.492897
min,1.3,3.3,14.0,19.0,-5.0,0.0,0.0,0.0,0.0
25%,11.8,4.3,32.75,21.5,7.0,0.0,0.0,0.0,0.0
50%,14.6,4.7,40.5,22.0,10.0,0.0,0.0,0.0,1.0
75%,19.0,5.3,50.0,22.5,16.0,0.0,0.0,0.0,1.0
max,216.1,12.2,90.0,25.5,31.0,1.0,1.0,1.0,1.0


In [30]:
# Persist the cleaned frame; index=False leaves the row index out of the file
df.to_csv("dataset/df_clean.csv", index=False)