In [1]:
#imports: 
import pandas as pd
import numpy as np

### Data Discovery & cleaning

In [5]:
# Load the two raw measurement exports (one CSV, one Excel) from the local data/ folder.
measurements_df = pd.read_csv(filepath_or_buffer='data/measurements.csv')
measurements_2_df = pd.read_excel(io='data/measurements2.xlsx')

In [6]:
# Dimensions of the CSV frame as (rows, columns).
measurements_df.shape

(388, 12)

In [8]:
# Preview the first five rows to get a feel for the raw values.
measurements_df.head(n=5)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45.0,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,


In [9]:
# Count missing values per column to see which fields need imputation.
measurements_df.isnull().sum()

distance           0
consume            0
speed              0
temp_inside       12
temp_outside       0
specials         295
gas_type           0
AC                 0
rain               0
sun                0
refill liters    375
refill gas       375
dtype: int64

In [17]:
# Inspect column dtypes; `object` columns hold non-numeric values and need
# conversion before any numeric analysis.
measurements_df.dtypes

distance         object
consume          object
speed             int64
temp_inside      object
temp_outside      int64
specials         object
gas_type         object
AC                int64
rain              int64
sun               int64
refill liters    object
refill gas       object
dtype: object

In [35]:

# These columns were read as `object`; they appear to hold numbers written with a
# decimal comma (e.g. "21,5"), so swap the comma for a dot and parse them as floats.
columns_to_convert = ['distance', 'consume', 'temp_inside', 'refill liters']
for col in columns_to_convert:
    # astype(str) turns NaN into the string 'nan', which the final astype(float)
    # parses back to NaN, so missing values survive the round trip. The cell is
    # also safe to re-run: already-converted floats pass through unchanged.
    measurements_df[col] = (
        measurements_df[col]
        .astype(str)
        .str.replace(',', '.', regex=False)
        .astype(float)
    )

# Impute by assigning the result back to the frame. `df[col].fillna(..., inplace=True)`
# is chained assignment: it emits a FutureWarning on pandas 2.x and leaves the frame
# unchanged under Copy-on-Write.
# A missing refill amount is recorded as 0.
measurements_df['refill liters'] = measurements_df['refill liters'].fillna(0)
# Missing inside temperatures are replaced by the column median.
temp_inside_median = measurements_df['temp_inside'].median()
measurements_df['temp_inside'] = measurements_df['temp_inside'].fillna(temp_inside_median)

In [31]:
# Number of recorded trips per fuel type.
measurements_df.loc[:, 'gas_type'].value_counts()

gas_type
SP98    228
E10     160
Name: count, dtype: int64

In [27]:
# Data consistency check: for trips driven on E10, which fuel is recorded in
# 'refill gas' on the rows that have a refill?

E10_df = measurements_df.loc[measurements_df['gas_type'].eq('E10')]
E10_df.loc[:, 'refill gas'].value_counts()

refill gas
E10    5
Name: count, dtype: int64

In [28]:
# Total of the 'refill liters' column over the E10 trips.
sum(E10_df['refill liters'])

198.0

In [29]:
# Total of the 'consume' column over the E10 trips, for comparison with the refill total.
sum(E10_df['consume'])

789.0000000000002

In [32]:
# Same consistency check for the trips driven on SP98.
SP98_df = measurements_df.loc[measurements_df['gas_type'].eq('SP98')]
SP98_df.loc[:, 'refill gas'].value_counts()

refill gas
SP98    8
Name: count, dtype: int64

In [33]:
# Total of the 'refill liters' column over the SP98 trips.
sum(SP98_df['refill liters'])

284.5

In [34]:
# Total of the 'consume' column over the SP98 trips, for comparison with the refill total.
sum(SP98_df['consume'])

1116.9999999999998

In [None]:
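# No visible inconsistencies: for both SP98 and E10, every recorded 'refill gas' matches the trip's 'gas_type'.
# I expected the total of 'refill liters' to be close to the total of 'consume', but it is not
# (NOTE: if 'consume' is a rate rather than a volume, the two totals are not directly comparable - TODO confirm).
# So we'll conduct the analysis on 'consume'.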

### Exploratory Data Analysis

In [36]:
# Keep only the numeric columns (this is a DataFrame subset, not a list of column names).
numerical_columns = measurements_df.select_dtypes(include=np.number)

In [37]:
# Summary statistics; on a mixed-dtype frame describe() reports only the numeric columns by default.
measurements_df.describe()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,AC,rain,sun,refill liters
count,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0
mean,19.652835,4.912371,41.927835,21.931701,11.358247,0.07732,0.123711,0.082474,1.243557
std,22.667837,1.033172,13.598524,0.994741,6.991542,0.267443,0.329677,0.275441,6.856419
min,1.3,3.3,14.0,19.0,-5.0,0.0,0.0,0.0,0.0
25%,11.8,4.3,32.75,21.5,7.0,0.0,0.0,0.0,0.0
50%,14.6,4.7,40.5,22.0,10.0,0.0,0.0,0.0,0.0
75%,19.0,5.3,50.0,22.5,16.0,0.0,0.0,0.0,0.0
max,216.1,12.2,90.0,25.5,31.0,1.0,1.0,1.0,45.0
