# Exploring data

In [1]:
import pandas as pd

In [2]:
# First export: parsed with the comma delimiter (pandas' default), spelled
# out here so the contrast with the semicolon-delimited file is explicit.
data1 = pd.read_csv("input/measurements.csv", sep=",")
# Preview the first rows to check how the columns were parsed.
data1.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45.0,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,


In [3]:
# Second export: same columns, but the fields are separated by semicolons.
data2 = pd.read_csv("input/measurements2.csv", sep=";")
# Preview the first rows for a side-by-side comparison with data1.
data2.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,


In [4]:
# DataFrame.info() prints its report and returns None. Call it once per
# statement so the cell does not also echo a meaningless `(None, None)` tuple.
data1.info()
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   distance       388 non-null    object
 1   consume        388 non-null    object
 2   speed          388 non-null    int64 
 3   temp_inside    376 non-null    object
 4   temp_outside   388 non-null    int64 
 5   specials       93 non-null     object
 6   gas_type       388 non-null    object
 7   AC             388 non-null    int64 
 8   rain           388 non-null    int64 
 9   sun            388 non-null    int64 
 10  refill liters  13 non-null     object
 11  refill gas     13 non-null     object
dtypes: int64(5), object(7)
memory usage: 25.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   distance       388 non-null    float64
 1   

(None, None)

`data1` and `data2` contain the same measurements; however, the dtypes in `data2` are more convenient: its decimal columns are parsed as floats instead of strings, which makes them easier to operate on.

In [29]:
# Working frame for the cleaning steps below, read from the
# semicolon-delimited export.
df = pd.read_csv("input/measurements2.csv", sep=";")

## Null Values

In [30]:
# Count the missing values in every column ...
null_cols = df.isna().sum()
# ... and display only the columns that actually have some.
null_cols.loc[null_cols.gt(0)]

temp_inside       12
specials         295
refill liters    375
refill gas       375
dtype: int64

- The `refill liters` column seems of little use for now, but it may become useful when enriching the data, so let's set it aside for the moment.
- The `refill gas` column duplicates information that the `gas_type` column already contains in more detail, so we can delete it.

In [31]:
# Drop the redundant 'refill gas' column. Rebinding `df` (instead of
# `inplace=True`) together with errors="ignore" keeps this cell re-runnable:
# a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["refill gas"], errors="ignore")

## Standardize data

### Specials

In [33]:
# Frequency of each raw `specials` label (missing values are not counted).
df["specials"].value_counts()

rain                  32
sun                   27
AC rain                9
ac                     8
AC                     6
snow                   3
sun ac                 3
ac rain                1
AC sun                 1
AC snow                1
half rain half sun     1
AC Sun                 1
Name: specials, dtype: int64

**Since there are specific columns for AC, sun and rain, we don't really need to keep that data duplicated here; however, we will leave it in place for the moment.**

In [34]:
# Collapse the free-text `specials` labels into five canonical categories.
# Matching is on the exact cell value; labels that already equal their
# canonical form ("sun", "rain", "AC") need no entry and stay untouched.
canonical_specials = {
    "half rain half sun": "sun",
    "snow": "rain",  # snow is grouped together with rain
    "ac": "AC",
    "AC rain": "AC and rain",
    "AC snow": "AC and rain",
    "ac rain": "AC and rain",
    "sun ac": "AC and sun",
    "AC sun": "AC and sun",
    "AC Sun": "AC and sun",
}
df["specials"] = df["specials"].replace(canonical_specials)

In [35]:
# Re-check the label frequencies after normalisation.
df["specials"].value_counts()

rain           35
sun            28
AC             14
AC and rain    11
AC and sun      5
Name: specials, dtype: int64

In [37]:
# Preview the first rows of the frame after the clean-up steps above.
df.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0
1,12.0,4.2,30,21.5,13,,E10,0,0,0,
2,11.2,5.5,38,21.5,15,,E10,0,0,0,
3,12.9,3.9,36,21.5,14,,E10,0,0,0,
4,18.5,4.5,46,21.5,15,,E10,0,0,0,


# Exterior Temperature
I consider it important to know whether it is cold or not. The criterion is that it is cold if the temperature outside is not higher than the temperature inside.

In [46]:
# Signed difference between outside and inside temperature: a value <= 0
# means it is not warmer outside than inside. Rows with a missing
# `temp_inside` produce NaN here.
df["external_temp"] = df["temp_outside"] - df["temp_inside"]

In [48]:
# Label every row "COLD" when it is not warmer outside than inside
# (difference <= 0) and "HOT" otherwise.
#
# The difference is recomputed from the source columns so this cell can be
# re-run safely even after `external_temp` already holds labels.
temp_delta = df["temp_outside"] - df["temp_inside"]

# One vectorized pass instead of a full-column replace per row. Rows whose
# difference is NaN (missing `temp_inside`) are kept as missing rather than
# being silently labelled "HOT" by a failed `<= 0` comparison.
df["external_temp"] = (
    temp_delta.le(0)
    .map({True: "COLD", False: "HOT"})
    .where(temp_delta.notna())
)

## This dataset is ready for its first plots

In [49]:
# Display the cleaned frame, including the new `external_temp` column.
df

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,external_temp
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,COLD
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,COLD
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,COLD
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,COLD
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,COLD
...,...,...,...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39,24.5,18,,SP98,0,0,0,,COLD
384,16.1,4.3,38,25.0,31,AC,SP98,1,0,0,,HOT
385,16.0,3.8,45,25.0,19,,SP98,0,0,0,,COLD
386,15.4,4.6,42,25.0,31,AC,SP98,1,0,0,,HOT


In [51]:
# Persist the cleaned frame for later analysis steps.
# NOTE(review): with the default arguments to_csv also writes the row index as
# an unnamed first column; confirm downstream readers expect that before
# switching to index=False.
df.to_csv("input/clean_measures.csv")