In [8]:
import pandas as pd
import seaborn as sns

## Initial exploration:

#### Let's import the dataset:

In [2]:
# Load the raw measurements CSV (relative path) into the frame used by every cell below.
gas_data = pd.read_csv(filepath_or_buffer="../data/measurements.csv")

#### Understand dataset shape and columns:

In [3]:
# (rows, columns) of the raw frame.
gas_data.shape

(388, 12)

In [5]:
# Per-column non-null counts and dtypes, to spot missing values and columns read as text.
gas_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   distance       388 non-null    object
 1   consume        388 non-null    object
 2   speed          388 non-null    int64 
 3   temp_inside    376 non-null    object
 4   temp_outside   388 non-null    int64 
 5   specials       93 non-null     object
 6   gas_type       388 non-null    object
 7   AC             388 non-null    int64 
 8   rain           388 non-null    int64 
 9   sun            388 non-null    int64 
 10  refill liters  13 non-null     object
 11  refill gas     13 non-null     object
dtypes: int64(5), object(7)
memory usage: 36.5+ KB


In [6]:
# Peek at five random rows; the fixed seed keeps this preview identical across re-runs.
gas_data.sample(5, random_state=42)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
299,187,42,65,25,18,sun ac,SP98,1,0,1,,
238,118,46,40,225,2,,E10,0,0,0,,
317,419,47,53,22,14,,SP98,0,0,0,,
343,16,38,42,22,17,,E10,0,0,0,,
151,99,5,28,215,9,,E10,0,0,0,,


#### Things to explore:
- Check "Temp_inside" nulls. What should I do with them? Can I replace them, or do I have to delete those rows? Change its column type.
- Check "Distance" column type. Clean it and change its type.
- Check "Consume" column type. Clean it and change its type.
- Explore the "Specials" column further to understand it in depth.
- What should I do with the "refill liters" and "refill gas" columns?

## Let's explore this dataset deeper:

In [7]:
# Summary statistics; only the numeric columns are included by default.
gas_data.describe()

Unnamed: 0,speed,temp_outside,AC,rain,sun
count,388.0,388.0,388.0,388.0,388.0
mean,41.927835,11.358247,0.07732,0.123711,0.082474
std,13.598524,6.991542,0.267443,0.329677,0.275441
min,14.0,-5.0,0.0,0.0,0.0
25%,32.75,7.0,0.0,0.0,0.0
50%,40.5,10.0,0.0,0.0,0.0
75%,50.0,16.0,0.0,0.0,0.0
max,90.0,31.0,1.0,1.0,1.0


### Temp_inside column:

#### How many nulls does "temp_inside" have?

In [17]:
# Distinct raw values of temp_inside (NaN included) to see how the column is encoded.
gas_data["temp_inside"].unique()

array(['21,5', '22,5', '20', nan, '21', '20,5', '23', '23,5', '25', '24',
       '22', '19', '24,5', '25,5'], dtype=object)

In [29]:
# Count null (True) versus non-null (False) entries in temp_inside.
gas_data["temp_inside"].isna().value_counts()

False    376
True      12
Name: temp_inside, dtype: int64

In [31]:
# Show the full rows whose temp_inside is missing.
gas_data.loc[gas_data["temp_inside"].isna()]

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
93,124,47,43,,10,,SP98,0,0,0,,
95,118,53,52,,11,,SP98,0,0,0,,
97,157,53,33,,9,,SP98,0,0,0,,
98,129,57,35,,9,,SP98,0,0,0,,
99,64,44,37,,10,,SP98,0,0,0,,
100,53,41,34,,9,,SP98,0,0,0,,
102,188,5,62,,9,rain,SP98,0,1,0,,
201,222,38,42,,15,,SP98,0,0,0,,
203,126,41,33,,17,,SP98,0,0,0,,
261,245,39,50,,15,sun,E10,0,0,1,,


#### 12 of 388 rows (about 3% of the rows) have a null "temp_inside". In the cleaning phase I should decide whether to fill those values or remove the rows.

### Distance column:

#### Let's explore the "distance" column:

In [30]:
# Distinct raw values of distance, to see why the column was read as text.
gas_data["distance"].unique()

array(['28', '12', '11,2', '12,9', '18,5', '8,3', '7,8', '12,3', '4,9',
       '11,9', '12,4', '11,8', '24,7', '17,3', '33,4', '25,9', '25,3',
       '14,2', '17,9', '18,4', '18,3', '32,6', '19', '12,1', '20', '4,5',
       '11,7', '10,2', '5,4', '2', '16', '27,3', '10,6', '11,6', '13,1',
       '6,1', '153,5', '2,9', '2,1', '9,8', '6,8', '14', '13,9', '9,7',
       '24,8', '34,8', '5,2', '10,5', '13,2', '13', '12,2', '12,5',
       '15,7', '6,4', '5,3', '26,2', '18,8', '22,9', '162,7', '16,6',
       '15,9', '5,1', '22,4', '31,1', '16,1', '4,2', '17,4', '23,5', '7',
       '20,1', '20,8', '1,7', '35,9', '36,9', '16,8', '9,9', '36,6',
       '44,9', '21,6', '39,4', '26,6', '53,2', '18,9', '43,5', '16,4',
       '21,1', '22,7', '44,4', '35,8', '40,6', '14,1', '58,7', '16,2',
       '31,8', '51,6', '38,6', '81,2', '130,3', '67,2', '43,7', '56,1',
       '39', '38,5', '28,2', '19,6', '22,2', '13,6', '12,6', '8,7', '7,9',
       '2,4', '18,1', '1,3', '13,4', '12,8', '29', '31,4', '27,1', '

#### It seems that, after replacing "," with ".", the column can be converted to a float type.

### Consume column:

In [36]:
# Distinct raw values of consume, to see why the column was read as text.
gas_data["consume"].unique()

array(['5', '4,2', '5,5', '3,9', '4,5', '6,4', '4,4', '5,3', '5,6', '4,6',
       '5,9', '5,1', '4,7', '4,9', '5,7', '4,1', '5,8', '4,8', '4,3',
       '5,2', '7,4', '6,5', '4', '3,3', '9,9', '6,1', '6,2', '7,9',
       '12,2', '5,4', '3,6', '6,9', '8,7', '6,3', '6', '10,8', '8,1',
       '7,1', '3,8', '9', '3,7', '11,5', '6,6'], dtype=object)

#### Same as the "distance" column: after replacing "," with ".", it can be converted to a float type.

### Specials column:

In [37]:
# Frequency of each free-text label in specials (nulls are excluded by default).
gas_data["specials"].value_counts()

rain                  32
sun                   27
AC rain                9
ac                     8
AC                     6
snow                   3
sun ac                 3
AC sun                 1
ac rain                1
AC snow                1
AC Sun                 1
half rain half sun     1
Name: specials, dtype: int64

In [38]:
# Distinct labels in specials, NaN included, to spot spelling and casing variants.
gas_data["specials"].unique()

array([nan, 'AC rain', 'AC', 'rain', 'snow', 'AC snow',
       'half rain half sun', 'sun', 'AC sun', 'sun ac', 'ac', 'AC Sun',
       'ac rain'], dtype=object)

In [39]:
# Count null (True) versus non-null (False) entries in specials.
gas_data["specials"].isna().value_counts()

True     295
False     93
Name: specials, dtype: int64

#### This information is already in the AC, rain and sun columns. I should use this column only to verify those columns and then drop it.

### I will separate the "refill liters" and "refill gas" columns from this dataset and work with them as a separate dataframe. This is because of the following note in the Kaggle dataset description:

#### "...I have also two columns saying how much and which gas type I was buying. Careful with those. The numbers don't add exactly up, because I note only the rides that occur under certain conditions: If the car was not cooling down enough to have another independent measure from the one before, i don't note it."

### We have some missing information there, so I will be extremely careful with this dataframe in particular.

### Let's check "gas_type" column:

In [40]:
# Distinct fuel labels present in gas_type.
gas_data["gas_type"].unique()

array(['E10', 'SP98'], dtype=object)

#### Super clean column.

### "AC", "sun" and "rain" columns:

In [41]:
# Distinct values of the AC flag.
gas_data["AC"].unique()

array([0, 1])

In [42]:
# Distinct values of the sun flag.
gas_data["sun"].unique()

array([0, 1])

In [43]:
# Distinct values of the rain flag.
gas_data["rain"].unique()

array([0, 1])

#### "AC", "sun" and "rain" columns are clean as an integer type with binary information.