In [17]:
import pandas as pd
import seaborn as sns

#### Importing dataset:

In [18]:
gas_data = pd.read_csv("../data/measurements.csv")

### Let's start cleaning it:

In [19]:
gas_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   distance       388 non-null    object
 1   consume        388 non-null    object
 2   speed          388 non-null    int64 
 3   temp_inside    376 non-null    object
 4   temp_outside   388 non-null    int64 
 5   specials       93 non-null     object
 6   gas_type       388 non-null    object
 7   AC             388 non-null    int64 
 8   rain           388 non-null    int64 
 9   sun            388 non-null    int64 
 10  refill liters  13 non-null     object
 11  refill gas     13 non-null     object
dtypes: int64(5), object(7)
memory usage: 36.5+ KB


#### Distance and consume columns:

In [20]:
gas_data.distance = gas_data.distance.str.replace(",",".")

In [21]:
gas_data.distance.unique()

array(['28', '12', '11.2', '12.9', '18.5', '8.3', '7.8', '12.3', '4.9',
       '11.9', '12.4', '11.8', '24.7', '17.3', '33.4', '25.9', '25.3',
       '14.2', '17.9', '18.4', '18.3', '32.6', '19', '12.1', '20', '4.5',
       '11.7', '10.2', '5.4', '2', '16', '27.3', '10.6', '11.6', '13.1',
       '6.1', '153.5', '2.9', '2.1', '9.8', '6.8', '14', '13.9', '9.7',
       '24.8', '34.8', '5.2', '10.5', '13.2', '13', '12.2', '12.5',
       '15.7', '6.4', '5.3', '26.2', '18.8', '22.9', '162.7', '16.6',
       '15.9', '5.1', '22.4', '31.1', '16.1', '4.2', '17.4', '23.5', '7',
       '20.1', '20.8', '1.7', '35.9', '36.9', '16.8', '9.9', '36.6',
       '44.9', '21.6', '39.4', '26.6', '53.2', '18.9', '43.5', '16.4',
       '21.1', '22.7', '44.4', '35.8', '40.6', '14.1', '58.7', '16.2',
       '31.8', '51.6', '38.6', '81.2', '130.3', '67.2', '43.7', '56.1',
       '39', '38.5', '28.2', '19.6', '22.2', '13.6', '12.6', '8.7', '7.9',
       '2.4', '18.1', '1.3', '13.4', '12.8', '29', '31.4', '27.1', '

In [22]:
gas_data.distance = gas_data.distance.astype(float)

In [23]:
gas_data.consume = gas_data.consume.str.replace(",",".")
gas_data.consume = gas_data.consume.astype(float)
gas_data.consume.unique()

array([ 5. ,  4.2,  5.5,  3.9,  4.5,  6.4,  4.4,  5.3,  5.6,  4.6,  5.9,
        5.1,  4.7,  4.9,  5.7,  4.1,  5.8,  4.8,  4.3,  5.2,  7.4,  6.5,
        4. ,  3.3,  9.9,  6.1,  6.2,  7.9, 12.2,  5.4,  3.6,  6.9,  8.7,
        6.3,  6. , 10.8,  8.1,  7.1,  3.8,  9. ,  3.7, 11.5,  6.6])

#### Both columns now are clean after replacing "," for "." and transforming their type to "float".

#### Special column:

In [26]:
gas_data.specials.unique()

array([nan, 'AC rain', 'AC', 'rain', 'snow', 'AC snow',
       'half rain half sun', 'sun', 'AC sun', 'sun ac', 'ac', 'AC Sun',
       'ac rain'], dtype=object)

#### First, let's get everything in upper letters:

In [31]:
gas_data.specials = gas_data.specials.str.upper()

In [32]:
gas_data.specials.unique()

array([nan, 'AC RAIN', 'AC', 'RAIN', 'SNOW', 'AC SNOW',
       'HALF RAIN HALF SUN', 'SUN', 'AC SUN', 'SUN AC'], dtype=object)

#### Let's compare binary columns with "specials":

In [34]:
gas_data.specials.value_counts()

RAIN                  32
SUN                   27
AC                    14
AC RAIN               10
SUN AC                 3
SNOW                   3
AC SUN                 2
AC SNOW                1
HALF RAIN HALF SUN     1
Name: specials, dtype: int64

#### First, AC column with "AC" word in "special" column: 

In [71]:
((gas_data.specials.str.contains("AC")) & (gas_data.AC == 0)).value_counts()

False    387
True       1
dtype: int64

#### We found one row that doesn't match:

In [81]:
gas_data[((gas_data.specials.str.contains("AC")) & (gas_data.AC == 0))]

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
334,129.7,4.6,58,24,21,AC,E10,0,0,0,,


In [87]:
gas_data.loc[334, "AC"] = 1

In [90]:
gas_data.loc[334]

distance         129.7
consume            4.6
speed               58
temp_inside         24
temp_outside        21
specials            AC
gas_type           E10
AC                   1
rain                 0
sun                  0
refill liters      NaN
refill gas         NaN
Name: 334, dtype: object

#### Corrected.

#### I will do the same with RAIN and SUN:

In [97]:
((gas_data.specials.str.contains("RAIN")) & (gas_data.rain == 0)).value_counts()

False    388
dtype: int64

#### "rain" column is OK

In [100]:
((gas_data.specials.str.contains("SUN")) & (gas_data.sun == 0)).value_counts()

False    387
True       1
dtype: int64

In [101]:
gas_data[((gas_data.specials.str.contains("SUN")) & (gas_data.sun == 0))]

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
191,43.7,4.7,44,22,9,HALF RAIN HALF SUN,SP98,0,1,0,10,SP98


#### This one has "HALF RAIN HALF SUN". I will put 0 in both columns because we don't have a define weather for our analysis:

In [102]:
gas_data.loc[191, "rain"] = 0

In [103]:
gas_data.loc[191]

distance                       43.7
consume                         4.7
speed                            44
temp_inside                      22
temp_outside                      9
specials         HALF RAIN HALF SUN
gas_type                       SP98
AC                                0
rain                              0
sun                               0
refill liters                    10
refill gas                     SP98
Name: 191, dtype: object

#### Now "rain" and "sun" columns are ok.

#### I will add a "snow" column and eliminate "special" column because we have already all that data in the others.

In [106]:
gas_data["snow"] = gas_data.specials.str.contains("SNOW")

In [125]:
gas_data["snow"].fillna(False, inplace=True)

In [129]:
gas_data.snow = gas_data.snow.astype(int)

In [131]:
gas_data.snow.value_counts()

0    384
1      4
Name: snow, dtype: int64

#### Now we have the new column ready

In [136]:
gas_data.drop(columns="specials", axis=1, inplace=True)

In [139]:
gas_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   distance       388 non-null    float64
 1   consume        388 non-null    float64
 2   speed          388 non-null    int64  
 3   temp_inside    376 non-null    object 
 4   temp_outside   388 non-null    int64  
 5   gas_type       388 non-null    object 
 6   AC             388 non-null    int64  
 7   rain           388 non-null    int64  
 8   sun            388 non-null    int64  
 9   refill liters  13 non-null     object 
 10  refill gas     13 non-null     object 
 11  snow           388 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 36.5+ KB


#### Finally, let's check what we should do with "temp_inside" column nulls:

In [143]:
gas_data[gas_data.temp_inside.isna()]

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun,refill liters,refill gas,snow
93,12.4,4.7,43,,10,SP98,0,0,0,,,0
95,11.8,5.3,52,,11,SP98,0,0,0,,,0
97,15.7,5.3,33,,9,SP98,0,0,0,,,0
98,12.9,5.7,35,,9,SP98,0,0,0,,,0
99,6.4,4.4,37,,10,SP98,0,0,0,,,0
100,5.3,4.1,34,,9,SP98,0,0,0,,,0
102,18.8,5.0,62,,9,SP98,0,1,0,,,0
201,22.2,3.8,42,,15,SP98,0,0,0,,,0
203,12.6,4.1,33,,17,SP98,0,0,0,,,0
261,24.5,3.9,50,,15,E10,0,0,1,,,0


#### Are temperature inside and temperature outside correlated? If they are, we can use the mean with that temperature outside to estimate the temperature inside:

In [145]:
cols = ["temp_inside", "temp_outside"]
temp_data = gas_data[cols]

In [152]:
temp_data = temp_data.dropna()

In [155]:
temp_data.temp_inside.unique()

array(['21,5', '22,5', '20', '21', '20,5', '23', '23,5', '25', '24', '22',
       '19', '24,5', '25,5'], dtype=object)

In [156]:
temp_data.temp_inside = temp_data.temp_inside.str.replace(",",".")
temp_data.temp_inside = temp_data.temp_inside.astype(float)
temp_data.temp_inside.unique()

array([21.5, 22.5, 20. , 21. , 20.5, 23. , 23.5, 25. , 24. , 22. , 19. ,
       24.5, 25.5])

In [157]:
temp_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376 entries, 0 to 387
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   temp_inside   376 non-null    float64
 1   temp_outside  376 non-null    int64  
dtypes: float64(1), int64(1)
memory usage: 8.8 KB


In [158]:
temp_data.corr()

Unnamed: 0,temp_inside,temp_outside
temp_inside,1.0,0.361308
temp_outside,0.361308,1.0


#### They don't have an strong correlation, i will just eliminate those rows.

In [177]:
gas_data = gas_data[gas_data.temp_inside.notna()]

In [181]:
gas_data.temp_inside.unique()

array(['21,5', '22,5', '20', '21', '20,5', '23', '23,5', '25', '24', '22',
       '19', '24,5', '25,5'], dtype=object)

In [182]:
gas_data.temp_inside = gas_data.temp_inside.str.replace(",",".")
gas_data.temp_inside = gas_data.temp_inside.astype(float)
gas_data.temp_inside.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


array([21.5, 22.5, 20. , 21. , 20.5, 23. , 23.5, 25. , 24. , 22. , 19. ,
       24.5, 25.5])

In [184]:
gas_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   distance       376 non-null    float64
 1   consume        376 non-null    float64
 2   speed          376 non-null    int64  
 3   temp_inside    376 non-null    float64
 4   temp_outside   376 non-null    int64  
 5   gas_type       376 non-null    object 
 6   AC             376 non-null    int64  
 7   rain           376 non-null    int64  
 8   sun            376 non-null    int64  
 9   refill liters  13 non-null     object 
 10  refill gas     13 non-null     object 
 11  snow           376 non-null    int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 38.2+ KB


### Finally, i will separate columns "refill liters" and "refill gas". I consider that this information is useful but it's not connected well to the the rest of the dataset. We have only 13 rows complete and the owner of this dataset said that it has some missing information:

In [185]:
gas_data.rename(columns = {"refill liters": "refill_liters", "refill gas": "refill_gas"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [193]:
cols_refill = ["refill_liters", "refill_gas"]
refill_data = gas_data[cols_refill]

In [194]:
consume_data = gas_data.drop(axis=1, columns=cols_refill)

In [196]:
consume_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376 entries, 0 to 387
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   distance      376 non-null    float64
 1   consume       376 non-null    float64
 2   speed         376 non-null    int64  
 3   temp_inside   376 non-null    float64
 4   temp_outside  376 non-null    int64  
 5   gas_type      376 non-null    object 
 6   AC            376 non-null    int64  
 7   rain          376 non-null    int64  
 8   sun           376 non-null    int64  
 9   snow          376 non-null    int64  
dtypes: float64(3), int64(6), object(1)
memory usage: 32.3+ KB


#### "consume_data" dataframe it's clean. Let's clean "refill_data" eliminating those null rows and changing it's type:

In [197]:
refill_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376 entries, 0 to 387
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   refill_liters  13 non-null     object
 1   refill_gas     13 non-null     object
dtypes: object(2)
memory usage: 8.8+ KB


In [205]:
refill_data = refill_data[(refill_data.refill_liters.notna()) & (refill_data.refill_gas.notna())]

In [207]:
refill_data.refill_liters = refill_data.refill_liters.str.replace(",",".")
refill_data.refill_liters = refill_data.refill_liters.astype(float)
refill_data.refill_liters.unique()

array([45. , 37.6, 37.7, 38. , 38.3, 10. , 39. , 41. , 37. , 37.2])

In [210]:
refill_data

Unnamed: 0,refill_liters,refill_gas
0,45.0,E10
44,37.6,SP98
82,37.7,SP98
106,45.0,SP98
139,38.0,E10
171,38.3,E10
191,10.0,SP98
192,39.0,SP98
234,39.0,E10
274,41.0,SP98


In [209]:
refill_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 0 to 349
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   refill_liters  13 non-null     float64
 1   refill_gas     13 non-null     object 
dtypes: float64(1), object(1)
memory usage: 312.0+ bytes


#### Clean both datasets!

### I will export them into my "data" folder to proceed with my analysis:

In [211]:
consume_data.to_csv("../data/consume_data.csv")

In [212]:
refill_data.to_csv("../data/refill_data.csv")

#### I will export them with their indexes just in case i want to connect them in the future.