# Import libraries

In [1]:
import pandas as pd
import re

# First cleaning

In [2]:
# Load the raw measurements from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="measurements.csv")

In [3]:
# Display the raw frame; the rendering is truncated to the first and last rows.
df

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
383,16,37,39,245,18,,SP98,0,0,0,,
384,161,43,38,25,31,AC,SP98,1,0,0,,
385,16,38,45,25,19,,SP98,0,0,0,,
386,154,46,42,25,31,AC,SP98,1,0,0,,


In [4]:
# Inspect dtypes and non-null counts to see which columns need conversion or imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   distance       388 non-null    object
 1   consume        388 non-null    object
 2   speed          388 non-null    int64 
 3   temp_inside    376 non-null    object
 4   temp_outside   388 non-null    int64 
 5   specials       93 non-null     object
 6   gas_type       388 non-null    object
 7   AC             388 non-null    int64 
 8   rain           388 non-null    int64 
 9   sun            388 non-null    int64 
 10  refill liters  13 non-null     object
 11  refill gas     13 non-null     object
dtypes: int64(5), object(7)
memory usage: 36.5+ KB


In [5]:
# Drop the columns "specials", "refill liters" and "refill gas": they are mostly empty (93, 13 and 13 non-null values out of 388 rows), so they offer little information.

In [6]:
# Remove the sparsely populated columns. Rebinding `df` (rather than inplace=True) together
# with errors="ignore" keeps this cell re-runnable: a second execution is a no-op instead of
# raising KeyError because the columns are already gone.
df = df.drop(columns=['specials', 'refill liters', 'refill gas'], errors="ignore")

In [7]:
# Count the missing values per remaining column.
df.isnull().sum()

distance         0
consume          0
speed            0
temp_inside     12
temp_outside     0
gas_type         0
AC               0
rain             0
sun              0
dtype: int64

In [8]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun
0,28,5,26,215,12,E10,0,0,0
1,12,42,30,215,13,E10,0,0,0
2,112,55,38,215,15,E10,0,0,0
3,129,39,36,215,14,E10,0,0,0
4,185,45,46,215,15,E10,0,0,0


# Cleaning by columns

### Column distance

In [9]:
# Convert this column to numeric values (it is stored as text that uses a comma as the decimal separator).

In [10]:
# Distinct raw values of the distance column, in order of first appearance.
distance = list(df["distance"].unique())

In [11]:
# Build the dot-decimal version of every distinct distance string (same order as `distance`).
lista = [re.sub(",", ".", raw_value) for raw_value in distance]

In [12]:
# Map each original distance string to its dot-decimal counterpart.
dict_from_list = {raw_value: fixed_value for raw_value, fixed_value in zip(distance, lista)}

In [13]:
# Apply the mapping to the distance column only; all other columns are left untouched.
df = df.replace(to_replace={"distance": dict_from_list})

In [14]:
# Parse the dot-decimal strings as numbers; anything unparseable becomes NaN (errors="coerce").
df["distance"] = pd.to_numeric(df["distance"], errors="coerce")

In [15]:
# Re-check the missing values: a non-zero count for distance would mean that some entries
# could not be parsed and were coerced to NaN.
df.isnull().sum()

distance         0
consume          0
speed            0
temp_inside     12
temp_outside     0
gas_type         0
AC               0
rain             0
sun              0
dtype: int64

### Column consume

In [16]:
# Convert this column to numeric values, following the same procedure as for the distance column.

In [17]:
# Distinct raw values of the consume column, in order of first appearance.
consume = list(df["consume"].unique())

In [18]:
# Build the dot-decimal version of every distinct consume string (same order as `consume`).
lista1 = [re.sub(",", ".", raw_value) for raw_value in consume]

In [19]:
# Map each original consume string to its dot-decimal counterpart.
dict_from_list1 = {raw_value: fixed_value for raw_value, fixed_value in zip(consume, lista1)}

In [20]:
# Apply the mapping to the consume column only; all other columns are left untouched.
df = df.replace(to_replace={"consume": dict_from_list1})

In [21]:
# Parse the dot-decimal strings as numbers; anything unparseable becomes NaN (errors="coerce").
df["consume"] = pd.to_numeric(df["consume"], errors="coerce")

### Column speed

In [22]:
# List the distinct speed values to check what the column contains.
df.speed.unique()

array([26, 30, 38, 36, 46, 50, 43, 40, 42, 59, 58, 24, 32, 39, 37, 62, 57,
       21, 28, 29, 35, 51, 23, 55, 44, 25, 63, 61, 82, 52, 18, 41, 20, 56,
       45, 22, 60, 33, 34, 71, 75, 66, 27, 49, 14, 80, 53, 47, 73, 69, 85,
       67, 54, 31, 48, 16, 88, 65, 90, 87])

In [23]:
# Make sure speed is numeric; unparseable entries would become NaN (errors="coerce").
df["speed"] = pd.to_numeric(df["speed"], errors="coerce")

### Column temp_inside

In [24]:
# Convert this column to numeric values, following the same procedure as for the distance and consume columns.

In [25]:
# Distinct raw values of the temp_inside column (this includes NaN for the missing readings).
temp = list(df["temp_inside"].unique())

In [26]:
# Show the distinct raw values before conversion.
print(temp)

['21,5', '22,5', '20', nan, '21', '20,5', '23', '23,5', '25', '24', '22', '19', '24,5', '25,5']


In [27]:
# Swap the decimal comma for a dot in every string value. Non-string entries (the NaN that
# marks missing readings) are passed through unchanged. An explicit type check is used rather
# than a catch-all `except:` so that unrelated errors (or a KeyboardInterrupt) are not hidden.
lista2 = [
    re.sub(",", ".", raw_value) if isinstance(raw_value, str) else raw_value
    for raw_value in temp
]

In [28]:
# Show the converted values to verify that the commas became dots and NaN was kept as is.
print(lista2)

['21.5', '22.5', '20', nan, '21', '20.5', '23', '23.5', '25', '24', '22', '19', '24.5', '25.5']


In [29]:
# Map each original temp_inside value to its dot-decimal counterpart (NaN maps to NaN).
dict_from_list2 = {raw_value: fixed_value for raw_value, fixed_value in zip(temp, lista2)}

In [30]:
# Apply the mapping to the temp_inside column only; all other columns are left untouched.
df = df.replace(to_replace={"temp_inside": dict_from_list2})

In [31]:
# Parse the dot-decimal strings as numbers; anything unparseable becomes NaN (errors="coerce").
df["temp_inside"] = pd.to_numeric(df["temp_inside"], errors="coerce")

In [32]:
# Fill the NaN values with the column mean (rounded to one decimal place).

In [33]:
# Mean inside temperature, rounded to one decimal place; NaN entries are skipped by .mean().
mean = round(df["temp_inside"].mean(), 1)

In [34]:
# Impute the missing inside temperatures with the rounded column mean.
df["temp_inside"] = df["temp_inside"].fillna(value=mean)

In [36]:
# Confirm that distance, consume and temp_inside are now numeric and have no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   distance      388 non-null    float64
 1   consume       388 non-null    float64
 2   speed         388 non-null    int64  
 3   temp_inside   388 non-null    float64
 4   temp_outside  388 non-null    int64  
 5   gas_type      388 non-null    object 
 6   AC            388 non-null    int64  
 7   rain          388 non-null    int64  
 8   sun           388 non-null    int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 27.4+ KB


### Column gas type

In [37]:
# Encode the gas type as numeric dummy (one-hot) columns.

In [38]:
# List the distinct gas types present in the data.
df.gas_type.unique()

array(['E10', 'SP98'], dtype=object)

In [39]:
# One-hot encode the gas type (one 0/1 column per distinct value). The dtype is pinned because
# pandas >= 2.0 returns boolean dummies by default, which would be displayed and exported as
# True/False instead of 1/0.
gas_type_dum = pd.get_dummies(df['gas_type'], dtype="uint8")

In [40]:
# Display the dummy columns (one per gas type).
gas_type_dum

Unnamed: 0,E10,SP98
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
383,0,1
384,0,1
385,0,1
386,0,1


In [41]:
# Append the dummy columns to the frame. Any dummy columns left over from a previous run of
# this cell are dropped first, so re-running it does not produce duplicate columns.
df = pd.concat(
    [df.drop(columns=gas_type_dum.columns, errors="ignore"), gas_type_dum],
    axis=1,
)

In [42]:
# Preview the frame with the dummy columns appended.
df.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun,E10,SP98
0,28.0,5.0,26,21.5,12,E10,0,0,0,1,0
1,12.0,4.2,30,21.5,13,E10,0,0,0,1,0
2,11.2,5.5,38,21.5,15,E10,0,0,0,1,0
3,12.9,3.9,36,21.5,14,E10,0,0,0,1,0
4,18.5,4.5,46,21.5,15,E10,0,0,0,1,0


### Column AC

In [43]:
# This column is already numeric

In [44]:
# List the distinct values to check that the column only contains 0 and 1.
df.AC.unique()

array([0, 1])

### Column Rain

In [45]:
# This column is already numeric

In [46]:
# List the distinct values to check that the column only contains 0 and 1.
df.rain.unique()

array([0, 1])

### Column Sun

In [47]:
# This column is already numeric

In [48]:
# List the distinct values to check that the column only contains 0 and 1.
df.sun.unique()

array([0, 1])

In [49]:
# (rows, columns) of the cleaned frame.
df.shape

(388, 11)

In [50]:
# Final check that no column contains missing values before exporting.
df.isnull().sum()

distance        0
consume         0
speed           0
temp_inside     0
temp_outside    0
gas_type        0
AC              0
rain            0
sun             0
E10             0
SP98            0
dtype: int64

In [51]:
# Write the cleaned frame to a CSV file, leaving out the row index.
df.to_csv(path_or_buf="measurements_cleaned.csv", index=False)