In [235]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

Read the data

In [236]:
# Source CSV and the subset of its columns that this notebook uses.
dataset = "eu_emission_mother_load.csv"
col_list = ['Make', 'Fuel', 'Model', 'Cnt', 'Wltp', 'NEDC', 'year']
# Builtin types are used instead of np.str / np.float / np.int: those aliases
# were deprecated in NumPy 1.20 and removed in 1.24, where they raise
# AttributeError. They were plain aliases of the builtins, so the parsed
# dtypes are unchanged.
type_dict = {
    "Make": str,
    "Model": str,
    "NEDC": float,
    "Wltp": float,
    "Fuel": str,
    "Cnt": int,
    "year": int
}
df = pd.read_csv(dataset, dtype=type_dict, usecols=col_list)

# Rows in the file vs. the sum of the per-row 'Cnt' column.
print(f"number of records included: {len(df)}")
print(f"number of records considered: {df['Cnt'].sum()}")

number of records included: 60000
number of records considered: 87531401


Simple clean

In [237]:
def _normalise_text(value):
    """Cast to str, upper-case, and strip preceding and following spaces."""
    return str(value).upper().strip()


# Text columns that are normalised so later comparisons are case/space insensitive.
columns_to_upper_case = ["Make", "Model", "Fuel"]
for text_col in columns_to_upper_case:
    df.loc[:, text_col] = df.loc[:, text_col].map(_normalise_text)

In [238]:
# Placeholder values that mark rows without a usable Make / Model.
filter_away_Makes = ['OUT OF SCOPE', 'UNKNOWN', 'DUPLICATE']
filter_away_Models = ['OUT OF SCOPE', 'UNKNOWN', 'DUPLICATE']

# Each column is checked against its own list. The Model filter used to reuse
# filter_away_Makes, which only worked because both lists happen to be equal.
keep_rows = ~df['Make'].isin(filter_away_Makes) & ~df['Model'].isin(filter_away_Models)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the unfiltered data (SettingWithCopyWarning).
df = df.loc[keep_rows].copy()

# removing DIESEL/ELECTRIC and PETROL/ELECTRIC -> "-" instead of "/"
df["Fuel"] = df["Fuel"].str.replace("/", "-", regex=False)
print(df)

             Make    Fuel                      Model     Cnt   Wltp   NEDC  \
0         RENAULT  PETROL                       CLIO  401080  134.0  114.0   
1         CITROEN  PETROL                         C3  324540  135.0  106.0   
2           DACIA  PETROL                    SANDERO  321984  146.0  124.0   
3         PEUGEOT  PETROL                        208  315989  133.0  106.0   
4           DACIA  PETROL                    SANDERO  308575  147.0  118.0   
...           ...     ...                        ...     ...    ...    ...   
59995        FORD  DIESEL  MONDEO TI-NIUM X TDCI163A      22    NaN    NaN   
59996     RENAULT  DIESEL                    RENAULT      22    NaN    NaN   
59997  VOLKSWAGEN  DIESEL         SHARAN / 2.0 / TDI      22    NaN    NaN   
59998        AUDI  PETROL                 TT RS COUP      22    NaN  190.0   
59999     RENAULT  DIESEL         KANGOO / 1.5 / DCI      22    NaN    NaN   

       year  
0      2018  
1      2019  
2      2019  
3      

Merging Wltp and NEDC into a single co2e column

In [239]:
# Factor by which an NEDC value is multiplied when a row has no Wltp value.
Wltp_per_NEDC = 1.2 #average ratio
# TODO: 26931 rows have nans for both nedc and wltp
# The triple-quoted string below is disabled code: a row-by-row count of the
# rows where both 'Wltp' and 'NEDC' are NaN. As a bare string expression it is
# never executed; the notebook merely echoes it as this cell's output.
'''
count = 0
for index, row in df.iterrows():
    if np.isnan(row['Wltp']) and np.isnan(row['NEDC']):
        count += 1
print(count)
'''

"\ncount = 0\nfor index, row in df.iterrows():\n    if np.isnan(row['Wltp']) and np.isnan(row['NEDC']):\n        count += 1\nprint(count)\n"

In [240]:
# Build 'co2e' from 'Wltp', falling back to NEDC * Wltp_per_NEDC where Wltp is
# missing. The result is assigned explicitly: the previous
# `df.loc[:, "Wltp"].where(..., inplace=True)` mutated a temporary selection,
# which does not propagate to df when pandas Copy-on-Write is active.
df = (
    df.assign(co2e=df["Wltp"].fillna(df["NEDC"] * Wltp_per_NEDC))
      .drop(columns=["NEDC", "Wltp"])
      # Rows with a NaN in any remaining column are removed; this includes rows
      # that had neither a Wltp nor an NEDC value.
      .dropna(axis=0)
)

Renaming Makes

In [241]:
# F.e. when Make is AUDI AG it will be renamed to AUDI.
# Make_dict maps a raw Make value to the name it should be replaced with.
with open('Make_dict.json', encoding='utf-8') as Make_dict_file:
    Make_dict = json.load(Make_dict_file)

# The replaced column is assigned back explicitly: calling
# `.replace(..., inplace=True)` on `df.loc[:, 'Make']` only mutates a temporary
# selection and leaves df untouched when pandas Copy-on-Write is active.
df = df.assign(Make=df['Make'].replace(Make_dict))

Cleaning Model

In [242]:
def _strip_make_prefix(make, model):
    """Return `model` without a leading "<make> " prefix.

    Models that do not start with the make followed by a space are returned
    unchanged.
    """
    prefix = make + " "
    if model.startswith(prefix):
        return model[len(prefix):].strip()
    return model


# When Make is AUDI and Model is AUDI A4 this clean-up changes the Model to A4.
# The previous condition required len(Make) > len(Model), which can never hold
# for a Model that starts with its Make, so no Model was ever cleaned.
df['Model'] = [
    _strip_make_prefix(make, model)
    for make, model in zip(df['Make'], df['Model'])
]
print(df)

             Make    Fuel       Model     Cnt  year   co2e
0         RENAULT  PETROL        CLIO  401080  2018  134.0
1         CITROEN  PETROL          C3  324540  2019  135.0
2           DACIA  PETROL     SANDERO  321984  2019  146.0
3         PEUGEOT  PETROL         208  315989  2018  133.0
4           DACIA  PETROL     SANDERO  308575  2018  147.0
...           ...     ...         ...     ...   ...    ...
59965        AUDI      NG          A5      22  2019  130.0
59973  VOLKSWAGEN      NG       CADDY      22  2020  128.0
59990        OPEL     LPG   KARLROCKS      22  2018  129.6
59993     FERRARI  PETROL  CALIFORNIA      22  2018  301.2
59998        AUDI  PETROL  TT RS COUP      22  2018  228.0

[32944 rows x 6 columns]


Grouping by Make, Model, Fuel and year

In [243]:
# The data may contain duplicate (Make, Model, Fuel, year) rows. They are
# collapsed into one row each, summing 'Cnt' and averaging 'co2e' weighted by 'Cnt'.
group_keys = ['Make', 'Model', 'Fuel', 'year']
grouped = (
    df.assign(Cnt_co2e=df['Cnt'] * df['co2e'])
      .groupby(group_keys)[['Cnt', 'Cnt_co2e']]
      .sum()
)
df = (
    grouped.reset_index()
           # weighted average = sum(Cnt * co2e) / sum(Cnt)
           .assign(co2e=lambda totals: totals['Cnt_co2e'] / totals['Cnt'])
           .drop(columns='Cnt_co2e')
)
print(df)

         Make                     Model             Fuel  year   Cnt   co2e
0      ABARTH      124 GT MULTIAIR AUTO           PETROL  2018    26  183.6
1      ABARTH      124 GT MULTIAIR AUTO           PETROL  2019    24  183.6
2      ABARTH       124 SPIDER MULTIAIR           PETROL  2018  1078  177.6
3      ABARTH       124 SPIDER MULTIAIR           PETROL  2019    68  177.6
4      ABARTH  124 SPIDER MULTIAIR AUTO           PETROL  2018   804  183.6
...       ...                       ...              ...   ...   ...    ...
27684   VOLVO       XC90 T8 TWIN ENGINE           PETROL  2020    73   66.0
27685   VOLVO       XC90 T8 TWIN ENGINE  PETROL-ELECTRIC  2017  2424   58.8
27686   VOLVO       XC90 T8 TWIN ENGINE  PETROL-ELECTRIC  2018  5072  129.0
27687   VOLVO       XC90 T8 TWIN ENGINE  PETROL-ELECTRIC  2019  8310   82.0
27688   VOLVO       XC90 T8 TWIN ENGINE  PETROL-ELECTRIC  2020  3699   73.0

[27689 rows x 6 columns]


Generating Make dictionary

In [244]:
# Every distinct Make left in df is mapped to its Make_dict entry when it has
# one, and to itself otherwise.
Makes = np.unique(df['Make'])
new_Make_dict = {make: Make_dict.get(make, make) for make in Makes}

# Makes_found.json should only contain reasonable brand names.
Path("Makes_found.json").write_text(json.dumps(new_Make_dict, indent=4))



Generating Model dictionary

In [245]:
# Every distinct Model left in df is mapped to itself; no Model renaming
# dictionary is applied here.
Models = np.unique(df['Model'])
new_Model_dict = {model: model for model in Models}

# Models_found.json lists every Model name that was found.
Path("Models_found.json").write_text(json.dumps(new_Model_dict, indent=4))


Making sure no NaNs are included

In [246]:
# Remove every row that still has a NaN in any column.
df = df.dropna(axis='index')

Filtering based on count

In [249]:
# Keep only rows whose 'Cnt' reaches the threshold, then renumber the index from 0.
min_count = 100
df = df.loc[df['Cnt'].ge(min_count)].reset_index(drop=True)
#print(df.groupby('Make').Model.count()) #number of Models per Make
print(df)

         Make                     Model             Fuel  year   Cnt   co2e
0      ABARTH       124 SPIDER MULTIAIR           PETROL  2018  1078  177.6
1      ABARTH  124 SPIDER MULTIAIR AUTO           PETROL  2018   804  183.6
2      ABARTH  124 SPIDER MULTIAIR AUTO           PETROL  2019   412  183.6
3      ABARTH                       595           PETROL  2018  3930  162.0
4      ABARTH                       595           PETROL  2019  2026  162.0
...       ...                       ...              ...   ...   ...    ...
17926   VOLVO                   XC90 T8  PETROL-ELECTRIC  2019   154   75.0
17927   VOLVO       XC90 T8 TWIN ENGINE  PETROL-ELECTRIC  2017  2424   58.8
17928   VOLVO       XC90 T8 TWIN ENGINE  PETROL-ELECTRIC  2018  5072  129.0
17929   VOLVO       XC90 T8 TWIN ENGINE  PETROL-ELECTRIC  2019  8310   82.0
17930   VOLVO       XC90 T8 TWIN ENGINE  PETROL-ELECTRIC  2020  3699   73.0

[17931 rows x 6 columns]
