In [800]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

Read the data

In [801]:
dataset = "eu_emission_mother_load.csv"
col_list = ['Make', 'Fuel', 'Model', 'Cnt', 'Wltp', 'NEDC', 'year']
# Builtin types are used instead of np.str / np.float / np.int: those names were
# plain aliases of the builtins, were deprecated in NumPy 1.20 and removed in 1.24,
# so the parsed dtypes are the same but the cell keeps working on current NumPy.
type_dict = {
    "Make": str,
    "Model": str,
    "NEDC": float,
    "Wltp": float,
    "Fuel": str,
    "Cnt": int,
    "year": int,
}
# Read only the columns listed in col_list and harmonise two column names.
df = pd.read_csv(dataset, dtype=type_dict, usecols=col_list).rename(columns={
    "Make": "Brand",
    "NEDC": "Nedc",
})

# Quick sanity report: number of rows, and the sum of the Cnt column over those rows.
print(f"number of records included: {len(df)}")
print(f"number of records considered: {df['Cnt'].sum()}")
print(df)

number of records included: 60000
number of records considered: 87531401
            Brand    Fuel                      Model     Cnt   Wltp   Nedc  \
0         RENAULT  Petrol                       CLIO  401080  134.0  114.0   
1         CITROEN  petrol                         C3  324540  135.0  106.0   
2           DACIA  petrol                    SANDERO  321984  146.0  124.0   
3         PEUGEOT  Petrol                        208  315989  133.0  106.0   
4           DACIA  Petrol                    SANDERO  308575  147.0  118.0   
...           ...     ...                        ...     ...    ...    ...   
59995        FORD  diesel  MONDEO TI-NIUM X TDCI163A      22    NaN    NaN   
59996     RENAULT  Diesel                    RENAULT      22    NaN    NaN   
59997  VOLKSWAGEN  Diesel         SHARAN / 2.0 / TDI      22    NaN    NaN   
59998        AUDI  Petrol                 TT RS COUP      22    NaN  190.0   
59999     RENAULT  Diesel         KANGOO / 1.5 / DCI      22    NaN  

Simple clean

In [802]:
def _normalise_text(value):
    """Return ``value`` as a string, upper-cased and without surrounding whitespace."""
    return str(value).upper().strip()


# Every value goes through str() first, so a missing value (float NaN) ends up
# as the literal text "NAN".
columns_to_upper_case = ["Brand", "Model", "Fuel"]
for col_name in columns_to_upper_case:
    df.loc[:, col_name] = df.loc[:, col_name].apply(_normalise_text)

In [803]:
filter_away = ['OUT OF SCOPE', 'UNKNOWN', 'DUPLICATE', 'NAN']
# A row is kept only when none of its Brand / Model / Fuel values is one of the
# placeholder texts above.
is_placeholder = df[['Brand', 'Model', 'Fuel']].isin(filter_away).any(axis=1)
df = df.loc[~is_placeholder]

# Combined fuels use "-" instead of "/", e.g. DIESEL/ELECTRIC -> DIESEL-ELECTRIC.
df.loc[:, "Fuel"] = df["Fuel"].apply(lambda fuel: fuel.replace("/", "-"))

Merging Wltp and Nedc

In [804]:
Wltp_per_Nedc = 1.2  # average ratio between a Wltp value and the corresponding Nedc value
# TODO: 26931 rows have NaN for both Nedc and Wltp
# The string literal below is a disabled snippet that counts those rows. Being the
# last expression of the cell, the string itself is echoed as the cell output.
'''
count = 0
for index, row in df.iterrows():
    if np.isnan(row['Wltp']) and np.isnan(row['Nedc']):
        count += 1
print(count)
'''

"\ncount = 0\nfor index, row in df.iterrows():\n    if np.isnan(row['Wltp']) and np.isnan(row['Nedc']):\n        count += 1\nprint(count)\n"

In [805]:
# co2e is the Wltp value where one exists; otherwise it is estimated as
# Nedc * Wltp_per_Nedc. Rows that have neither value stay NaN here.
#
# The result is assigned explicitly. The previous form called
# where(..., inplace=True) on the Series returned by df.loc[:, "Wltp"]; with
# pandas Copy-on-Write that Series is detached from df, so the fill was lost and
# every Nedc-only row was dropped by the dropna below.
df['co2e'] = df['Wltp'].fillna(df['Nedc'] * Wltp_per_Nedc)

# The source columns are no longer needed; afterwards drop every row that still
# has a missing value in any remaining column.
df = df.drop(columns=["Nedc", "Wltp"]).dropna(axis=0)

Renaming Brands

In [806]:
# Brand_dict maps brand spellings found in the data to the name they should be
# replaced with, e.g. a Brand of AUDI AG is renamed to AUDI.
# JSON text is UTF-8, so the encoding is stated instead of relying on the
# platform default.
with open('Brand_dict.json', encoding='utf-8') as Brand_dict_file:
    Brand_dict = json.load(Brand_dict_file)

# Assign the result back: replace(..., inplace=True) on the Series returned by
# df.loc[:, 'Brand'] acts on an intermediate object and is not guaranteed to
# reach df (it does not under pandas Copy-on-Write).
df['Brand'] = df['Brand'].replace(Brand_dict)

Cleaning Model

In [807]:
def _strip_brand_prefix(brand, model):
    """Return ``model`` without a leading ``brand`` followed by a space, if present."""
    prefix = brand + " "
    return model[len(prefix):] if model.startswith(prefix) else model


def clean_make_from_model(frame=None):
    """Remove the brand name from the start of each Model value.

    When Brand is AUDI and Model is "AUDI A4", Model becomes "A4". A Model is
    only changed when it starts with the Brand followed by a space, so values
    such as "AUDIA4", or a Model equal to the Brand, are left untouched.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with string columns 'Brand' and 'Model'; its 'Model' column is
        overwritten. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
    """
    if frame is None:
        frame = df  # backward compatible: operate on the notebook's working frame
    # Building the column from a zip over the two columns avoids row-wise
    # DataFrame.apply(axis=1), which is slow and, for a frame without rows, can
    # hand back a DataFrame instead of a Series.
    frame['Model'] = [
        _strip_brand_prefix(brand, model)
        for brand, model in zip(frame['Brand'], frame['Model'])
    ]
# Apply the brand-prefix clean-up to the Model column of the working frame df.
clean_make_from_model()

In [808]:
exclude = ["/", " "]
max_number_model_words = 3
def clean_model(s):
    """Shorten a model name to its first ``max_number_model_words`` words.

    The name is split on whitespace, words listed in ``exclude`` (such as a
    stand-alone "/") are discarded, and the leading words that remain are joined
    with single spaces.
    """
    kept_words = [word for word in s.split() if word not in exclude]
    return " ".join(kept_words[:max_number_model_words])
# Shorten every model name with clean_model, one value at a time.
df['Model'] = df['Model'].map(clean_model)

In [809]:
# Based on Linus's work: select the most common Brand/Model pairs.
minimum_count = 5000
brand_model_count = (
    df.groupby(['Brand', 'Model'])['Cnt']
    .sum()
    .sort_values(ascending=False)
)
# Only pairs whose summed Cnt exceeds minimum_count are kept.
brand_model_count = brand_model_count.loc[brand_model_count > minimum_count].reset_index()
brand_model_count.sort_values(by=["Brand", "Model"]).to_csv("make_model.csv", index=False)

# Drop all rows whose Brand, or whose Model, does not occur in the table above.
# NOTE(review): the two columns are checked independently, not as a
# (Brand, Model) pair, so a rare pair built from a frequent brand and a model
# name that is frequent for another brand survives - confirm this is intended.
for column in ['Brand', 'Model']:
    allowed_values = np.unique(brand_model_count[column])
    unwanted = ~df[column].isin(allowed_values)
    df = df.drop(df.loc[unwanted].index)

Grouping by - removing duplicate rows

In [810]:
# The same Brand/Model/Fuel/year combination may occur on several rows. They are
# merged into a single row whose Cnt is the total and whose co2e is the
# Cnt-weighted average of the merged rows.
df['Cnt_co2e'] = df['co2e'] * df['Cnt']
grouped = df.groupby(['Brand', 'Model', 'Fuel', 'year'])[['Cnt', 'Cnt_co2e']].sum()
df = grouped.reset_index()
df['co2e'] = df['Cnt_co2e'].div(df['Cnt'])  # weighted average
df = df.drop(columns='Cnt_co2e')

Generating Brand dictionary

In [811]:
# Disabled snippet, kept as a string literal so that it does not run. It builds a
# dictionary holding every brand currently present in df - mapped through
# Brand_dict when an entry exists, otherwise mapped to itself - and writes it to
# Brands_found.json.
'''
#One-time thing, please don't delete

Brands = np.unique(df['Brand'])
new_Brand_dict = {}
for b in Brands:
    if b in Brand_dict:
        new_Brand_dict[b] = Brand_dict[b]
    else:
        new_Brand_dict[b] = b

#in Brands_found.json all brands included should be reasonable
with open("Brands_found.json", "w") as outfile:
    json.dump(new_Brand_dict, outfile, indent=4)

'''



'\n#One-time thing, please don\'t delete\n\nBrands = np.unique(df[\'Brand\'])\nnew_Brand_dict = {}\nfor b in Brands:\n    if b in Brand_dict:\n        new_Brand_dict[b] = Brand_dict[b]\n    else:\n        new_Brand_dict[b] = b\n\n#in Brands_found.json all brands included should be reasonable\nwith open("Brands_found.json", "w") as outfile:\n    json.dump(new_Brand_dict, outfile, indent=4)\n\n'

JSON for frontend

In [812]:
# Build {Brand: {Model: [Fuel, ...]}} and write it to frontend_data.json.

# Step 1: one row per (Brand, Model), holding a np.array of its distinct fuels.
df_frontend = df.drop(columns=["Cnt", "co2e", "year"])
df_frontend = (
    df_frontend.groupby(['Brand', 'Model'])['Fuel']
    .apply(np.unique)
    .reset_index()
)

# Step 2: one row per Brand, holding two parallel lists: its models and the
# fuel arrays that belong to them.
df_frontend = (
    df_frontend.groupby('Brand')
    .agg(Model=('Model', list), Fuel=('Fuel', list))
    .reset_index()
)

# Step 3: pair each model with its fuel array and turn the arrays into lists.
res_dict = {
    brand: {
        model: fuel_array.tolist()
        for model, fuel_array in zip(models, fuel_arrays)
    }
    for brand, models, fuel_arrays in zip(
        df_frontend['Brand'], df_frontend['Model'], df_frontend['Fuel']
    )
}

with open("frontend_data.json", "w") as outfile:
    json.dump(res_dict, outfile, indent=4, sort_keys=True)

JSON for Mr. Carbon

In [819]:
# Write one JSON file per brand into folder_path, each shaped as
# {Model: {Fuel: {year: co2e}}}.
folder_path = Path("carbon_db")  # Path keeps the location portable across operating systems
# open(path, "w") does not create missing directories, so make sure the output
# folder exists before the first file is written.
folder_path.mkdir(parents=True, exist_ok=True)

df['co2e'] = df['co2e'].round()
df_carbon = df.drop(columns='Cnt').groupby(['Brand', 'Model', 'Fuel', 'year']).agg('mean')
# df_carbon is indexed by (Brand, Model, Fuel, year). The groupby yields one row
# per key, so a lookup with a full key returns a single value, and a
# Model/Fuel/year combination cannot occur twice within one brand.
assert df_carbon.index.is_unique

# One row per brand with three parallel lists (Model, Fuel, year) that together
# enumerate the index entries of that brand.
df_indeces = df_carbon.index.to_frame(index=False)
df_indeces = df_indeces.groupby('Brand').agg(
    Model=pd.NamedAgg(column='Model', aggfunc=list),
    Fuel=pd.NamedAgg(column='Fuel', aggfunc=list),
    year=pd.NamedAgg(column='year', aggfunc=list)
).reset_index()

for _, row in df_indeces.iterrows():
    Brand = row['Brand']
    path = folder_path / (Brand + ".json")
    res_dict = {}
    assert len(row['Model']) == len(row['Fuel']) == len(row['year'])
    for Model, Fuel, year in zip(row['Model'], row['Fuel'], row['year']):
        # Builtin int/float are used for the JSON key and value so that
        # serialisation does not depend on how pandas boxes its scalars.
        co2e = float(df_carbon.loc[(Brand, Model, Fuel, year), 'co2e'])
        res_dict.setdefault(Model, {}).setdefault(Fuel, {})[int(year)] = co2e

    with open(path, "w") as outfile:
        json.dump(res_dict, outfile, indent=4, sort_keys=True)
