## Creating Functions for the model analysis

In [2]:
import pandas as pd

In [3]:
# Defining a function to create a DataFrame out of model_and_fuel_type worksheet in excel files

def clean_model_fuel(a:str, b):
    
    import pandas as pd
    #read the sheet
    regis_model_fuel_df = pd.read_excel(a, sheet_name = 'model_and_fuel_type')
    regis_model_fuel_df['Hersteller'] = regis_model_fuel_df['Hersteller'].fillna(method = 'ffill')

    # drop the first column
    regis_model_fuel_df = regis_model_fuel_df.drop(['Unnamed: 0'], axis = 1)

    # drop the rows with ZUSAMMEN
    regis_model_fuel_df = regis_model_fuel_df.drop(regis_model_fuel_df[regis_model_fuel_df['Hersteller'].str.contains('ZUSAMMEN')].index)

    # concentrate on only electric cars
    regis_model_fuel_df = regis_model_fuel_df[regis_model_fuel_df['Kraftstoffart'] == 'E']

    regis_model_fuel_df_copy = regis_model_fuel_df.copy()
    regis_model_fuel_df_copy.rename(columns={'Hersteller': 'manufacturer',
                                        'Handelsname': 'model',
                                        'Typ-Schl.-Nr.': 'tsn',
                                        'kW': 'power_kw',
                                        'Kraftstoffart': 'fuel_type',
                                        'Allrad': 'drive_type',
                                        'Aufbauart': 'body_type',
                                        'Insgesamt': 'total',
                                        'Wohnmobile': 'motorhomes',
                                        'private\nHalter': 'private_owners',
                                        'Halter\nbis 29 Jahre': 'owners_under_29_years',
                                        'Halter\nab 60 Jahre': 'owners_over_60_years',
                                        'weibliche\nHalter': 'female_owners'}, inplace=True)

    regis_model_fuel_df = regis_model_fuel_df_copy
    
    # Variable with all columns to change
    cols_to_convert_fuel = ['total', 'motorhomes', 'private_owners',
        'owners_under_29_years', 'owners_over_60_years', 'female_owners']
    # Replace - with 0
    for col_fuel in cols_to_convert_fuel:
        regis_model_fuel_df[col_fuel] = regis_model_fuel_df[col_fuel].replace('-', '0')

    for col_fuel1 in cols_to_convert_fuel:
        regis_model_fuel_df[col_fuel1] = regis_model_fuel_df[col_fuel1].replace(',', '.').astype(int)

    regis_model_fuel_df["year"] = b
    return regis_model_fuel_df


In [4]:
clean_model_fuel('model_17.xlsx', 2017).shape

(27, 14)

In [5]:
clean_model_fuel('model_21.xlsx', 2017).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 14 to 2928
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   manufacturer           132 non-null    object 
 1   model                  132 non-null    object 
 2   tsn                    132 non-null    object 
 3   power_kw               132 non-null    float64
 4   fuel_type              132 non-null    object 
 5   drive_type             132 non-null    object 
 6   body_type              132 non-null    object 
 7   total                  132 non-null    int64  
 8   motorhomes             132 non-null    int64  
 9   private_owners         132 non-null    int64  
 10  owners_under_29_years  132 non-null    int64  
 11  owners_over_60_years   132 non-null    int64  
 12  female_owners          132 non-null    int64  
 13  year                   132 non-null    int64  
dtypes: float64(1), int64(7), object(6)
memory usage: 15.5+ K

In [6]:
clean_model_fuel('model_21.xlsx', 2017).unique_combo.nunique()

AttributeError: 'DataFrame' object has no attribute 'unique_combo'

In [None]:
# Define function for car models by state
def clean_model_state(a: str, b):
    
    import pandas as pd

    # read excel
    regis_model_state_df = pd.read_excel(a, sheet_name = 'model_by_state')
    regis_model_state_df['Hersteller'] = regis_model_state_df['Hersteller'].fillna(method = 'ffill')

    # drop the first column
    regis_model_state_df = regis_model_state_df.drop(['Unnamed: 0'], axis = 1)

    # drop the rows with ZUSAMMEN 
    regis_model_state_df = regis_model_state_df.drop(regis_model_state_df[regis_model_state_df['Hersteller'].str.contains('ZUSAMMEN')].index)


    # rename
    regis_model_state_df.rename(columns={'Hersteller': 'manufacturer',
                                        'Handelsname': 'model',
                                        'Typ-Schl.-Nr.': 'tsn',
                                        'Baden-\nWürttemberg' : 'Baden-Württemberg',
                                        'Bayern': 'Bayern',
                                        'Berlin': 'Berlin',
                                        'Branden-\nburg': 'Brandenburg',
                                        'Bremen': 'Bremen',
                                        'Hamburg': 'Hamburg',
                                        'Hessen': 'Hessen',
                                        'Mecklenburg-\nVorpommern': 'Mecklenburg-Vorpommern',
                                        'Nieder-\nsachsen': 'Niedersachsen',
                                        'Nordrhein-\nWestfalen': 'Nordrhein-Westfalen',
                                        'Rheinland-\nPfalz': 'Rheinland-Pfalz',
                                        'Saarland': 'Saarland',
                                        'Sachsen': 'Sachsen',
                                        'Sachsen-\nAnhalt': 'Sachsen-Anhalt',
                                        'Schleswig-\nHolstein': 'Schleswig-Holstein',
                                        'Thüringen': 'Thüringen',
                                        'Sonstige': 'special',
                                        'Deutschland': 'total_in_ger'}, inplace=True)

    regis_model_state_df.drop(['total_in_ger'], axis=1, inplace=True)
    # Variable with all columns to change
    cols_to_convert = ['Baden-Württemberg', 'Bayern',
        'Berlin', 'Brandenburg', 'Bremen', 'Hamburg', 'Hessen',
        'Mecklenburg-Vorpommern', 'Niedersachsen', 'Nordrhein-Westfalen',
        'Rheinland-Pfalz', 'Saarland', 'Sachsen', 'Sachsen-Anhalt',
        'Schleswig-Holstein', 'Thüringen', 'special']
    # Replace - with 0
    for col in cols_to_convert:
        regis_model_state_df[col] = regis_model_state_df[col].replace('-', '0')

    for col1 in cols_to_convert:
            regis_model_state_df[col1] = regis_model_state_df[col1].replace(',', '.').astype(float).fillna(0).astype(int)


    regis_model_state_df["year"] = b

    # put all states into one column
    id_cols = pd.concat([regis_model_state_df.iloc[:, :3], regis_model_state_df.iloc[:, 20:]], axis=1)
    regis_model_state_df = pd.melt(regis_model_state_df, id_vars=id_cols, value_vars=regis_model_state_df.columns[3:20], var_name='federal_state', value_name='new_registration')
    
    return regis_model_state_df

In [None]:
clean_model_state('model_19.xlsx', 2021)

Unnamed: 0,manufacturer,model,tsn,year,federal_state,new_registration
0,ALPINA,BMW ALPINA B3 S BITURBO,ACI,2021,Baden-Württemberg,7
1,ALPINA,BMW ALPINA B4 S BITURBO,ACE,2021,Baden-Württemberg,2
2,ALPINA,BMW ALPINA B4 S BITURBO,ACG,2021,Baden-Württemberg,2
3,ALPINA,BMW ALPINA B4 S BITURBO,ACS,2021,Baden-Württemberg,1
4,ALPINA,BMW ALPINA B4 S BITURBO,ACT,2021,Baden-Württemberg,2
...,...,...,...,...,...,...
54480,VOLVO (S),XC90,BPR,2021,special,0
54481,VOLVO (S),"XC90 T8 Twin Engine,XC90",BMH,2021,special,0
54482,VOLVO (S),"XC90 T8 Twin Engine,XC90",BMQ,2021,special,0
54483,VOLVO (S),SONSTIGE/NICHT GETYPT,,2021,special,0


In [None]:
def clean_join_model(a:str, b): 
    df_join_model = clean_model_fuel(a, b).merge(clean_model_state(a, b), on='model')
    return df_join_model

In [None]:
# Displaying up to 100 columns
pd.set_option('display.max_columns', 100)

In [7]:
from table_function import clean_join_model_default
df_model_analysis = clean_join_model_default()
df_model_analysis

Unnamed: 0,manufacturer_x,model,tsn,power_kw,fuel_type,drive_type,body_type,total,motorhomes,private_owners,owners_under_29_years,owners_over_60_years,female_owners,year_x,manufacturer_y,year_y,federal_state,new_registration
0,BMW,"i3s,i3",BSI,75.0,E,,,2788,0,1120,14,304,311,2017,BMW,2017,Baden-Württemberg,471
1,BMW,"i3s,i3",BSI,75.0,E,,,2788,0,1120,14,304,311,2017,BMW,2017,Bayern,1024
2,BMW,"i3s,i3",BSI,75.0,E,,,2788,0,1120,14,304,311,2017,BMW,2017,Berlin,101
3,BMW,"i3s,i3",BSI,75.0,E,,,2788,0,1120,14,304,311,2017,BMW,2017,Brandenburg,56
4,BMW,"i3s,i3",BSI,75.0,E,,,2788,0,1120,14,304,311,2017,BMW,2017,Bremen,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2222,VOLVO (S),"XC40,C40",BTE,160.0,E,A,,1217,0,262,4,137,67,2021,VOLVO (S),2021,Sachsen,12
2223,VOLVO (S),"XC40,C40",BTE,160.0,E,A,,1217,0,262,4,137,67,2021,VOLVO (S),2021,Sachsen-Anhalt,6
2224,VOLVO (S),"XC40,C40",BTE,160.0,E,A,,1217,0,262,4,137,67,2021,VOLVO (S),2021,Schleswig-Holstein,45
2225,VOLVO (S),"XC40,C40",BTE,160.0,E,A,,1217,0,262,4,137,67,2021,VOLVO (S),2021,Thüringen,7


In [8]:
df_model_analysis.columns

Index(['manufacturer_x', 'model', 'tsn', 'power_kw', 'fuel_type', 'drive_type',
       'body_type', 'total', 'motorhomes', 'private_owners',
       'owners_under_29_years', 'owners_over_60_years', 'female_owners',
       'year_x', 'manufacturer_y', 'year_y', 'federal_state',
       'new_registration'],
      dtype='object')

In [9]:
df_model_analysis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5406 entries, 0 to 2226
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   manufacturer_x         5406 non-null   object 
 1   model                  5406 non-null   object 
 2   tsn                    5406 non-null   object 
 3   power_kw               5406 non-null   float64
 4   fuel_type              5406 non-null   object 
 5   drive_type             4114 non-null   object 
 6   body_type              4165 non-null   object 
 7   total                  5406 non-null   int64  
 8   motorhomes             5406 non-null   int64  
 9   private_owners         5406 non-null   int64  
 10  owners_under_29_years  5406 non-null   int64  
 11  owners_over_60_years   5406 non-null   int64  
 12  female_owners          5406 non-null   int64  
 13  year_x                 5406 non-null   int64  
 14  manufacturer_y         5406 non-null   object 
 15  year

In [10]:
df_model_analysis.drop(['manufacturer_y', 'year_y', 'motorhomes', 'drive_type', 'body_type'], axis=1, inplace=True)

In [11]:
df_model_analysis.head(1)

Unnamed: 0,manufacturer_x,model,tsn,power_kw,fuel_type,total,private_owners,owners_under_29_years,owners_over_60_years,female_owners,year_x,federal_state,new_registration
0,BMW,"i3s,i3",BSI,75.0,E,2788,1120,14,304,311,2017,Baden-Württemberg,471


In [12]:
# Saving the df as a csv file in this Repo on GitHub 
df_model_analysis.to_csv('model_analysis_clean.csv', index=False)