## Data Cleaning

### Approach

- Import and inspect the data sets.
- Clean the data according to the problems found during inspection.
- Create metadata for all the data sets.

#### Importing Libraries

In [24]:
# Importing python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import seaborn as sns
import requests


In [25]:
# Read the 'model_by_state' sheet of the 2021 workbook into a DataFrame.
regis_model_df = pd.read_excel('model_21.xlsx', sheet_name='model_by_state')
# Forward-fill 'Hersteller' so that every row carries the value of the closest
# filled cell above it. Series.ffill() is the supported spelling of the
# deprecated fillna(method='ffill') and gives the same result.
regis_model_df['Hersteller'] = regis_model_df['Hersteller'].ffill()

### Simple EDA "model_21"

In [26]:
# Look at the data: a bare frame as the last expression of a cell is rendered
# by Jupyter as a truncated preview.
regis_model_df

Unnamed: 0.1,Unnamed: 0,Hersteller,Handelsname,Typ-Schl.-Nr.,Baden-\nWürttemberg,Bayern,Berlin,Branden-\nburg,Bremen,Hamburg,...,Nieder-\nsachsen,Nordrhein-\nWestfalen,Rheinland-\nPfalz,Saarland,Sachsen,Sachsen-\nAnhalt,Schleswig-\nHolstein,Thüringen,Sonstige,Deutschland
0,,ALPINA,BMW ALPINA B3 Limousine,ACU,7,8,4,1,1,-,...,3,6,4,3,-,-,2,-,-,42.0
1,,ALPINA,BMW ALPINA B3 Touring,ACV,26,40,3,1,2,4,...,19,32,17,1,2,1,13,1,-,178.0
2,,ALPINA,BMW ALPINA B5,ACW,3,9,1,-,-,-,...,-,7,2,-,-,-,2,-,-,29.0
3,,ALPINA,BMW ALPINA B5,ACX,25,33,2,2,2,4,...,9,26,5,4,3,1,5,1,-,135.0
4,,ALPINA,BMW ALPINA B8,ADG,-,4,-,1,-,-,...,1,3,3,-,-,-,1,-,-,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2961,,ZHEJIANG GEELY (RC) ZUSAMMEN,Lynk & Co 01,AAB,-,3,-,3,-,1,...,3,2,-,-,2,-,-,-,-,1416.0
2962,,SONSTIGE HERSTELLER,Lynk & Co 01,AAB,218,396,19,61,13,4,...,887,320,56,22,69,41,96,31,8,2436.0
2963,,INSGESAMT,Lynk & Co 01,AAB,367282,531550,60376,50986,15561,92464,...,269551,541469,104156,31205,86713,42680,72040,47804,1511,2622132.0
2964,,INSGESAMT,Lynk & Co 01,AAB,367282,531550,60376,50986,15561,92464,...,269551,541469,104156,31205,86713,42680,72040,47804,1511,2622132.0


In [27]:
# Summarise the columns: names, non-null counts and dtypes.
regis_model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2966 entries, 0 to 2965
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               0 non-null      float64
 1   Hersteller               2966 non-null   object 
 2   Handelsname              2966 non-null   object 
 3   Typ-Schl.-Nr.            2966 non-null   object 
 4   Baden-
Württemberg       2966 non-null   object 
 5   Bayern                   2966 non-null   object 
 6   Berlin                   2966 non-null   object 
 7   Branden-
burg            2966 non-null   object 
 8   Bremen                   2966 non-null   object 
 9   Hamburg                  2966 non-null   object 
 10  Hessen                   2966 non-null   object 
 11  Mecklenburg-
Vorpommern  2966 non-null   object 
 12  Nieder-
sachsen          2966 non-null   object 
 13  Nordrhein-
Westfalen     2966 non-null   object 
 14  Rheinland-
Pfalz        

## Problems Found
1. Drop the first column (`Unnamed: 0`), which has no non-null values.
2. Drop the rows where `Hersteller` contains `ZUSAMMEN`.
3. Decide what to do with the last rows shown by `tail(5)`: delete or keep?
4. Rename the columns.

In [28]:
# 1. Remove the 'Unnamed: 0' column
regis_model_df = regis_model_df.drop(columns=['Unnamed: 0'])

In [29]:
# 2. Drop the rows whose 'Hersteller' value contains ZUSAMMEN
# 2.1 First list the rows that match
regis_model_df.loc[regis_model_df['Hersteller'].str.contains('ZUSAMMEN')]

Unnamed: 0,Hersteller,Handelsname,Typ-Schl.-Nr.,Baden-\nWürttemberg,Bayern,Berlin,Branden-\nburg,Bremen,Hamburg,Hessen,...,Nieder-\nsachsen,Nordrhein-\nWestfalen,Rheinland-\nPfalz,Saarland,Sachsen,Sachsen-\nAnhalt,Schleswig-\nHolstein,Thüringen,Sonstige,Deutschland
13,ALPINA ZUSAMMEN,SONSTIGE/NICHT GETYPT,ACQ,112,206,19,11,10,19,71,...,69,162,70,10,13,3,45,5,-,829.0
15,ASTON MARTIN (UK) ZUSAMMEN,SONSTIGE/NICHT GETYPT,ACQ,47,85,16,4,-,26,105,...,19,100,6,1,17,3,11,-,-,441.0
313,AUDI (D) ZUSAMMEN,SONSTIGE/NICHT GETYPT,AAE,27045,64576,4265,1809,791,6193,13628,...,10709,30450,6225,1235,4078,1902,3081,2598,181,179991.0
324,AUDI (H) ZUSAMMEN,SONSTIGE/NICHT GETYPT,ACQ,342,745,30,8,1,16,114,...,112,354,89,10,21,11,22,8,-,1886.0
326,BEIJING BORGWARD (RC) ZUSAMMEN,BX7; BX7 TS,AAB,104,-,-,3,-,-,-,...,1,-,-,-,-,-,-,-,-,108.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649,VAZ-LADA (RUS) ZUSAMMEN,SONSTIGE/NICHT GETYPT,AAY,96,111,5,273,-,1,8,...,601,136,35,9,88,141,4,101,-,1621.0
2888,VOLKSWAGEN (D) ZUSAMMEN,SONSTIGE/NICHT GETYPT,COY,53330,76710,9158,7084,2259,29649,38955,...,120922,87506,14568,2729,13984,7203,12422,8117,375,489858.0
2890,VOLKSWAGEN-VWOA (USA) ZUSAMMEN,SONSTIGE/NICHT GETYPT,COY,-,-,-,-,-,-,-,...,79,-,-,-,-,2,-,-,-,81.0
2958,VOLVO (S) ZUSAMMEN,SONSTIGE/NICHT GETYPT,BMQ,4726,6512,1050,751,292,2357,4882,...,3369,16132,1240,176,864,308,1655,482,9,45171.0


In [30]:
# 2.2 Remove every row whose 'Hersteller' value contains ZUSAMMEN
regis_model_df = regis_model_df.drop(
    index=regis_model_df.index[regis_model_df['Hersteller'].str.contains('ZUSAMMEN')]
)

In [31]:
# Preview the first five rows after the ZUSAMMEN rows were removed
regis_model_df.head()

Unnamed: 0,Hersteller,Handelsname,Typ-Schl.-Nr.,Baden-\nWürttemberg,Bayern,Berlin,Branden-\nburg,Bremen,Hamburg,Hessen,...,Nieder-\nsachsen,Nordrhein-\nWestfalen,Rheinland-\nPfalz,Saarland,Sachsen,Sachsen-\nAnhalt,Schleswig-\nHolstein,Thüringen,Sonstige,Deutschland
0,ALPINA,BMW ALPINA B3 Limousine,ACU,7,8,4,1,1,-,3,...,3,6,4,3,-,-,2,-,-,42.0
1,ALPINA,BMW ALPINA B3 Touring,ACV,26,40,3,1,2,4,16,...,19,32,17,1,2,1,13,1,-,178.0
2,ALPINA,BMW ALPINA B5,ACW,3,9,1,-,-,-,5,...,-,7,2,-,-,-,2,-,-,29.0
3,ALPINA,BMW ALPINA B5,ACX,25,33,2,2,2,4,12,...,9,26,5,4,3,1,5,1,-,135.0
4,ALPINA,BMW ALPINA B8,ADG,-,4,-,1,-,-,-,...,1,3,3,-,-,-,1,-,-,13.0


In [32]:
# 3. Inspect the last five rows to decide whether to delete or keep them
regis_model_df.tail()

Unnamed: 0,Hersteller,Handelsname,Typ-Schl.-Nr.,Baden-\nWürttemberg,Bayern,Berlin,Branden-\nburg,Bremen,Hamburg,Hessen,...,Nieder-\nsachsen,Nordrhein-\nWestfalen,Rheinland-\nPfalz,Saarland,Sachsen,Sachsen-\nAnhalt,Schleswig-\nHolstein,Thüringen,Sonstige,Deutschland
2960,ZHEJIANG GEELY (RC),Lynk & Co 01,AAB,-,3,-,3,-,1,1073,...,3,2,-,-,2,-,-,-,-,1087.0
2962,SONSTIGE HERSTELLER,Lynk & Co 01,AAB,218,396,19,61,13,4,171,...,887,320,56,22,69,41,96,31,8,2436.0
2963,INSGESAMT,Lynk & Co 01,AAB,367282,531550,60376,50986,15561,92464,276989,...,269551,541469,104156,31205,86713,42680,72040,47804,1511,2622132.0
2964,INSGESAMT,Lynk & Co 01,AAB,367282,531550,60376,50986,15561,92464,276989,...,269551,541469,104156,31205,86713,42680,72040,47804,1511,2622132.0
2965,"© Kraftfahrt-Bundesamt, Flensburg",Lynk & Co 01,AAB,367282,531550,60376,50986,15561,92464,276989,...,269551,541469,104156,31205,86713,42680,72040,47804,1511,2622132.0


In [33]:
# Keep the first 2898 rows by position. Judging by the tail(5) preview and the
# frame displayed after this cell, this leaves out the two 'INSGESAMT' rows and
# the '© Kraftfahrt-Bundesamt, Flensburg' row.
# NOTE(review): 2898 is hard-coded for this workbook; re-check it whenever the
# input file changes.
# .iloc makes the positional slice explicit, and .copy() gives the following
# cells an independent frame to assign into rather than a slice of the previous
# one (assigning into a slice is what triggers pandas' SettingWithCopyWarning).
regis_model_df = regis_model_df.iloc[:2898].copy()

In [34]:
# Check the frame after the trailing rows were cut off.
regis_model_df

Unnamed: 0,Hersteller,Handelsname,Typ-Schl.-Nr.,Baden-\nWürttemberg,Bayern,Berlin,Branden-\nburg,Bremen,Hamburg,Hessen,...,Nieder-\nsachsen,Nordrhein-\nWestfalen,Rheinland-\nPfalz,Saarland,Sachsen,Sachsen-\nAnhalt,Schleswig-\nHolstein,Thüringen,Sonstige,Deutschland
0,ALPINA,BMW ALPINA B3 Limousine,ACU,7,8,4,1,1,-,3,...,3,6,4,3,-,-,2,-,-,42.0
1,ALPINA,BMW ALPINA B3 Touring,ACV,26,40,3,1,2,4,16,...,19,32,17,1,2,1,13,1,-,178.0
2,ALPINA,BMW ALPINA B5,ACW,3,9,1,-,-,-,5,...,-,7,2,-,-,-,2,-,-,29.0
3,ALPINA,BMW ALPINA B5,ACX,25,33,2,2,2,4,12,...,9,26,5,4,3,1,5,1,-,135.0
4,ALPINA,BMW ALPINA B8,ADG,-,4,-,1,-,-,-,...,1,3,3,-,-,-,1,-,-,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2957,VOLVO (S),SONSTIGE/NICHT GETYPT,BMQ,10,20,3,2,1,8,13,...,7,33,6,1,5,-,3,-,2,118.0
2959,ZHEJIANG GEELY (RC),Lynk & Co 01,AAA,-,-,-,-,-,-,329,...,-,-,-,-,-,-,-,-,-,329.0
2960,ZHEJIANG GEELY (RC),Lynk & Co 01,AAB,-,3,-,3,-,1,1073,...,3,2,-,-,2,-,-,-,-,1087.0
2962,SONSTIGE HERSTELLER,Lynk & Co 01,AAB,218,396,19,61,13,4,171,...,887,320,56,22,69,41,96,31,8,2436.0


In [35]:
# 4. Rename: list the current column names first
regis_model_df.columns

Index(['Hersteller', 'Handelsname', 'Typ-Schl.-Nr.', 'Baden-\nWürttemberg',
       'Bayern', 'Berlin', 'Branden-\nburg', 'Bremen', 'Hamburg', 'Hessen',
       'Mecklenburg-\nVorpommern', 'Nieder-\nsachsen', 'Nordrhein-\nWestfalen',
       'Rheinland-\nPfalz', 'Saarland', 'Sachsen', 'Sachsen-\nAnhalt',
       'Schleswig-\nHolstein', 'Thüringen', 'Sonstige', 'Deutschland'],
      dtype='object')

In [36]:
# Rename the columns that need a new name: English names for the descriptive
# columns, and names without the hyphen/line-break for the federal states.
# Columns that keep their name (e.g. 'Bayern', 'Berlin') are not listed,
# because rename() leaves unlisted columns unchanged.
# The renamed copy is bound back to the name instead of using inplace=True, so
# the cell does not mutate a frame that was obtained by slicing.
regis_model_df = regis_model_df.rename(columns={
    'Hersteller': 'car_company',
    'Handelsname': 'model',
    'Typ-Schl.-Nr.': 'Typ_Schl._Nr.',
    'Baden-\nWürttemberg': 'Baden_Württemberg',
    'Branden-\nburg': 'Brandenburg',
    'Mecklenburg-\nVorpommern': 'Mecklenburg_Vorpommern',
    'Nieder-\nsachsen': 'Niedersachsen',
    'Nordrhein-\nWestfalen': 'Nordrhein_Westfalen',
    'Rheinland-\nPfalz': 'Rheinland_Pfalz',
    'Sachsen-\nAnhalt': 'Sachsen_Anhalt',
    'Schleswig-\nHolstein': 'Schleswig_Holstein',
    'Sonstige': 'special',
    'Deutschland': 'germany',
})


In [37]:
# Confirm the new column names.
regis_model_df.columns

Index(['car_company', 'model', 'Typ_Schl._Nr.', 'Baden_Württemberg', 'Bayern',
       'Berlin', 'Brandenburg', 'Bremen', 'Hamburg', 'Hessen',
       'Mecklenburg_Vorpommern', 'Niedersachsen', 'Nordrhein_Westfalen',
       'Rheinland_Pfalz', 'Saarland', 'Sachsen', 'Sachsen_Anhalt',
       'Schleswig_Holstein', 'Thüringen', 'special', 'germany'],
      dtype='object')

In [38]:
# Names of all count columns that are cleaned and converted below
cols_to_convert = [
    'Baden_Württemberg', 'Bayern', 'Berlin', 'Brandenburg', 'Bremen',
    'Hamburg', 'Hessen', 'Mecklenburg_Vorpommern', 'Niedersachsen',
    'Nordrhein_Westfalen', 'Rheinland_Pfalz', 'Saarland', 'Sachsen',
    'Sachsen_Anhalt', 'Schleswig_Holstein', 'Thüringen', 'special', 'germany',
]
# In each of those columns, a cell holding '-' becomes the string '0'
regis_model_df = regis_model_df.assign(
    **{name: regis_model_df[name].replace('-', '0') for name in cols_to_convert}
)

In [39]:
# Preview the first five rows after '-' was replaced
regis_model_df.head()

Unnamed: 0,car_company,model,Typ_Schl._Nr.,Baden_Württemberg,Bayern,Berlin,Brandenburg,Bremen,Hamburg,Hessen,...,Niedersachsen,Nordrhein_Westfalen,Rheinland_Pfalz,Saarland,Sachsen,Sachsen_Anhalt,Schleswig_Holstein,Thüringen,special,germany
0,ALPINA,BMW ALPINA B3 Limousine,ACU,7,8,4,1,1,0,3,...,3,6,4,3,0,0,2,0,0,42.0
1,ALPINA,BMW ALPINA B3 Touring,ACV,26,40,3,1,2,4,16,...,19,32,17,1,2,1,13,1,0,178.0
2,ALPINA,BMW ALPINA B5,ACW,3,9,1,0,0,0,5,...,0,7,2,0,0,0,2,0,0,29.0
3,ALPINA,BMW ALPINA B5,ACX,25,33,2,2,2,4,12,...,9,26,5,4,3,1,5,1,0,135.0
4,ALPINA,BMW ALPINA B8,ADG,0,4,0,1,0,0,0,...,1,3,3,0,0,0,1,0,0,13.0


In [40]:
# Turn ',' into '.' inside the string values of the count columns.
# A plain replace(',', '.') only matches cells whose entire value is ',', so it
# never touches a comma embedded in a longer value. regex=True makes pandas
# substitute the pattern within string cells; non-string cells are left as is.
# NOTE(review): presumably meant to normalise decimal commas - confirm that the
# source can contain such values at all, since the cell after next casts to int.
regis_model_df = regis_model_df.assign(
    **{
        name: regis_model_df[name].replace(',', '.', regex=True)
        for name in cols_to_convert
    }
)

In [41]:
# Check the dtypes before the integer conversion.
regis_model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2898 entries, 0 to 2963
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   car_company             2898 non-null   object 
 1   model                   2898 non-null   object 
 2   Typ_Schl._Nr.           2898 non-null   object 
 3   Baden_Württemberg       2898 non-null   object 
 4   Bayern                  2898 non-null   object 
 5   Berlin                  2898 non-null   object 
 6   Brandenburg             2898 non-null   object 
 7   Bremen                  2898 non-null   object 
 8   Hamburg                 2898 non-null   object 
 9   Hessen                  2898 non-null   object 
 10  Mecklenburg_Vorpommern  2898 non-null   object 
 11  Niedersachsen           2898 non-null   object 
 12  Nordrhein_Westfalen     2898 non-null   object 
 13  Rheinland_Pfalz         2898 non-null   object 
 14  Saarland                2898 non-null   

In [42]:
# Cast every count column to an integer dtype
regis_model_df = regis_model_df.assign(
    **{name: regis_model_df[name].astype(int) for name in cols_to_convert}
)

In [43]:
# Check the dtypes after the integer conversion.
regis_model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2898 entries, 0 to 2963
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   car_company             2898 non-null   object
 1   model                   2898 non-null   object
 2   Typ_Schl._Nr.           2898 non-null   object
 3   Baden_Württemberg       2898 non-null   int64 
 4   Bayern                  2898 non-null   int64 
 5   Berlin                  2898 non-null   int64 
 6   Brandenburg             2898 non-null   int64 
 7   Bremen                  2898 non-null   int64 
 8   Hamburg                 2898 non-null   int64 
 9   Hessen                  2898 non-null   int64 
 10  Mecklenburg_Vorpommern  2898 non-null   int64 
 11  Niedersachsen           2898 non-null   int64 
 12  Nordrhein_Westfalen     2898 non-null   int64 
 13  Rheinland_Pfalz         2898 non-null   int64 
 14  Saarland                2898 non-null   int64 
 15  Sach

In [44]:
# Tag every row with the constant year 2021
regis_model_df = regis_model_df.assign(year=2021)

In [45]:
# Check the frame with the new 'year' column.
regis_model_df

Unnamed: 0,car_company,model,Typ_Schl._Nr.,Baden_Württemberg,Bayern,Berlin,Brandenburg,Bremen,Hamburg,Hessen,...,Nordrhein_Westfalen,Rheinland_Pfalz,Saarland,Sachsen,Sachsen_Anhalt,Schleswig_Holstein,Thüringen,special,germany,year
0,ALPINA,BMW ALPINA B3 Limousine,ACU,7,8,4,1,1,0,3,...,6,4,3,0,0,2,0,0,42,2021
1,ALPINA,BMW ALPINA B3 Touring,ACV,26,40,3,1,2,4,16,...,32,17,1,2,1,13,1,0,178,2021
2,ALPINA,BMW ALPINA B5,ACW,3,9,1,0,0,0,5,...,7,2,0,0,0,2,0,0,29,2021
3,ALPINA,BMW ALPINA B5,ACX,25,33,2,2,2,4,12,...,26,5,4,3,1,5,1,0,135,2021
4,ALPINA,BMW ALPINA B8,ADG,0,4,0,1,0,0,0,...,3,3,0,0,0,1,0,0,13,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2957,VOLVO (S),SONSTIGE/NICHT GETYPT,BMQ,10,20,3,2,1,8,13,...,33,6,1,5,0,3,0,2,118,2021
2959,ZHEJIANG GEELY (RC),Lynk & Co 01,AAA,0,0,0,0,0,0,329,...,0,0,0,0,0,0,0,0,329,2021
2960,ZHEJIANG GEELY (RC),Lynk & Co 01,AAB,0,3,0,3,0,1,1073,...,2,0,0,2,0,0,0,0,1087,2021
2962,SONSTIGE HERSTELLER,Lynk & Co 01,AAB,218,396,19,61,13,4,171,...,320,56,22,69,41,96,31,8,2436,2021


In [47]:
# Put all states into one column: reshape from one column per state (wide) to
# one row per original row and state (long).
# The identifier columns are listed by name rather than by position, so the
# reshape does not depend on the column order, and melt() receives a plain list
# of labels as `id_vars`.
id_cols = ['car_company', 'model', 'Typ_Schl._Nr.', 'germany', 'year']
# Every remaining column (the state columns plus 'special') is unpivoted.
state_cols = [name for name in regis_model_df.columns if name not in id_cols]
regis_model_df = regis_model_df.melt(
    id_vars=id_cols,
    value_vars=state_cols,
    var_name='federal_state',
    value_name='new_registration',
)

In [49]:
# Preview the first two rows of the long-format frame.
regis_model_df.head(2)

Unnamed: 0,car_company,model,Typ_Schl._Nr.,germany,year,federal_state,new_registration
0,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Baden_Württemberg,7
1,ALPINA,BMW ALPINA B3 Touring,ACV,178,2021,Baden_Württemberg,26


In [50]:
# Spot-check the reshape: list all rows of a single model
regis_model_df.loc[regis_model_df['model'].eq('BMW ALPINA B3 Limousine')]

Unnamed: 0,car_company,model,Typ_Schl._Nr.,germany,year,federal_state,new_registration
0,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Baden_Württemberg,7
2898,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Bayern,8
5796,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Berlin,4
8694,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Brandenburg,1
11592,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Bremen,1
14490,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Hamburg,0
17388,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Hessen,3
20286,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Mecklenburg_Vorpommern,0
23184,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Niedersachsen,3
26082,ALPINA,BMW ALPINA B3 Limousine,ACU,42,2021,Nordrhein_Westfalen,6


In [None]:
# TODO: run at the very end, once cleaning is finished, to export the result:
# regis_model_df.to_csv('regis_model_2021.csv', index=False)

In [None]:
# Parked snippet for reading tables from a PDF with tabula. It sits inside a
# string literal, so nothing in it is executed.
"""
For PDF
import tabula
df = tabula.read_pdf('fz4_2016_pdf-2.pdf', pages='10-12')
"""

## Analysis
1. How many car companies produce e-cars, and what is each company's share (most successful company)?
4. How many models are there, and what is each model's share (most popular model)?