## Data Cleaning

### Approach

- Importing and inspecting the data sets.
- Cleaning the data based on what the inspection reveals.
- Creating Meta-Data for all the data sets.

#### Importing Libraries

In [2]:
# Importing python libraries
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

import sql_functions as sql

In [3]:
# Checking the directory currently in
# (the relative file names used for reading and writing in this notebook resolve against it)
print(os.getcwd())


/Users/andreasdangaris/NeueFische/capstone_project_ev/capstone_project_ev


In [27]:
# Let pandas render up to 100 columns before it truncates wide DataFrames
pd.options.display.max_columns = 100

In [29]:
# Loading the first sheet of the registration workbook (path relative to the working directory)
df_registered_cars = pd.read_excel(io='registration_08_23.xlsx', sheet_name=0)

In [30]:
# Confirming that the loaded object is a pandas DataFrame
type(df_registered_cars)

pandas.core.frame.DataFrame

In [31]:
# Showing the first two rows to check the DataFrame's content
df_registered_cars.iloc[:2]

Unnamed: 0,Land,Benzin,Diesel,Flüssiggas\n(LPG)\n(einschließlich\nbivalent),Erdgas\n(CNG)\n(einschließlich\nbivalent),Elektro\n(BEV),Hybrid\ninsgesamt,darunter\nPlug-in,Zum Vergleich:\nInsgesamt,Anzahl insgesamt Brennstoffzelle (Wasserstoff),Total Gas,Jahr,Month,andere Hybrid
0,Baden-Württemberg,,,,,8706,18395,4287.0,56104,7.0,265,2023,Jan-Feb,14108.0
1,Bayern,,,,,9558,22954,3629.0,81058,8.0,325,2023,Jan-Feb,19325.0


### Simple EDA "newly registered cars"

In [32]:
# Checking the number of rows and columns, returned as a (rows, columns) tuple
df_registered_cars.shape

(256, 14)

In [33]:
# Checking the non-null count and data type of every column
df_registered_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 14 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Land                                            256 non-null    object 
 1   Benzin                                          224 non-null    float64
 2   Diesel                                          224 non-null    float64
 3   Flüssiggas
(LPG)
(einschließlich
bivalent)      224 non-null    float64
 4   Erdgas
(CNG)
(einschließlich
bivalent)          224 non-null    float64
 5   Elektro
(BEV)                                   256 non-null    object 
 6   Hybrid
insgesamt                                256 non-null    int64  
 7   darunter
Plug-in                                112 non-null    float64
 8   Zum Vergleich:
Insgesamt                        256 non-null    int64  
 9   Anzahl insgesamt Brennstoffzelle (Wassersto

In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_registered_cars.describe()

Unnamed: 0,Benzin,Diesel,Flüssiggas\n(LPG)\n(einschließlich\nbivalent),Erdgas\n(CNG)\n(einschließlich\nbivalent),Hybrid\ninsgesamt,darunter\nPlug-in,Zum Vergleich:\nInsgesamt,Anzahl insgesamt Brennstoffzelle (Wasserstoff),Total Gas,Jahr,andere Hybrid
count,224.0,224.0,224.0,224.0,256.0,112.0,256.0,39.0,256.0,256.0,32.0
mean,107921.897321,78592.285714,459.46875,429.486607,11199.019531,9054.080357,186189.382812,34.153846,851.917969,2015.5,17311.3125
std,109916.974184,87790.281046,640.838275,504.176218,27847.475319,16676.532674,200125.489492,49.367333,1043.335427,4.618802,26731.924427
min,5020.0,3625.0,20.0,10.0,83.0,122.0,2790.0,1.0,15.0,2008.0,796.0
25%,35264.25,17724.75,88.0,111.5,541.75,703.0,53837.0,5.5,199.5,2011.75,3091.25
50%,54902.0,34090.5,220.5,205.5,1887.0,2934.0,86148.5,14.0,408.5,2015.5,7732.5
75%,170532.75,132045.0,535.5,600.25,7781.75,8057.5,315525.5,40.5,1143.25,2019.25,16814.75
max,521661.0,339338.0,4037.0,2845.0,184240.0,84491.0,745980.0,244.0,6015.0,2023.0,114141.0


In [35]:
# Listing the original column names before they are renamed
df_registered_cars.columns

Index(['Land', 'Benzin', 'Diesel',
       'Flüssiggas\n(LPG)\n(einschließlich\nbivalent)',
       'Erdgas\n(CNG)\n(einschließlich\nbivalent)', 'Elektro\n(BEV)',
       'Hybrid\ninsgesamt', 'darunter\nPlug-in', 'Zum Vergleich:\nInsgesamt',
       'Anzahl insgesamt Brennstoffzelle (Wasserstoff)', 'Total Gas', 'Jahr',
       'Month', 'andere Hybrid'],
      dtype='object')

In [37]:
# Translating column names from German to English.
# Lower-casing and replacing ' ' with '_' is a separate step after this one.
# rename() returns a new DataFrame; re-binding the name avoids inplace=True.
df_registered_cars = df_registered_cars.rename(columns={
    'Land': 'Country',
    'Benzin': 'Gasoline',
    'Diesel': 'Diesel',
    'Flüssiggas\n(LPG)\n(einschließlich\nbivalent)': 'Liquefied petroleum gas (LPG)',
    'Erdgas\n(CNG)\n(einschließlich\nbivalent)': 'Compressed natural gas (CNG)',
    'Elektro\n(BEV)': 'Battery-electric vehicle (BEV)',
    'Hybrid\ninsgesamt': 'Total hybrid',
    'darunter\nPlug-in': 'Plug-in',
    'Zum Vergleich:\nInsgesamt': 'Total',
    'Anzahl insgesamt Brennstoffzelle (Wasserstoff)': 'Total fuel cell (hydrogen)',
    'Total Gas': 'Total gas',
    'Jahr': 'Year',
    'Month': 'Month',
    'andere Hybrid': 'Other hybrid',
})



In [38]:
# Normalise every column name: lower case, with spaces turned into underscores
df_registered_cars.columns = [
    name.lower().replace(' ', '_') for name in df_registered_cars.columns
]

In [39]:
# Checking the translated, lower-case column names
df_registered_cars.columns

Index(['country', 'gasoline', 'diesel', 'liquefied_petroleum_gas_(lpg)',
       'compressed_natural_gas_(cng)', 'battery-electric_vehicle_(bev)',
       'total_hybrid', 'plug-in', 'total', 'total_fuel_cell_(hydrogen)',
       'total_gas', 'year', 'month', 'other_hybrid'],
      dtype='object')

In [40]:
# Renaming three columns: 'country' -> 'federal_state',
# 'battery-electric_vehicle_(bev)' -> 'battery_electric_vehicle' and 'plug-in' -> 'hybrid_electric'.
# rename() returns a new DataFrame; re-binding the name avoids inplace=True.
df_registered_cars = df_registered_cars.rename(columns={
    'country': 'federal_state',
    'battery-electric_vehicle_(bev)': 'battery_electric_vehicle',
    'plug-in': 'hybrid_electric',
})

In [41]:
# Dropping the unnecessary columns (the per-fuel columns and 'month').
# drop() returns a new DataFrame; re-binding the name avoids inplace=True.
df_registered_cars = df_registered_cars.drop(columns=[
    'gasoline',
    'diesel',
    'liquefied_petroleum_gas_(lpg)',
    'compressed_natural_gas_(cng)',
    'month',
])

In [42]:
# Checking the remaining columns, their non-null counts and data types
df_registered_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   federal_state               256 non-null    object 
 1   battery_electric_vehicle    256 non-null    object 
 2   total_hybrid                256 non-null    int64  
 3   hybrid_electric             112 non-null    float64
 4   total                       256 non-null    int64  
 5   total_fuel_cell_(hydrogen)  39 non-null     float64
 6   total_gas                   256 non-null    int64  
 7   year                        256 non-null    int64  
 8   other_hybrid                32 non-null     float64
dtypes: float64(3), int64(4), object(2)
memory usage: 18.1+ KB


#### Meta Data

|Variable |Definition  | Key|
|--- | --- | ---|
|**federal_state**| Federal state of Germany ||
|**battery_electric_vehicle**| Count of pure electric cars ||
|**total_hybrid**| Count of all hybrids (plug-in hybrids included) ||
|**hybrid_electric**| Count of all plug-in hybrids ||
|**total**| Total count of newly registered cars across all fuel types ||
|**total_fuel_cell_(hydrogen)**| Count of hydrogen cars ||
|**total_gas**| Count of CNG and LPG cars ||
|**year**| Year of observation (2008-2023) ||
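|**other_hybrid**| Count of all hybrids (plug-in hybrids excluded) |total_hybrid - hybrid_electric|
|**fuel_engine**| Count of remaining fuel engine cars (column added later in this notebook) |total - total_hybrid - battery_electric_vehicle - total_fuel_cell_(hydrogen) - total_gas|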

#### Detailed EDA

In [43]:
# Count the missing values in each column
print(df_registered_cars.isna().sum())

federal_state                   0
battery_electric_vehicle        0
total_hybrid                    0
hybrid_electric               144
total                           0
total_fuel_cell_(hydrogen)    217
total_gas                       0
year                            0
other_hybrid                  224
dtype: int64


In [18]:
# Getting rid of leading or trailing whitespaces in every string cell;
# non-string values are passed through unchanged.
# Done column by column with Series.map, because DataFrame.applymap is deprecated
# in recent pandas releases.
df_registered_cars = df_registered_cars.apply(
    lambda column: column.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
)
df_registered_cars.tail(2)

Unnamed: 0,federal_state,battery_electric_vehicle,total_hybrid,hybrid_electric,total,total_fuel_cell_(hydrogen),total_gas,year,other_hybrid
254,Schleswig-Holstein,2,206.0,,79943,,,2008,
255,Thüringen,-,140.0,,66687,,,2008,


In [19]:
# Flag rows that repeat an earlier row exactly (the first occurrence is not flagged),
# then print how many were found followed by the per-row flags
duplicates = df_registered_cars.duplicated(subset=None, keep='first')
duplicate_count = duplicates.sum()
print('Number of duplicate entries: ', duplicate_count)
print(duplicates)

Number of duplicate entries:  0
0      False
1      False
2      False
3      False
4      False
       ...  
251    False
252    False
253    False
254    False
255    False
Length: 256, dtype: bool


In [20]:
# Showing the last three rows after the duplicate check (no rows were removed by it)
df_registered_cars.iloc[-3:]

Unnamed: 0,federal_state,battery_electric_vehicle,total_hybrid,hybrid_electric,total,total_fuel_cell_(hydrogen),total_gas,year,other_hybrid
253,Sachsen-Anhalt,-,143.0,,59576,,,2008,
254,Schleswig-Holstein,2,206.0,,79943,,,2008,
255,Thüringen,-,140.0,,66687,,,2008,


In [25]:
# Checking whether any object column still holds values with a leading or trailing space
for col in df_registered_cars.columns:
    if df_registered_cars[col].dtype != 'O':  # only object columns can contain strings
        continue
    # na=False: non-string entries (numbers, NaN) count as "no match" instead of producing NaN
    if df_registered_cars[col].str.startswith(' ', na=False).any():
        print(f"Column {col} contains values starting with a space.")
    if df_registered_cars[col].str.endswith(' ', na=False).any():
        print(f"Column {col} contains values ending with a space.")



In [26]:
# Re-checking non-null counts and data types after the whitespace and duplicate checks
# NOTE(review): this cell's execution count is lower than that of the cells above it;
# re-run the notebook top to bottom before relying on the displayed output
df_registered_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   federal_state               256 non-null    object 
 1   battery_electric_vehicle    256 non-null    object 
 2   total_hybrid                224 non-null    float64
 3   hybrid_electric             112 non-null    float64
 4   total                       256 non-null    int64  
 5   total_fuel_cell_(hydrogen)  48 non-null     object 
 6   total_gas                   48 non-null     float64
 7   year                        256 non-null    int64  
 8   other_hybrid                32 non-null     float64
dtypes: float64(4), int64(2), object(3)
memory usage: 18.1+ KB


In [28]:
# Looking at the first row
df_registered_cars.head(1)

Unnamed: 0,federal_state,battery_electric_vehicle,total_hybrid,hybrid_electric,total,total_fuel_cell_(hydrogen),total_gas,year,other_hybrid
0,Baden-Württemberg,8706,,4287.0,56104,7,265.0,2023,14108.0


In [46]:
# Fill the gaps in 'other_hybrid' with total_hybrid - hybrid_electric.
# Rows that already carry a value keep it; rows without a 'hybrid_electric' count stay missing.
df_registered_cars['other_hybrid'] = df_registered_cars['other_hybrid'].fillna(
    df_registered_cars['total_hybrid'] - df_registered_cars['hybrid_electric']
)

# Show rows 33 to 40 to check the calculated other_hybrid entries
df_registered_cars.iloc[33:41]

Unnamed: 0,federal_state,battery_electric_vehicle,total_hybrid,hybrid_electric,total,total_fuel_cell_(hydrogen),total_gas,year,other_hybrid
33,Bayern,66860,162937,62898.0,531550,65.0,1789,2021,100039.0
34,Berlin,8737,20620,8889.0,60376,24.0,265,2021,11731.0
35,Brandenburg,6638,14219,5665.0,50986,7.0,352,2021,8554.0
36,Bremen,1833,5014,2189.0,15561,1.0,68,2021,2825.0
37,Hamburg,7334,22912,8295.0,92464,16.0,156,2021,14617.0
38,Hessen,33727,72303,32006.0,276989,33.0,1640,2021,40297.0
39,Mecklenburg-Vorpommern,3259,7244,2354.0,29795,7.0,132,2021,4890.0
40,Niedersachsen,40084,55973,27969.0,269551,38.0,2060,2021,28004.0


In [50]:
# Validate 'year' by parsing it as a four-digit year, then store only the year number again.
# The explicit int64 cast keeps the column dtype stable: `.dt.year` yields int32 on pandas >= 2.0
# and int64 on older releases.
df_registered_cars['year'] = (
    pd.to_datetime(df_registered_cars['year'], format='%Y')
    .dt.year
    .astype('int64')
)

In [51]:
# Check the data type of 'year' (and of the other columns) after the conversion
df_registered_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   federal_state               256 non-null    object 
 1   battery_electric_vehicle    256 non-null    object 
 2   total_hybrid                256 non-null    int64  
 3   hybrid_electric             112 non-null    float64
 4   total                       256 non-null    int64  
 5   total_fuel_cell_(hydrogen)  39 non-null     float64
 6   total_gas                   256 non-null    int64  
 7   year                        256 non-null    int64  
 8   other_hybrid                112 non-null    float64
dtypes: float64(3), int64(4), object(2)
memory usage: 18.1+ KB


In [52]:
# Looking at the first row after the 'year' conversion
df_registered_cars.head(1)

Unnamed: 0,federal_state,battery_electric_vehicle,total_hybrid,hybrid_electric,total,total_fuel_cell_(hydrogen),total_gas,year,other_hybrid
0,Baden-Württemberg,8706,18395,4287.0,56104,7.0,265,2023,14108.0


In [67]:
# 'battery_electric_vehicle': turn the '-' placeholder into NaN, cast to the nullable
# Int64 dtype (via float), then fill the missing values with 0
df_registered_cars['battery_electric_vehicle'] = (
    df_registered_cars['battery_electric_vehicle']
    .replace('-', np.nan)
    .astype(float)
    .astype('Int64')
    .fillna(0)
)

# 'total_fuel_cell_(hydrogen)': same cast and fill, without the placeholder replacement
# NOTE(review): missing entries become 0 here — confirm that "not reported" should count as zero
df_registered_cars['total_fuel_cell_(hydrogen)'] = (
    df_registered_cars['total_fuel_cell_(hydrogen)']
    .astype(float)
    .astype('Int64')
    .fillna(0)
)

In [68]:
# Create the 'fuel_engine' column: what remains of 'total' after subtracting the hybrid,
# battery-electric, fuel-cell and gas counts
df_registered_cars['fuel_engine'] = (
    df_registered_cars['total']
    .sub(df_registered_cars['total_hybrid'])
    .sub(df_registered_cars['battery_electric_vehicle'])
    .sub(df_registered_cars['total_fuel_cell_(hydrogen)'])
    .sub(df_registered_cars['total_gas'])
)


In [69]:
# Check that 'fuel_engine' now appears among the columns
df_registered_cars.columns

Index(['federal_state', 'battery_electric_vehicle', 'total_hybrid',
       'hybrid_electric', 'total', 'total_fuel_cell_(hydrogen)', 'total_gas',
       'year', 'other_hybrid', 'fuel_engine'],
      dtype='object')

In [70]:
# Show the first three rows to look at the entries of the new 'fuel_engine' column
df_registered_cars.head(3)

Unnamed: 0,federal_state,battery_electric_vehicle,total_hybrid,hybrid_electric,total,total_fuel_cell_(hydrogen),total_gas,year,other_hybrid,fuel_engine
0,Baden-Württemberg,8706,18395,4287.0,56104,7,265,2023,14108.0,28731
1,Bayern,9558,22954,3629.0,81058,8,325,2023,19325.0,48213
2,Berlin,1449,4088,607.0,11459,6,50,2023,3481.0,5866


In [71]:
# Check non-null counts and data types after the conversions and the added column
df_registered_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   federal_state               256 non-null    object 
 1   battery_electric_vehicle    256 non-null    Int64  
 2   total_hybrid                256 non-null    int64  
 3   hybrid_electric             112 non-null    float64
 4   total                       256 non-null    int64  
 5   total_fuel_cell_(hydrogen)  256 non-null    Int64  
 6   total_gas                   256 non-null    int64  
 7   year                        256 non-null    int64  
 8   other_hybrid                112 non-null    float64
 9   fuel_engine                 256 non-null    Int64  
dtypes: Int64(3), float64(2), int64(4), object(1)
memory usage: 20.9+ KB


In [None]:
"""
# Replace 0 with NaN in the entire dataframe
# df_registered_cars = df_registered_cars.replace(0, np.nan)
"""

In [72]:
# Build the DataFrame for the hybrid-included analysis: keep the years 2017 through 2023
# (both bounds inclusive) and renumber the rows from 0
df_reg_cars_hybrid_analysis = (
    df_registered_cars
    .loc[df_registered_cars['year'].between(2017, 2023)]
    .reset_index(drop=True)
)



In [78]:
def _to_nullable_int(frame, columns=('hybrid_electric', 'other_hybrid')):
    """Cast the given columns of ``frame`` to the nullable ``Int64`` dtype (via float), in place."""
    for column in columns:
        frame[column] = frame[column].astype(float).astype('Int64')


# Convert 'hybrid_electric' and 'other_hybrid' to int Dtype in both DataFrames
_to_nullable_int(df_registered_cars)
_to_nullable_int(df_reg_cars_hybrid_analysis)

In [79]:
# Check the data types of 'hybrid_electric' and 'other_hybrid' after the conversion
df_registered_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   federal_state               256 non-null    object
 1   battery_electric_vehicle    256 non-null    Int64 
 2   total_hybrid                256 non-null    int64 
 3   hybrid_electric             112 non-null    Int64 
 4   total                       256 non-null    int64 
 5   total_fuel_cell_(hydrogen)  256 non-null    Int64 
 6   total_gas                   256 non-null    int64 
 7   year                        256 non-null    int64 
 8   other_hybrid                112 non-null    Int64 
 9   fuel_engine                 256 non-null    Int64 
dtypes: Int64(5), int64(4), object(1)
memory usage: 21.4+ KB


In [80]:
# Checking the shape (rows, columns) of the hybrid-analysis DataFrame,
# followed by its non-null counts and data types
print(df_reg_cars_hybrid_analysis.shape)
df_reg_cars_hybrid_analysis.info()

(112, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   federal_state               112 non-null    object
 1   battery_electric_vehicle    112 non-null    Int64 
 2   total_hybrid                112 non-null    int64 
 3   hybrid_electric             112 non-null    Int64 
 4   total                       112 non-null    int64 
 5   total_fuel_cell_(hydrogen)  112 non-null    Int64 
 6   total_gas                   112 non-null    int64 
 7   year                        112 non-null    int64 
 8   other_hybrid                112 non-null    Int64 
 9   fuel_engine                 112 non-null    Int64 
dtypes: Int64(5), int64(4), object(1)
memory usage: 9.4+ KB


In [73]:
# Looking at the first two rows of the hybrid-analysis DataFrame
df_reg_cars_hybrid_analysis.head(2)

Unnamed: 0,federal_state,battery_electric_vehicle,total_hybrid,hybrid_electric,total,total_fuel_cell_(hydrogen),total_gas,year,other_hybrid,fuel_engine
0,Baden-Württemberg,8706,18395,4287.0,56104,7,265,2023,14108.0,28731
1,Bayern,9558,22954,3629.0,81058,8,325,2023,19325.0,48213


#### Saving Dataframe as CSV 

In [81]:
# Write df_registered_cars to a CSV file in the current working directory, without the index column
df_registered_cars.to_csv(path_or_buf='registered_cars_clean.csv', index=False)

In [82]:
# Write df_reg_cars_hybrid_analysis to a CSV file in the current working directory, without the index column
df_reg_cars_hybrid_analysis.to_csv(path_or_buf='reg_cars_hybrid_analysis_clean.csv', index=False)