## Data Cleaning

### Approach

- Importing and inspecting the data sets.
- Cleaning the data based on the findings of the inspection.
- Creating Meta-Data for all the data sets.

#### Importing Libraries

In [2]:
# Importing python libraries (standard library first, then third-party)
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

In [3]:
# Checking the current working directory; the relative file path used to load
# the Excel workbook is resolved against it.
# `os` is imported together with the other libraries in the import cell.
print(os.getcwd())


/Users/andreasdangaris/NeueFische/capstone_project_ev/capstone_project_ev


In [4]:
# Load the '08_23' sheet of the Excel workbook stored in the repo into a dataframe
# (the path is relative to the current working directory)
df_registered_cars = pd.read_excel(
    io='newly_registered_cars_08_23.xlsx',
    sheet_name='08_23',
)

In [5]:
# Confirming that the loaded object is a pandas DataFrame
type(df_registered_cars)

pandas.core.frame.DataFrame

In [6]:
# Displaying the first two rows of the dataframe to check its content
df_registered_cars.head(2)

Unnamed: 0,Land,Benzin,Diesel,Flüssiggas\n(LPG)\n(einschließlich\nbivalent),Erdgas\n(CNG)\n(einschließlich\nbivalent),Elektro\n(BEV),Hybrid\ninsgesamt,darunter\nPlug-in,Zum Vergleich:\nInsgesamt,Anzahl insgesamt Brennstoffzelle (Wasserstoff),Total Gas,Jahr,Month,andere Hybrid
0,Baden-Württemberg,,,,,8706,,4287.0,56104,7,265.0,2023,Jan-Feb,14108.0
1,Bayern,,,,,9558,,3629.0,81058,8,325.0,2023,Jan-Feb,19325.0


### Simple EDA "newly registered cars"

In [7]:
# Checking the number of rows and columns: (rows, columns)
df_registered_cars.shape

(256, 14)

In [8]:
# Checking non-null counts and data types of every column
df_registered_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 14 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Land                                            256 non-null    object 
 1   Benzin                                          224 non-null    float64
 2   Diesel                                          224 non-null    float64
 3   Flüssiggas
(LPG)
(einschließlich
bivalent)      224 non-null    float64
 4   Erdgas
(CNG)
(einschließlich
bivalent)          224 non-null    float64
 5   Elektro
(BEV)                                   256 non-null    object 
 6   Hybrid
insgesamt                                224 non-null    float64
 7   darunter
Plug-in                                112 non-null    float64
 8   Zum Vergleich:
Insgesamt                        256 non-null    int64  
 9   Anzahl insgesamt Brennstoffzelle (Wassersto

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_registered_cars.describe()

Unnamed: 0,Benzin,Diesel,Flüssiggas\n(LPG)\n(einschließlich\nbivalent),Erdgas\n(CNG)\n(einschließlich\nbivalent),Hybrid\ninsgesamt,darunter\nPlug-in,Zum Vergleich:\nInsgesamt,Total Gas,Jahr,andere Hybrid
count,224.0,224.0,224.0,224.0,224.0,112.0,256.0,48.0,256.0,32.0
mean,107921.897321,78592.285714,459.46875,429.486607,8617.808036,9054.080357,186189.382812,687.395833,2015.5,17311.3125
std,109916.974184,87790.281046,640.838275,504.176218,22744.882688,16676.532674,200125.489492,919.443929,4.618802,26731.924427
min,5020.0,3625.0,20.0,10.0,83.0,122.0,2790.0,15.0,2008.0,796.0
25%,35264.25,17724.75,88.0,111.5,482.0,703.0,53837.0,131.0,2011.75,3091.25
50%,54902.0,34090.5,220.5,205.5,1422.5,2934.0,86148.5,325.5,2015.5,7732.5
75%,170532.75,132045.0,535.5,600.25,5384.5,8057.5,315525.5,743.75,2019.25,16814.75
max,521661.0,339338.0,4037.0,2845.0,171750.0,84491.0,745980.0,4121.0,2023.0,114141.0


In [10]:
# Listing the original (German, partly multi-line) column names
df_registered_cars.columns

Index(['Land', 'Benzin', 'Diesel',
       'Flüssiggas\n(LPG)\n(einschließlich\nbivalent)',
       'Erdgas\n(CNG)\n(einschließlich\nbivalent)', 'Elektro\n(BEV)',
       'Hybrid\ninsgesamt', 'darunter\nPlug-in', 'Zum Vergleich:\nInsgesamt',
       'Anzahl insgesamt Brennstoffzelle (Wasserstoff)', 'Total Gas', 'Jahr',
       'Month', 'andere Hybrid'],
      dtype='object')

In [11]:
# Translating column names from German to English.
# Lower-casing and replacing ' ' with '_' is done in the following step.
# 'Diesel' and 'Month' are already English, so they need no entry here.
# rename() returns a new dataframe; it is assigned back instead of using
# inplace=True, which keeps the step explicit and chainable.
df_registered_cars = df_registered_cars.rename(columns={
    'Land': 'Country',
    'Benzin': 'Gasoline',
    'Flüssiggas\n(LPG)\n(einschließlich\nbivalent)': 'Liquefied petroleum gas (LPG)',
    'Erdgas\n(CNG)\n(einschließlich\nbivalent)': 'Compressed natural gas (CNG)',
    'Elektro\n(BEV)': 'Battery-electric vehicle (BEV)',
    'Hybrid\ninsgesamt': 'Total hybrid',
    'darunter\nPlug-in': 'Plug-in',
    'Zum Vergleich:\nInsgesamt': 'Total',
    'Anzahl insgesamt Brennstoffzelle (Wasserstoff)': 'Total fuel cell (hydrogen)',
    'Total Gas': 'Total gas',
    'Jahr': 'Year',
    'andere Hybrid': 'Other hybrid',
})


In [12]:
# Normalise the column labels: lower-case first, then turn every space into an underscore
df_registered_cars.columns = (
    df_registered_cars.columns
    .str.lower()
    .str.replace(' ', '_')
)

In [13]:
# Checking the new (English, lower-case, underscore-separated) column names
df_registered_cars.columns

Index(['country', 'gasoline', 'diesel', 'liquefied_petroleum_gas_(lpg)',
       'compressed_natural_gas_(cng)', 'battery-electric_vehicle_(bev)',
       'total_hybrid', 'plug-in', 'total', 'total_fuel_cell_(hydrogen)',
       'total_gas', 'year', 'month', 'other_hybrid'],
      dtype='object')

In [14]:
# Renaming 'country' to 'federal_state' and giving the BEV and plug-in hybrid
# columns names without hyphens or parentheses.
# The result is assigned back instead of mutating the dataframe in place.
df_registered_cars = df_registered_cars.rename(columns={
    'country': 'federal_state',
    'battery-electric_vehicle_(bev)': 'battery_electric_vehicle',
    'plug-in': 'hybrid_electric',
})

In [15]:
# Dropping the columns considered unnecessary (combustion/gas fuel types and month).
# drop(columns=...) states the axis explicitly; the result is assigned back
# instead of mutating the dataframe in place.
df_registered_cars = df_registered_cars.drop(columns=[
    'gasoline',
    'diesel',
    'liquefied_petroleum_gas_(lpg)',
    'compressed_natural_gas_(cng)',
    'month',
])

In [16]:
# Re-checking columns, non-null counts and data types after renaming and dropping columns
df_registered_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   federal_state               256 non-null    object 
 1   battery_electric_vehicle    256 non-null    object 
 2   total_hybrid                224 non-null    float64
 3   hybrid_electric             112 non-null    float64
 4   total                       256 non-null    int64  
 5   total_fuel_cell_(hydrogen)  48 non-null     object 
 6   total_gas                   48 non-null     float64
 7   year                        256 non-null    int64  
 8   other_hybrid                32 non-null     float64
dtypes: float64(4), int64(2), object(3)
memory usage: 18.1+ KB


#### Meta Data

|Variable |Definition  | Key|
|--- | --- | ---|
|**federal_state**| Federal state of Germany ||
|**battery_electric_vehicle**| Count of pure electric cars ||
|**total_hybrid**| Count of all hybrids (plug-in hybrids included) ||
|**hybrid_electric**| Count of all plug-in hybrids ||
|**total**| Total count of newly registered cars across all fuel types ||
|**total_fuel_cell_(hydrogen)**| Count of hydrogen cars ||
|**total_gas**| Count of CNG and LPG cars ||
|**year**| Year of observation (2008-2023) ||
|**other_hybrid**| Count of all hybrids (plug-in hybrids excluded) ||

#### Detailed EDA

In [17]:
# Count the missing values in each column
print(df_registered_cars.isnull().sum())

federal_state                   0
battery_electric_vehicle        0
total_hybrid                   32
hybrid_electric               144
total                           0
total_fuel_cell_(hydrogen)    208
total_gas                     208
year                            0
other_hybrid                  224
dtype: int64


In [18]:
# Getting rid of leading or trailing whitespace in all string values.
# DataFrame.applymap is deprecated (pandas >= 2.1); mapping every column with
# Series.map is the element-wise equivalent and works across pandas versions.
# Non-string values (numbers, NaN) are passed through unchanged.
df_registered_cars = df_registered_cars.apply(
    lambda col: col.map(lambda x: x.strip() if isinstance(x, str) else x)
)
# Display the last two rows to inspect the result
df_registered_cars.tail(2)

Unnamed: 0,federal_state,battery_electric_vehicle,total_hybrid,hybrid_electric,total,total_fuel_cell_(hydrogen),total_gas,year,other_hybrid
254,Schleswig-Holstein,2,206.0,,79943,,,2008,
255,Thüringen,-,140.0,,66687,,,2008,


In [19]:
# Flag rows that are exact repeats of an earlier row, then report how many there are
duplicates = df_registered_cars.duplicated()
n_duplicates = duplicates.sum()
print('Number of duplicate entries: ', n_duplicates)
print(duplicates)

Number of duplicate entries:  0
0      False
1      False
2      False
3      False
4      False
       ...  
251    False
252    False
253    False
254    False
255    False
Length: 256, dtype: bool


In [20]:
# Displaying the last three rows after the duplicate check
# (no duplicates were found, so no rows were removed)
df_registered_cars.tail(3)

Unnamed: 0,federal_state,battery_electric_vehicle,total_hybrid,hybrid_electric,total,total_fuel_cell_(hydrogen),total_gas,year,other_hybrid
253,Sachsen-Anhalt,-,143.0,,59576,,,2008,
254,Schleswig-Holstein,2,206.0,,79943,,,2008,
255,Thüringen,-,140.0,,66687,,,2008,


In [25]:
# Checking whether any value in an object-dtype column still starts with a space
for col in df_registered_cars.columns:
    column_values = df_registered_cars[col]
    # only object-dtype columns can hold strings; skip everything else
    if column_values.dtype != 'O':
        continue
    has_leading_space = column_values.str.startswith(' ').any()
    if has_leading_space:
        print(f"Column {col} contains values starting with a space.")


In [26]:
# Final check of columns, non-null counts and data types after the cleaning steps
df_registered_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   federal_state               256 non-null    object 
 1   battery_electric_vehicle    256 non-null    object 
 2   total_hybrid                224 non-null    float64
 3   hybrid_electric             112 non-null    float64
 4   total                       256 non-null    int64  
 5   total_fuel_cell_(hydrogen)  48 non-null     object 
 6   total_gas                   48 non-null     float64
 7   year                        256 non-null    int64  
 8   other_hybrid                32 non-null     float64
dtypes: float64(4), int64(2), object(3)
memory usage: 18.1+ KB
