## Data Cleaning

### Approach

- Import and inspect the data sets.
- Clean the data accordingly.
- Create meta-data for all the data sets.

#### Importing Libraries

In [18]:
# Importing python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import seaborn as sns
import requests

#### Downloading CSV file from web

In [4]:
# Download location of the charging-station register CSV (bundesnetzagentur.de).
# The literal is split across lines for readability only; adjacent string
# literals are concatenated into the same single URL.
url = (
    'https://www.bundesnetzagentur.de/SharedDocs/Downloads/DE/Sachgebiete/Energie/'
    'Unternehmen_Institutionen/E_Mobilitaet/Ladesaeulenregister_CSV.csv'
    '?__blob=publicationFile&v=44'
)

In [10]:
# NOTE(review): leftover inspection of the reader's signature; the stored output
# appears to come from an earlier run with this line uncommented - consider removing.
# pd.read_csv

<function pandas.io.parsers.readers.read_csv(filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]', *, sep: 'str | None | lib.NoDefault' = <no_default>, delimiter: 'str | None | lib.NoDefault' = None, header: "int | Sequence[int] | None | Literal['infer']" = 'infer', names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>, index_col: 'IndexLabel | Literal[False] | None' = None, usecols=None, squeeze: 'bool | None' = None, prefix: 'str | lib.NoDefault' = <no_default>, mangle_dupe_cols: 'bool' = True, dtype: 'DtypeArg | None' = None, engine: 'CSVEngine | None' = None, converters=None, true_values=None, false_values=None, skipinitialspace: 'bool' = False, skiprows=None, skipfooter: 'int' = 0, nrows: 'int | None' = None, na_values=None, keep_default_na: 'bool' = True, na_filter: 'bool' = True, verbose: 'bool' = False, skip_blank_lines: 'bool' = True, parse_dates=None, infer_datetime_format: 'bool' = False, keep_date_col: 'bool' = False, date_parser=None, dayf

In [26]:
# Load the register straight from `url` into a dataframe.
# The file is ';'-separated and read as ISO-8859-1; the first 10 lines are
# skipped because they are not needed.
# NOTE(review): latitude, longitude and connecting power come back as `object`
# dtype (see the .info() output further down) - possibly decimal commas;
# confirm whether `decimal=','` should be passed here.
df_charging_stations = pd.read_csv(
    url,
    sep=';',
    encoding='iso8859-1',
    skiprows=10,
)

In [None]:
# Inspect the type of the loaded object (expected: a pandas DataFrame)
type(df_charging_stations)

In [28]:
# Preview the first rows of the dataframe to check its content
df_charging_stations.head()

Unnamed: 0,Betreiber,Straße,Hausnummer,Adresszusatz,Postleitzahl,Ort,Bundesland,Kreis/kreisfreie Stadt,Breitengrad,Längengrad,...,Public Key1,Steckertypen2,P2 [kW],Public Key2,Steckertypen3,P3 [kW],Public Key3,Steckertypen4,P4 [kW],Public Key4
0,Albwerk GmbH & Co. KG,Ennabeurer Weg,0,,72535,Heroldstatt,Baden-Württemberg,Landkreis Alb-Donau-Kreis,48442398,9659075,...,,AC Steckdose Typ 2,22.0,,,,,,,
1,smopi®,Albstraße,14,,72535,Heroldstatt,Baden-Württemberg,Landkreis Alb-Donau-Kreis,48449353,9672201,...,,AC Steckdose Typ 2,22.0,,AC Steckdose Typ 2,22.0,,AC Steckdose Typ 2,22.0,
2,Albwerk GmbH & Co. KG,Parkplatz Campingplatz,0,,72589,Westerheim,Baden-Württemberg,Landkreis Alb-Donau-Kreis,485105,9609,...,,AC Steckdose Typ 2,22.0,,,,,,,
3,EnBW mobility+ AG und Co.KG,Hauptstraße,91c,,73340,Amstetten,Baden-Württemberg,Landkreis Alb-Donau-Kreis,485785342,98748399,...,,"DC Kupplung Combo, DC CHAdeMO",50.0,,,,,,,
4,SWU Energie GmbH,Sterngasse,6,,88481,Balzheim,Baden-Württemberg,Landkreis Alb-Donau-Kreis,48179959,10076977,...,,,,,,,,,,


In [21]:
# Disabled alternative download path, kept as a bare string literal so it does
# not execute (the notebook only echoes the string as the cell output).
# NOTE(review): `path` and `zip_file` are not defined in the visible cells, so
# this snippet would not run as-is - confirm before re-enabling, or remove it.
"""
# Download the database
r = requests.get(url)

# Save database to local file storage
with open(path+zip_file, 'wb') as f:
    f.write(r.content)
"""

"\n# Download the database\nr = requests.get(url)\n\n# Save database to local file storage\nwith open(path+zip_file, 'wb') as f:\n    f.write(r.content)\n"

#### Saving Dataframe as CSV 

In [19]:
# Persist the dataframe as 'charging_stations.csv' (path relative to the
# working directory); the row index is not written to the file.
df_charging_stations.to_csv(
    path_or_buf='charging_stations.csv',
    index=False,
)

### Simple EDA "charging stations"

In [29]:
# Number of rows and columns, returned as a (rows, columns) tuple
df_charging_stations.shape

(40674, 26)

In [30]:
# Per-column non-null counts and dtypes (used to spot missing values and
# columns that were not parsed as numbers)
df_charging_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40674 entries, 0 to 40673
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Betreiber               40674 non-null  object
 1   Straße                  40674 non-null  object
 2   Hausnummer              40674 non-null  object
 3   Adresszusatz            5761 non-null   object
 4   Postleitzahl            40674 non-null  int64 
 5   Ort                     40674 non-null  object
 6   Bundesland              40674 non-null  object
 7   Kreis/kreisfreie Stadt  40674 non-null  object
 8   Breitengrad             40674 non-null  object
 9   Längengrad              40674 non-null  object
 10  Inbetriebnahmedatum     40674 non-null  object
 11  Anschlussleistung       40674 non-null  object
 12  Art der Ladeeinrichung  40673 non-null  object
 13  Anzahl Ladepunkte       40674 non-null  int64 
 14  Steckertypen1           40674 non-null  object
 15  P1

In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataframe's
# numeric columns
df_charging_stations.describe()

Unnamed: 0,Postleitzahl,Anzahl Ladepunkte
count,40674.0,40674.0
mean,55257.259748,1.900108
std,27139.40286,0.574907
min,1067.0,1.0
25%,32469.0,2.0
50%,57336.5,2.0
75%,79780.0,2.0
max,99991.0,4.0


In [32]:
# List the original (German) column names before renaming
df_charging_stations.columns

Index(['Betreiber', 'Straße', 'Hausnummer', 'Adresszusatz', 'Postleitzahl',
       'Ort', 'Bundesland', 'Kreis/kreisfreie Stadt', 'Breitengrad',
       'Längengrad', 'Inbetriebnahmedatum', 'Anschlussleistung',
       'Art der Ladeeinrichung', 'Anzahl Ladepunkte', 'Steckertypen1',
       'P1 [kW]', 'Public Key1', 'Steckertypen2', 'P2 [kW]', 'Public Key2',
       'Steckertypen3', 'P3 [kW]', 'Public Key3', 'Steckertypen4', 'P4 [kW]',
       'Public Key4'],
      dtype='object')

In [33]:
# Translate the German column headers to English, pythonic names
# (lower case, '_' instead of ' ').
COLUMN_TRANSLATIONS = {
    'Betreiber': 'operator',
    'Straße': 'street',
    'Hausnummer': 'house_number',
    'Adresszusatz': 'address_addition',
    'Postleitzahl': 'zip_code',
    'Ort': 'city',
    'Bundesland': 'federal_state',
    'Kreis/kreisfreie Stadt': 'county',
    'Breitengrad': 'latitude',
    'Längengrad': 'longitude',
    'Inbetriebnahmedatum': 'commissioning_date',
    'Anschlussleistung': 'connecting_power',
    'Art der Ladeeinrichung': 'charging_device_type',
    'Anzahl Ladepunkte': 'numbers_charging_points',
    'Steckertypen1': 'plug_types_1',
    'P1 [kW]': 'p1_kw',
    'Public Key1': 'public_key1',
    'Steckertypen2': 'plug_types_2',
    'P2 [kW]': 'p2_kw',
    'Public Key2': 'public_key2',
    'Steckertypen3': 'plug_types_3',
    'P3 [kW]': 'p3_kw',
    'Public Key3': 'public_key3',
    'Steckertypen4': 'plug_types_4',
    'P4 [kW]': 'p4_kw',
    'Public Key4': 'public_key4',
}

# Re-assign instead of mutating in place: the cell stays safe to re-run and
# earlier references to the frame are not changed behind the reader's back.
df_charging_stations = df_charging_stations.rename(columns=COLUMN_TRANSLATIONS)

# rename() silently skips headers that do not match the mapping, so a changed
# header in the source file would otherwise go unnoticed. Fail loudly if any
# column is left without an English name. The check also holds on a re-run and
# after columns have been dropped.
english_names = set(COLUMN_TRANSLATIONS.values())
untranslated = [col for col in df_charging_stations.columns if col not in english_names]
assert not untranslated, f"Columns without an English translation: {untranslated}"

In [35]:
# Verify that the columns now carry the English snake_case names
df_charging_stations.columns

Index(['operator', 'street', 'house_number', 'address_addition', 'zip_code',
       'city', 'federal_state', 'county', 'latitude', 'longitude',
       'commissioning_date', 'connecting_power', 'charging_device_type',
       'numbers_charging_points', 'plug_types_1', 'p1_kw', 'public_key1',
       'plug_types_2', 'p2_kw', 'public_key2', 'plug_types_3', 'p3_kw',
       'public_key3', 'plug_types_4', 'p4_kw', 'public_key4'],
      dtype='object')

In [37]:
# Drop the columns that are not needed for the analysis.
# Re-assigning with errors='ignore' keeps the cell re-runnable: an in-place
# drop raises a KeyError on a second execution because the columns are
# already gone.
UNUSED_COLUMNS = ['address_addition', 'public_key1', 'public_key2', 'public_key3', 'public_key4']
df_charging_stations = df_charging_stations.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [46]:
# Re-run .info() to confirm that the dropped columns are gone
df_charging_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40674 entries, 0 to 40673
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   operator                 40674 non-null  object
 1   street                   40674 non-null  object
 2   house_number             40674 non-null  object
 3   zip_code                 40674 non-null  int64 
 4   city                     40674 non-null  object
 5   federal_state            40674 non-null  object
 6   county                   40674 non-null  object
 7   latitude                 40674 non-null  object
 8   longitude                40674 non-null  object
 9   commissioning_date       40674 non-null  object
 10  connecting_power         40674 non-null  object
 11  charging_device_type     40673 non-null  object
 12  numbers_charging_points  40674 non-null  int64 
 13  plug_types_1             40674 non-null  object
 14  p1_kw                    40674 non-nul

#### Meta Data

|Variable |Definition  | Key|
|--- | --- | ---|
|**operator**| Operator of the charging station ||
|**street**| Street of the charging station ||
|**house_number**| House number ||
|**zip_code**| Zip code ||
|**city**| City ||
|**federal_state**| Federal state ||
|**county**| County and independent city ||
|**latitude**| Latitude coordinate of the charging station ||
|**longitude**| Longitude coordinate of the charging station ||
|**commissioning_date**| Starting date of operation (Jan 2017 - Jan 2023) | 'DD'/'MM'/'YY' |
|**connecting_power**| Power of charging station | Unit = kW |
|**charging_device_type**| Normal/ fast charging device ||
|**numbers_charging_points**| Number of charging points with varied numbers of connectors ("Ladesäule") | 1, 2, 3, 4 |
|**plug_types_1**| Type of connectors |  |
|**p1_kw**| Power output of first charging point | Unit = kW |
|**plug_types_2**| Type of connectors |  |
|**p2_kw**| Power output of second charging point | Unit = kW |
|**plug_types_3**| Type of connectors |  |
|**p3_kw**| Power output of third charging point | Unit = kW |
|**plug_types_4**| Type of connectors |  |
|**p4_kw**| Power output of fourth charging point | Unit = kW |

#### Detailed EDA

In [None]:
# TODO: deal with null values (e.g. charging_device_type has one missing
# entry according to the .info() output above).

In [45]:
# Disabled experiment for listing the distinct plug types, kept as a bare
# string literal so it does not execute (the notebook only echoes the string).
# NOTE(review): DataFrame.explode only expands list-like cells; plug type
# cells look like comma-separated strings in the head() preview, so they would
# presumably need a .str.split(...) first - confirm before re-enabling.
"""
df_2 = df_charging_stations.explode('plug_types_1')
unique = list(df_2['plug_types_1'].unique())
print(unique)
"""

"\ndf_2 = df_charging_stations.explode('plug_types_1')\nunique = list(df_2['plug_types_1'].unique())\nprint(unique)\n"