# Import all data

In [6]:
import pandas as pd

## PC6 levels

In [7]:
# General PC6-level table; the first CSV column is used as the row index.
data_PC6 = pd.read_csv(
    r'path\to\file\data_PC6.csv',
    sep=',',
    index_col=0,
)

In [8]:
# Land-use table per postcode
data_landuse = pd.read_csv(
    r'path\to\file\data_land_use_by_postcode.csv',
    sep=',',
)

Strandeiland and Buiteneiland are new islands that are still under construction, so nothing has been recorded for them yet except their land use.

In [9]:
# Public transport per postcode: one table for bus, one for tram/metro
data_bus = pd.read_csv(
    r'path\to\file\data_bus_by_postcode.csv',
    sep=',',
)
data_tram_metro = pd.read_csv(
    r'path\to\file\data_public_transport_by_postcode.csv',
    sep=',',
)

In [10]:
# Parking table per postcode
data_parking = pd.read_csv(
    r'path\to\file\data_parking_by_postcode.csv',
    sep=',',
)

In [12]:
from functools import reduce


def _outer_join_on_pc6(left, right):
    """Outer-join two PC6-level frames on their shared 'PC6' column."""
    return pd.merge(left, right, on='PC6', how='outer')


# All PC6-level frames, listed in the order they are joined
level_PC6 = [data_PC6, data_bus, data_tram_metro, data_parking, data_landuse]

# Fold the list into a single frame, one outer join at a time
data_level_PC6 = reduce(_outer_join_on_pc6, level_PC6)

# Missing counts (e.g. introduced by the outer joins) are set to 0
columns_to_fillna = ['bus_count', 'metro_count', 'tram_count', 'parking_count']
data_level_PC6[columns_to_fillna] = data_level_PC6[columns_to_fillna].fillna(0)

In [13]:
# Derive the PC4 key (first four characters of the PC6 code, stored as float)
# so the PC4-level table can be joined on it later.
pc4_prefix = data_level_PC6['PC6'].str.slice(stop=4)
data_level_PC6['PC4'] = pc4_prefix.astype(float)

In [14]:
# Preview the first five rows of the merged PC6-level table
data_level_PC6.head(n=5)

Unnamed: 0,PC6,bus_count,metro_count,tram_count,parking_count,traffic_area,built_up_area,semi_built_up_area,recreational_area,agrarian_area,forest_nature_are,water_body_area,PC4
0,1011AB,0.0,0.0,0.0,0.0,5.5393,5.3554,1.4589,0.0,0.0,0.0,11.9398,1011.0
1,1011AC,0.0,0.0,1.0,0.0,5.5393,5.3554,1.4589,0.0,0.0,0.0,11.9398,1011.0
2,1011AG,0.0,0.0,0.0,0.0,0.4505,4.864,0.0,0.0,0.0,0.0,2.4582,1011.0
3,1011AH,0.0,0.0,0.0,0.0,0.4505,4.864,0.0,0.0,0.0,0.0,2.4582,1011.0
4,1011AJ,0.0,0.0,0.0,0.0,0.4505,4.864,0.0,0.0,0.0,0.0,2.4582,1011.0


## PC4 levels

In [16]:
import numpy as np

# Registered EVs: load the CSV and keep only the PC4 key and the
# 'December 2022_EV' column.
# .copy() turns the two-column selection into an independent frame, so the
# assignment below modifies it directly instead of writing into a slice of the
# full table (which raises SettingWithCopyWarning).
data_ev = pd.read_csv(r'path\to\file\ev_ams_passenger.csv', sep=',')
data_ev = data_ev[['PC4', 'December 2022_EV']].copy()

# Postcodes treated as outliers: their 'December 2022_EV' value is replaced
# with NaN in a single masked assignment.
ev_outlier_pc4 = [1097, 1082, 1101, 1102]
data_ev.loc[data_ev['PC4'].isin(ev_outlier_pc4), 'December 2022_EV'] = np.nan
data_ev.head()

Unnamed: 0,PC4,December 2022_EV
0,1011,367.0
1,1012,79.0
2,1013,228.0
3,1014,363.0
4,1015,116.0


In [17]:
# Registered hybrid vehicles: load the CSV and keep only the PC4 key and the
# 'December 2022_hybrid' column.
# .copy() turns the two-column selection into an independent frame, so the
# assignment below modifies it directly instead of writing into a slice of the
# full table (which raises SettingWithCopyWarning).
data_hybrid = pd.read_csv(r'path\to\file\hybird_ams_passenger.csv', sep=',')
data_hybrid = data_hybrid[['PC4', 'December 2022_hybrid']].copy()

# Postcodes treated as outliers: their 'December 2022_hybrid' value is replaced
# with NaN in a single masked assignment.
hybrid_outlier_pc4 = [1012, 1071, 1077, 1102]
data_hybrid.loc[data_hybrid['PC4'].isin(hybrid_outlier_pc4), 'December 2022_hybrid'] = np.nan
data_hybrid.head()

Unnamed: 0,PC4,December 2022_hybrid
0,1011,187.0
1,1012,
2,1013,154.0
3,1014,82.0
4,1015,109.0


In [18]:
# Population by age band: load the CSV and keep the PC4 key plus the five
# 2022 age-band columns listed below.
population_columns = [
    'PC4',
    '2022_0_to_10',
    '2022_10_to_20',
    '2022_20_to_30',
    '2022_30_to_40',
    '2022_40_to_50',
]
data_population_age = pd.read_csv(
    r'path\to\file\data_population_age.csv',
    sep=',',
)[population_columns]
data_population_age.head(n=5)

Unnamed: 0,PC4,2022_0_to_10,2022_10_to_20,2022_20_to_30,2022_30_to_40,2022_40_to_50
0,1011,475.0,540.0,2065.0,1645.0,1080.0
1,1012,280.0,330.0,3120.0,2045.0,1010.0
2,1013,1600.0,1670.0,3805.0,4000.0,2920.0
3,1014,625.0,245.0,395.0,945.0,610.0
4,1015,815.0,925.0,2920.0,2630.0,1825.0


In [19]:
# Households: load the CSV and keep the PC4 key plus the '2019.0_household' column
household_columns = ['PC4', '2019.0_household']
data_household = pd.read_csv(
    r'path\to\file\data_household.csv',
    sep=',',
)[household_columns]
data_household.head(n=5)

Unnamed: 0,PC4,2019.0_household
0,1011.0,6200
1,1012.0,5700
2,1013.0,12300
3,1014.0,900
4,1015.0,9400


In [20]:
# Income: load the CSV and keep the PC4 key plus the four 2019 income columns
income_columns = [
    'PC4',
    '2019.0_income_1_to_40_percent',
    '2019.0_income_41_to_80_percent',
    '2019.0_income_81_to_100_percent',
    '2019.0_income_average',
]
data_income = pd.read_csv(
    r'path\to\file\data_income_by_postcode.csv',
    sep=',',
)[income_columns]
data_income.head(n=5)

Unnamed: 0,PC4,2019.0_income_1_to_40_percent,2019.0_income_41_to_80_percent,2019.0_income_81_to_100_percent,2019.0_income_average
0,1011.0,45.0,28.0,27.1,49.3
1,1012.0,47.1,29.8,23.1,46.2
2,1013.0,46.4,29.4,24.2,48.1
3,1014.0,24.8,22.7,52.5,66.6
4,1015.0,44.4,27.9,27.7,50.1


In [21]:
# Assets: load the CSV and keep the PC4 key plus the four 2019 asset columns
assets_columns = [
    'PC4',
    '2019.0_assets_1_to_40_percent',
    '2019.0_assets_41_to_80_percent',
    '2019.0_assets_81_to_100_percent',
    '2019.0_assets_average',
]
data_assets = pd.read_csv(
    r'path\to\file\data_asset_by_postcode.csv',
    sep=',',
)[assets_columns]
data_assets.head(n=5)

Unnamed: 0,PC4,2019.0_assets_1_to_40_percent,2019.0_assets_41_to_80_percent,2019.0_assets_81_to_100_percent,2019.0_assets_average
0,1011.0,50.8,23.0,26.2,47.6
1,1012.0,63.1,20.5,16.4,38.2
2,1013.0,54.7,25.2,20.1,44.2
3,1014.0,46.7,27.4,25.9,49.2
4,1015.0,50.2,23.3,26.5,48.3


In [22]:
# SES: load the CSV and keep the PC4 key plus the 2019 average SES-WOA total score
ses_columns = ['PC4', '2019.0_Gemiddelde SES WOA totaalscore']
data_ses = pd.read_csv(
    r'path\to\file\data_ses_woa_by_postcode.csv',
    sep=',',
)[ses_columns]
data_ses.head(n=5)

Unnamed: 0,PC4,2019.0_Gemiddelde SES WOA totaalscore
0,1011.0,0.034
1,1012.0,0.01
2,1013.0,-0.033
3,1014.0,0.321
4,1015.0,0.038


In [23]:
# Define list of all DataFrames of level PC4
level_PC4 = [data_ev, data_hybrid, data_population_age, data_household, data_income, data_assets, data_ses]

# Merge all DataFrames into one, outer-joining on PC4 so that every postcode
# present in any of the tables is kept
data_level_PC4 = reduce(
    lambda left, right: pd.merge(left, right, on='PC4', how='outer'),
    level_PC4,
)

# Fill NAs with 0 in every column except the key and the two vehicle-count
# columns. The excluded columns are named explicitly (rather than skipped by
# position) so the fill stays correct if the order of `level_PC4` changes.
# The vehicle counts keep their NaNs, including the outlier values blanked
# when those tables were loaded.
pc4_columns_left_missing = ['PC4', 'December 2022_EV', 'December 2022_hybrid']
pc4_columns_to_fillna = [
    column for column in data_level_PC4.columns
    if column not in pc4_columns_left_missing
]
data_level_PC4[pc4_columns_to_fillna] = data_level_PC4[pc4_columns_to_fillna].fillna(0)

In [24]:
# Preview the first five rows of the merged PC4-level table
data_level_PC4.head(n=5)

Unnamed: 0,PC4,December 2022_EV,December 2022_hybrid,2022_0_to_10,2022_10_to_20,2022_20_to_30,2022_30_to_40,2022_40_to_50,2019.0_household,2019.0_income_1_to_40_percent,2019.0_income_41_to_80_percent,2019.0_income_81_to_100_percent,2019.0_income_average,2019.0_assets_1_to_40_percent,2019.0_assets_41_to_80_percent,2019.0_assets_81_to_100_percent,2019.0_assets_average,2019.0_Gemiddelde SES WOA totaalscore
0,1011,367.0,187.0,475.0,540.0,2065.0,1645.0,1080.0,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
1,1012,79.0,,280.0,330.0,3120.0,2045.0,1010.0,5700.0,47.1,29.8,23.1,46.2,63.1,20.5,16.4,38.2,0.01
2,1013,228.0,154.0,1600.0,1670.0,3805.0,4000.0,2920.0,12300.0,46.4,29.4,24.2,48.1,54.7,25.2,20.1,44.2,-0.033
3,1014,363.0,82.0,625.0,245.0,395.0,945.0,610.0,900.0,24.8,22.7,52.5,66.6,46.7,27.4,25.9,49.2,0.321
4,1015,116.0,109.0,815.0,925.0,2920.0,2630.0,1825.0,9400.0,44.4,27.9,27.7,50.1,50.2,23.3,26.5,48.3,0.038


# Merge PC4 and PC6

In [26]:
# Merging all the independent variables
data_independent = data_level_PC6.merge(data_level_PC4, on='PC4', how='left').drop(columns='PC4')

In [27]:
# Impute the missing values and anomalies (e.g. the outlier vehicle counts that were set to NaN above).
# We can do this before merging with the target variables, as these independent features have no time variance.
# We decided to choose the most recent value for all features, as we only have data of the target variables for 2022 and 2023.
import miceforest as mf

# Convert the PC6 column from object to categorical dtype.
# NOTE(review): PC6 is an identifier; as a category column it is passed to the kernel together with the features — confirm this is intended.
data_independent['PC6'] = data_independent['PC6'].astype('category')


# Create the imputation kernel on the full independent-variable table.
# random_state is fixed at 1991, presumably so the imputation is reproducible.
kds = mf.ImputationKernel(
  data_independent,
  save_all_iterations=True,
  random_state=1991
)

# Run the MICE algorithm for 5 iterations
kds.mice(5)

# Return the completed dataset
data_independent_complete = kds.complete_data()

  warn(


In [28]:
# Preview the first five rows of the imputed independent-variable table
data_independent_complete.head(n=5)

Unnamed: 0,PC6,bus_count,metro_count,tram_count,parking_count,traffic_area,built_up_area,semi_built_up_area,recreational_area,agrarian_area,...,2019.0_household,2019.0_income_1_to_40_percent,2019.0_income_41_to_80_percent,2019.0_income_81_to_100_percent,2019.0_income_average,2019.0_assets_1_to_40_percent,2019.0_assets_41_to_80_percent,2019.0_assets_81_to_100_percent,2019.0_assets_average,2019.0_Gemiddelde SES WOA totaalscore
0,1011AB,0.0,0.0,0.0,0.0,5.5393,5.3554,1.4589,0.0,0.0,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
1,1011AC,0.0,0.0,1.0,0.0,5.5393,5.3554,1.4589,0.0,0.0,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
2,1011AG,0.0,0.0,0.0,0.0,0.4505,4.864,0.0,0.0,0.0,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
3,1011AH,0.0,0.0,0.0,0.0,0.4505,4.864,0.0,0.0,0.0,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
4,1011AJ,0.0,0.0,0.0,0.0,0.4505,4.864,0.0,0.0,0.0,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034


In [146]:
#data_independent_complete.to_csv('data_independent_complete.csv', index=True)

# Merge with the Session Data

In [29]:
# Read in the session data for target variables
# NOTE(review): the recorded output shows a warning raised at this call,
# possibly about mixed column dtypes — confirm and consider passing dtype=.
data_session = pd.read_csv(
    r'path\to\file\df_weekly_22_23.csv',
    sep=',',
)

  data_session = pd.read_csv(r'C:\Users\vongu\UM\Master\Smart Service Project\Registered EVs\df_weekly_22_23.csv', sep=',')


In [30]:
# Format the columns
data_session = data_session.drop(columns=['City']).rename(columns={'zipcode':'PC6'})
data_session['PC6'] = data_session['PC6'].str.replace(' ', '')

In [31]:
# Convert 'Date' column to datetime
data_session['Date'] = pd.to_datetime(data_session['Date'])

# Order the data by date and postcode
data_session = data_session.sort_values(by=['Date', 'PC6'], ascending=[True, True])

In [32]:
# Preview the first five rows of the formatted session data
data_session.head(n=5)

Unnamed: 0.1,Unnamed: 0,PC6,Date,District,ConnectionTimeHours,kWh,effective_charging_hrs,power,MaxOccupancy,MaxPower,...,effective_charging_hrs20-21,effective_charging_hrs21-22,effective_charging_hrs22-23,effective_charging_hrs23-24,SpareCap_Effective,SpareCap_Occup_kWh,SpareCap_Hrs,Effective%,Occupancy_kwh%,index
0,0,1011AB,2022-01-03,Centrum,67.59,125.8,22.789855,33.12,288.0,1589.76,...,1.533333,2.0,2.0,1.966667,1463.96,-648.8208,220.41,0.079131,1.408125,1
66,66,1011AC,2022-01-03,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
18084,18084,1011AD,2022-01-03,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
18150,18150,1011AE,2022-01-03,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
132,132,1011AG,2022-01-03,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


## Final Data

In [33]:
# Merge the target variable with the independent variables
# Inner join (the pandas default, spelled out here): only postcodes present in
# both the session data and the imputed independent variables are kept.
final_data = pd.merge(
    data_session,
    data_independent_complete,
    on='PC6',
    how='inner',
)

In [34]:
# Order the data by date and postcode
# Sort chronologically and, within each date, by postcode (both ascending)
sort_keys = ['Date', 'PC6']
final_data = final_data.sort_values(by=sort_keys, ascending=True)

In [36]:
# Preview the first five rows of the final modelling table
final_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,PC6,Date,District,ConnectionTimeHours,kWh,effective_charging_hrs,power,MaxOccupancy,MaxPower,...,2019.0_household,2019.0_income_1_to_40_percent,2019.0_income_41_to_80_percent,2019.0_income_81_to_100_percent,2019.0_income_average,2019.0_assets_1_to_40_percent,2019.0_assets_41_to_80_percent,2019.0_assets_81_to_100_percent,2019.0_assets_average,2019.0_Gemiddelde SES WOA totaalscore
0,0,1011AB,2022-01-03,Centrum,67.59,125.8,22.789855,33.12,288.0,1589.76,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
66,66,1011AC,2022-01-03,0,0.0,0.0,0.0,0.0,0.0,0.0,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
132,18084,1011AD,2022-01-03,0,0.0,0.0,0.0,0.0,0.0,0.0,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
198,18150,1011AE,2022-01-03,0,0.0,0.0,0.0,0.0,0.0,0.0,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
264,132,1011AG,2022-01-03,0,0.0,0.0,0.0,0.0,0.0,0.0,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034


In [33]:
#final_data.to_csv('final_data.csv', index=True)