In [1]:
# import libraries
from pathlib import Path

import pandas as pd

In [2]:
# Input file name; the save cell at the end of the notebook reuses `filename`
# to build the name of the cleaned output file.
filename = "final_data_SE.csv"

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=filename)

# Show the loaded frame as the cell output.
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,PC6,Date,District,ConnectionTimeHours,kWh,power,effective_charging_hrs,...,2019.0_household,2019.0_income_1_to_40_percent,2019.0_income_41_to_80_percent,2019.0_income_81_to_100_percent,2019.0_income_average,2019.0_assets_1_to_40_percent,2019.0_assets_41_to_80_percent,2019.0_assets_81_to_100_percent,2019.0_assets_average,2019.0_Gemiddelde SES WOA totaalscore
0,0,0,1,1011AB,2022-01-03,Centrum,67.59,125.8,5.52,22.789855,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
1,1,66,1,1011AC,2022-01-03,Zuid,0.00,0.0,5.52,0.000000,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
2,2,132,1,1011AD,2022-01-03,Zuid,0.00,0.0,5.52,0.000000,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
3,3,198,1,1011AE,2022-01-03,Zuid,0.00,0.0,5.52,0.000000,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
4,4,264,1,1011AG,2022-01-03,Zuid,0.00,0.0,5.52,0.000000,...,6200.0,45.0,28.0,27.1,49.3,50.8,23.0,26.2,47.6,0.034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1219807,1219807,1219547,66,1109CB,2023-04-03,Zuid,0.00,0.0,5.52,0.000000,...,700.0,34.3,40.7,25.0,55.7,34.6,40.7,24.7,54.9,0.157
1219808,1219808,1219613,66,1109CD,2023-04-03,Zuid,0.00,0.0,5.52,0.000000,...,700.0,34.3,40.7,25.0,55.7,34.6,40.7,24.7,54.9,0.157
1219809,1219809,1219679,66,1109CE,2023-04-03,Zuid,0.00,0.0,5.52,0.000000,...,700.0,34.3,40.7,25.0,55.7,34.6,40.7,24.7,54.9,0.157
1219810,1219810,1219745,66,1109CH,2023-04-03,Zuid,0.00,0.0,5.52,0.000000,...,700.0,34.3,40.7,25.0,55.7,34.6,40.7,24.7,54.9,0.157


In [None]:
# Rename the three leading columns (picked by position, not by name) and then
# discard the first of them.
# NOTE(review): in the frame preview printed after loading, the third column
# ('Unnamed: 0') appears to hold values such as 0, 66, 132 while the column
# called 'index' runs from 1 to 66 - confirm that the third column really is
# the week index before relying on 'WeekIndex'.
# NOTE(review): because columns are selected by position, running this cell a
# second time renames (and drops) different columns; run it once per kernel.
df.rename(
    columns={
        df.columns[0]: 'index_to_drop',
        df.columns[1]: 'RowIndex',
        df.columns[2]: 'WeekIndex',
    },
    inplace=True,
)

# The first column is not used as the index (that role goes to 'RowIndex'), so remove it.
del df['index_to_drop']

In [None]:
# Sanity check: 'RowIndex' must have as many distinct values as the frame has rows.
assert len(df.index) == df['RowIndex'].nunique()

# Cast 'RowIndex' to integer and promote it to the frame's index in one chain.
df = df.astype({'RowIndex': int}).set_index('RowIndex')

In [None]:
# documentation: https://contrib.scikit-learn.org/category_encoders/binary.html
from category_encoders.binary import BinaryEncoder

# Shorten every District value to its first two characters.
df['District'] = df['District'].str.slice(0, 2)

# Print how many distinct District codes remain (the original author noted 11).
print(df['District'].nunique())

# Binary-encode the categorical "District" column: fit the encoder on the
# frame, then replace the frame with its transformed version.
encoder = BinaryEncoder(cols=['District']).fit(df)
df = encoder.transform(df)

In [None]:
# Print the number of distinct 6-character zip codes ("PC6").
print(df['PC6'].nunique())

# Derive "PC4" as the first four characters of the PC6 value (cast to str first)
# and print how many distinct PC4 values that yields.
df['PC4'] = df['PC6'].astype(str).str.slice(stop=4)
print(df['PC4'].nunique())

# Binary-encode the categorical "PC4" column: fit on the frame, then transform it.
encoder = BinaryEncoder(cols=['PC4']).fit(df)
df = encoder.transform(df)

In [None]:
# Display the frame after the District and PC4 encoding steps.
df

In [None]:
# Columns that hold per-provider values; they are summed row-wise below.
# NOTE(review): "Total" is summed together with the individual providers - if
# "Total" is already their sum, 'number_of_chargers' double counts. Confirm.
providers = ["Total", "Vattenfall", "Allego", "Equans", "EvBox", "Nuon", "WDS", "Pitpoint", "Ecotap", "Engie"]

# New column "number_of_chargers": row-wise sum over the provider columns.
df['number_of_chargers'] = df.loc[:, providers].sum(axis="columns")

# Final list: the new column first, followed by the provider columns in reverse order.
providers = ["number_of_chargers"] + providers[::-1]

In [None]:
# Partition the columns into groups: identifiers, known variables, target
# variables, purely descriptive variables and (everything else) unknown variables.
identifiers = ["PC6", "Date"]
know_variables = ["WeekIndex", "MaxPower"]
target_variables = ["kWh", "Blocked_kWh"]
purely_descriptive_variables = [
    "ChargeSocket_ID_count",
    "ConnectionTimeHours",
    "power",
    "effective_charging_hrs",
    "MaxOccupancy",
    "SpareCap_Effective",
    "SpareCap_Occup_kWh",
    "SpareCap_Hrs",
    "Effective%",
    "Occupancy_kwh%",
]

# Every column that belongs to none of the named groups (nor to `providers`)
# counts as an unknown variable; the provider columns are placed in front.
assigned_columns = {
    *identifiers,
    *target_variables,
    *know_variables,
    *providers,
    *purely_descriptive_variables,
}
unknown_variables = providers + [col for col in df.columns if col not in assigned_columns]

# Print the total number of columns across all groups.
print(sum(map(len, (identifiers, target_variables, know_variables, purely_descriptive_variables, unknown_variables))))

In [None]:
# Reorder the columns group by group: identifiers, known variables, target
# variables, unknown variables and finally the purely descriptive variables.
df = df.loc[
    :,
    [
        *identifiers,
        *know_variables,
        *target_variables,
        *unknown_variables,
        *purely_descriptive_variables,
    ],
]

In [None]:
# Show the frame restricted to every group except the purely descriptive variables.
df.loc[:, [*identifiers, *know_variables, *target_variables, *unknown_variables]]

In [None]:
# Missing-value report: count NaNs per column and print only the columns
# whose count is greater than zero.
print(df.isna().sum().loc[lambda counts: counts > 0])

In [None]:
# Parse 'Date' into datetimes, then order the rows by PC6 and, within each PC6, by Date.
df = (
    df
    .assign(Date=pd.to_datetime(df['Date']))
    .sort_values(by=['PC6', 'Date'])
)

In [None]:
# Display the frame after the Date conversion and the PC6/Date sort.
df

In [None]:
# Print the frame's shape as a (rows, columns) tuple.
print(df.shape)

In [None]:
# Optional shortcut (disabled): uncomment to read a previously saved cleaned
# file instead of using the frame built by the cells above.
# df = pd.read_csv(filename.split('.')[0] + '_cleaned.csv')

# Build a combined key by concatenating the PC6 value and the WeekIndex value
# (both cast to str, no separator).
df['PC6_WeekIndex'] = df['PC6'].astype(str) + df['WeekIndex'].astype(str)

# Move the key column to the first position. The new order has to be applied
# to the frame (computing `cols` alone changes nothing), and the column is
# selected by name so that re-running this cell does not rotate other columns.
cols = ['PC6_WeekIndex'] + [col for col in df.columns if col != 'PC6_WeekIndex']
df = df[cols]

# Keep only the first 10000 rows for testing purposes.
# NOTE(review): the save cell writes whatever `df` holds at that point, so the
# "_cleaned" file will contain only these rows - confirm that this truncation
# is intended before producing the final file.
df = df.head(10000)

# Inspect the first rows of the result.
df.head()

In [None]:
# Write the frame to "<input stem>_cleaned.csv" next to the input file.
# Path.stem / Path.with_name only look at the final path component, so a dot
# in a directory name (e.g. "./data/file.csv") cannot truncate the output name
# the way splitting the whole string on '.' would.
source_path = Path(filename)
cleaned_path = source_path.with_name(source_path.stem + '_cleaned.csv')

# index=False: the frame's index is not written to the file.
df.to_csv(cleaned_path, index=False)