In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# for JSON conversion
import ast

In [2]:
# Read the cleaned charging sessions and the weather file into two frames
df_NoNull, df_Weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../cleanData/cleanChargingDataNoNull.csv',
        '../1_data preparation/weather_burbank_airport.csv',
    )
)

In [3]:
# Sanity check: show the first rows of each frame to confirm the load worked
for loaded_frame in (df_NoNull, df_Weather):
    print(loaded_frame.head())

   Unnamed: 0     id       connectionTime       disconnectTime  \
0        1151  52943  2018-10-09 15:26:40  2018-10-09 19:48:12   
1        1156  53111  2018-10-12 10:07:47  2018-10-12 15:58:22   
2        1157  53138  2018-10-15 06:46:28  2018-10-15 17:43:30   
3        1158  53213  2018-10-16 07:13:50  2018-10-16 15:03:42   
4        1159  53258  2018-10-16 15:29:34  2018-10-16 20:30:34   

      doneChargingTime  kWhDelivered  siteID  spaceID    stationID  userID_x  \
0  2018-10-09 19:28:29        12.034       1  AG-1F01  1-1-193-825     383.0   
1  2018-10-12 14:46:28        14.289       1  AG-1F01  1-1-193-825     335.0   
2  2018-10-15 09:03:25         6.457       1  AG-1F01  1-1-193-825     365.0   
3  2018-10-16 10:11:45         7.416       1  AG-1F01  1-1-193-825     374.0   
4  2018-10-16 18:33:31         9.909       1  AG-1F01  1-1-193-825     467.0   

      parkDuration  WhPerMile  kWhRequested  milesRequested  minutesAvailable  \
0  0 days 04:21:32      400.0        15.2

## Feature Calculation for the original data set

1. Session Duration (minutes): Time between connectionTime and disconnectTime.
2. Charging Duration (minutes): Time between connectionTime and doneChargingTime.
3. Idle Duration (minutes): Time between doneChargingTime and disconnectTime.
4. Charging Efficiency (kWh per hour, i.e. average charging power in kW): kWhDelivered / Charging Duration in hours.
5. User Inputs: Extract fields from the userInputs JSON object (e.g., kWhRequested, minutesAvailable).

In [4]:
# The CSV stores timestamps as text, so parse them before doing any time arithmetic
for ts_col in ('connectionTime', 'disconnectTime', 'doneChargingTime'):
    df_NoNull[ts_col] = pd.to_datetime(df_NoNull[ts_col])

connected_at = df_NoNull['connectionTime']
charged_at = df_NoNull['doneChargingTime']
unplugged_at = df_NoNull['disconnectTime']

# All three durations are expressed in minutes
df_NoNull['session_duration'] = (unplugged_at - connected_at).dt.total_seconds() / 60
df_NoNull['charging_duration'] = (charged_at - connected_at).dt.total_seconds() / 60
df_NoNull['idle_duration'] = (unplugged_at - charged_at).dt.total_seconds() / 60

# Energy delivered per hour of charging: the minutes are converted to hours first
df_NoNull['charging_efficiency'] = df_NoNull['kWhDelivered'] / (df_NoNull['charging_duration'] / 60)

In [5]:
# Columns that make up the feature set for the combined data
feature_columns = [
    'session_duration',
    'charging_duration',
    'idle_duration',
    'charging_efficiency',
    'kWhDelivered',
]
features_all = df_NoNull[feature_columns]

print(features_all.head())

   session_duration  charging_duration  idle_duration  charging_efficiency  \
0        261.533333         241.816667      19.716667             2.985898   
1        350.583333         278.683333      71.900000             3.076395   
2        657.033333         136.950000     520.083333             2.828916   
3        469.866667         177.916667     291.950000             2.500946   
4        301.000000         183.950000     117.050000             3.232074   

   kWhDelivered  
0        12.034  
1        14.289  
2         6.457  
3         7.416  
4         9.909  


## Cleaning up the data in "features_all"

In [6]:
# Count missing values per feature column
print(features_all.isna().sum())
# Count infinite values per feature column
print(np.isinf(features_all).sum())

session_duration       0
charging_duration      0
idle_duration          0
charging_efficiency    0
kWhDelivered           0
dtype: int64
session_duration       0
charging_duration      0
idle_duration          0
charging_efficiency    0
kWhDelivered           0
dtype: int64


# Splitting up the two parking sites into new files

In [7]:
# One frame per garage, selected by siteID
site1_CSV = df_NoNull.loc[df_NoNull['siteID'] == 1]
site2_CSV = df_NoNull.loc[df_NoNull['siteID'] == 2]

# Write each garage to its own CSV file, leaving out the row index
for garage_frame, garage_file in (
    (site1_CSV, 'GarageA_data.csv'),
    (site2_CSV, 'GarageB_data.csv'),
):
    garage_frame.to_csv(garage_file, index=False)

print("CSV files created: 'GarageA_data.csv' and 'GarageB_data.csv'")

CSV files created: 'GarageA_data.csv' and 'GarageB_data.csv'


In [8]:
# Read the per-garage files back in as separate frames
site1, site2 = (
    pd.read_csv(garage_file)
    for garage_file in ('GarageA_data.csv', 'GarageB_data.csv')
)

## Feature Calculation for Site 1

In [9]:
# read_csv returns the timestamps as strings; convert them before subtracting
for site1_time_col in ('connectionTime', 'disconnectTime', 'doneChargingTime'):
    site1[site1_time_col] = pd.to_datetime(site1[site1_time_col])

# Raw time spans as timedeltas
site1_session_span = site1['disconnectTime'] - site1['connectionTime']
site1_charging_span = site1['doneChargingTime'] - site1['connectionTime']
site1_idle_span = site1['disconnectTime'] - site1['doneChargingTime']

# Durations in minutes
site1['session_duration'] = site1_session_span.dt.total_seconds() / 60
site1['charging_duration'] = site1_charging_span.dt.total_seconds() / 60
site1['idle_duration'] = site1_idle_span.dt.total_seconds() / 60

# kWh delivered per hour of charging
site1_charging_hours = site1['charging_duration'] / 60
site1['charging_efficiency'] = site1['kWhDelivered'] / site1_charging_hours

In [10]:
# Feature subset for site 1
feature_columns_A = [
    'session_duration',
    'charging_duration',
    'idle_duration',
    'charging_efficiency',
    'kWhDelivered',
]
featuresA = site1[feature_columns_A]

print(featuresA.head())
# Missing values per column
print(featuresA.isna().sum())
# Infinite values per column
print(np.isinf(featuresA).sum())

   session_duration  charging_duration  idle_duration  charging_efficiency  \
0        261.533333         241.816667      19.716667             2.985898   
1        350.583333         278.683333      71.900000             3.076395   
2        657.033333         136.950000     520.083333             2.828916   
3        469.866667         177.916667     291.950000             2.500946   
4        301.000000         183.950000     117.050000             3.232074   

   kWhDelivered  
0        12.034  
1        14.289  
2         6.457  
3         7.416  
4         9.909  
session_duration       0
charging_duration      0
idle_duration          0
charging_efficiency    0
kWhDelivered           0
dtype: int64
session_duration       0
charging_duration      0
idle_duration          0
charging_efficiency    0
kWhDelivered           0
dtype: int64


## Feature Calculation for Site 2

In [11]:
# read_csv returns the timestamps as strings; convert them before subtracting
for site2_time_col in ('connectionTime', 'disconnectTime', 'doneChargingTime'):
    site2[site2_time_col] = pd.to_datetime(site2[site2_time_col])

site2_connected = site2['connectionTime']
site2_charged = site2['doneChargingTime']
site2_unplugged = site2['disconnectTime']

# Durations in minutes
site2['session_duration'] = (site2_unplugged - site2_connected).dt.total_seconds() / 60
site2['charging_duration'] = (site2_charged - site2_connected).dt.total_seconds() / 60
site2['idle_duration'] = (site2_unplugged - site2_charged).dt.total_seconds() / 60

# kWh delivered per hour of charging (minutes converted to hours)
site2['charging_efficiency'] = site2['kWhDelivered'] / (site2['charging_duration'] / 60)

In [12]:
# Feature subset for site 2
feature_columns_B = [
    'session_duration',
    'charging_duration',
    'idle_duration',
    'charging_efficiency',
    'kWhDelivered',
]
featuresB = site2[feature_columns_B]

print(featuresB.head())
# Missing values per column
print(featuresB.isna().sum())
# Infinite values per column
print(np.isinf(featuresB).sum())

   session_duration  charging_duration  idle_duration  charging_efficiency  \
0        161.633333         150.000000      11.633333             3.387600   
1        236.900000          96.733333     140.166667             3.084562   
2        210.233333          91.083333     119.150000             3.099360   
3        316.566667         226.766667      89.800000             3.209202   
4        126.966667         126.883333       0.083333             3.351740   

   kWhDelivered  
0         8.469  
1         4.973  
2         4.705  
3        12.129  
4         7.088  
session_duration       0
charging_duration      0
idle_duration          0
charging_efficiency    0
kWhDelivered           0
dtype: int64
session_duration       0
charging_duration      0
idle_duration          0
charging_efficiency    0
kWhDelivered           0
dtype: int64


# Removing the Outliers

## Outlier-Removal for the original data set

In [13]:
# Step 1: IQR filtering - drop every row with at least one feature outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
Q1 = features_all.quantile(0.25)
Q3 = features_all.quantile(0.75)
IQR = Q3 - Q1
below_iqr_fence = features_all < (Q1 - 1.5 * IQR)
above_iqr_fence = features_all > (Q3 + 1.5 * IQR)
iqr_outlier_rows = (below_iqr_fence | above_iqr_fence).any(axis=1)
df_filtered = features_all.loc[~iqr_outlier_rows]

# Step 2: percentile-based filtering (final check) - drop rows with any feature
# below the 1st or above the 99th percentile of the IQR-filtered data
lower_bound = df_filtered.quantile(0.01)
upper_bound = df_filtered.quantile(0.99)
percentile_outlier_rows = ((df_filtered < lower_bound) | (df_filtered > upper_bound)).any(axis=1)
df_filtered = df_filtered.loc[~percentile_outlier_rows]

# Save the cleaned dataset without the row index
df_filtered.to_csv("features_all_no_outliers.csv", index=False)

Checking if removal was successful

In [14]:
# Print summary statistics before and after outlier removal
for heading, summary_source in (
    ("Original Data:", features_all),
    ("\nFiltered Data:", df_filtered),
):
    print(heading)
    print(summary_source.describe())


Original Data:
       session_duration  charging_duration  idle_duration  \
count      37605.000000       37605.000000   37605.000000   
mean         412.859240         225.500994     187.358246   
std          257.857414         173.788328     216.905684   
min            5.116667           5.050000     -59.933333   
25%          225.733333         112.900000       4.466667   
50%          452.266667         187.500000     139.250000   
75%          570.816667         298.833333     325.250000   
max        12859.316667       12000.950000    9367.283333   

       charging_efficiency  kWhDelivered  
count         37605.000000  37605.000000  
mean              3.747800     13.429647  
std               1.797052     10.910784  
min               0.006710      0.503000  
25%               2.509527      6.005000  
50%               3.242888     10.475000  
75%               5.328567     16.055000  
max              93.740032     75.528000  

Filtered Data:
       session_duration  chargin

## Outlier Removal Site 1

In [15]:
# Step 1: IQR filtering - a row is dropped if any feature falls outside the fences
Q1 = featuresA.quantile(0.25)
Q3 = featuresA.quantile(0.75)
IQR = Q3 - Q1
iqr_floor_A = Q1 - 1.5 * IQR
iqr_ceiling_A = Q3 + 1.5 * IQR
outside_iqr_A = (featuresA < iqr_floor_A) | (featuresA > iqr_ceiling_A)
df_filteredA = featuresA[~outside_iqr_A.any(axis=1)]

# Step 2: percentile-based filtering (final check) on the IQR-filtered rows
lower_bound = df_filteredA.quantile(0.01)
upper_bound = df_filteredA.quantile(0.99)
outside_percentiles_A = (df_filteredA < lower_bound) | (df_filteredA > upper_bound)
df_filteredA = df_filteredA[~outside_percentiles_A.any(axis=1)]

# Save the cleaned dataset without the row index
df_filteredA.to_csv("featuresA_no_outliers.csv", index=False)

Checking if removal was successful

In [16]:
# Print summary statistics for site 1 before and after outlier removal
for heading_A, summary_source_A in (
    ("Original Data:", featuresA),
    ("\nFiltered Data:", df_filteredA),
):
    print(heading_A)
    print(summary_source_A.describe())

Original Data:
       session_duration  charging_duration  idle_duration  \
count      24787.000000       24787.000000   24787.000000   
mean         426.730139         249.627974     177.102166   
std          195.802306         145.088098     170.999296   
min            5.983333           6.933333      -1.000000   
25%          253.525000         138.591667       6.433333   
50%          495.400000         220.700000     140.833333   
75%          581.850000         324.891667     312.416667   
max         3248.866667        1516.833333    3003.316667   

       charging_efficiency  kWhDelivered  
count         24787.000000  24787.000000  
mean              3.663764     14.855866  
std               1.739099     11.254901  
min               0.059558      0.503000  
25%               2.432975      6.959500  
50%               3.165454     11.746000  
75%               5.181552     18.402000  
max              93.740032     68.609000  

Filtered Data:
       session_duration  chargin

## Outlier Removal Site 2

In [17]:
# Step 1: IQR filtering - a row is dropped if any feature falls outside the fences
Q1 = featuresB.quantile(0.25)
Q3 = featuresB.quantile(0.75)
IQR = Q3 - Q1
iqr_floor_B = Q1 - 1.5 * IQR
iqr_ceiling_B = Q3 + 1.5 * IQR
outside_iqr_B = (featuresB < iqr_floor_B) | (featuresB > iqr_ceiling_B)
df_filteredB = featuresB[~outside_iqr_B.any(axis=1)]

# Step 2: percentile-based filtering (final check) on the IQR-filtered rows
lower_bound = df_filteredB.quantile(0.01)
upper_bound = df_filteredB.quantile(0.99)
outside_percentiles_B = (df_filteredB < lower_bound) | (df_filteredB > upper_bound)
df_filteredB = df_filteredB[~outside_percentiles_B.any(axis=1)]

# Save the cleaned dataset without the row index
df_filteredB.to_csv("featuresB_no_outliers.csv", index=False)

Checking if removal was successful

In [18]:
# Print summary statistics for site 2 before and after outlier removal
for heading_B, summary_source_B in (
    ("Original Data:", featuresB),
    ("\nFiltered Data:", df_filteredB),
):
    print(heading_B)
    print(summary_source_B.describe())

Original Data:
       session_duration  charging_duration  idle_duration  \
count      12818.000000       12818.000000   12818.000000   
mean         386.036181         178.845085     207.191096   
std          346.186695         211.187571     284.414159   
min            5.116667           5.050000     -59.933333   
25%          170.766667          78.800000       2.250000   
50%          382.233333         123.516667     136.308333   
75%          529.366667         228.545833     350.850000   
max        12859.316667       12000.950000    9367.283333   

       charging_efficiency  kWhDelivered  
count         12818.000000  12818.000000  
mean              3.910307     10.671675  
std               1.893642      9.631365  
min               0.006710      0.510000  
25%               2.718440      4.382250  
50%               3.324363      7.735500  
75%               5.760983     13.551000  
max              59.172100     75.528000  

Filtered Data:
       session_duration  chargin

# Normalizing the data for clustering

In [20]:
# Read the outlier-free feature files: combined data, then site 1, then site 2
both_sites_new, site1_new, site2_new = (
    pd.read_csv(filtered_file)
    for filtered_file in (
        'features_all_no_outliers.csv',
        'featuresA_no_outliers.csv',
        'featuresB_no_outliers.csv',
    )
)

In [22]:
# Standardize each feature set and save the result for clustering.
#
# `scaler` is created here because no earlier cell in the notebook defines it;
# without this line the cell raises NameError on a fresh kernel (Restart & Run All).
# StandardScaler is imported in the notebook's import cell.
scaler = StandardScaler()

# fit_transform re-fits the scaler on every call, so each dataset below is
# standardized with its own statistics even though one scaler object is reused.

# Original dataset
scaled_features = scaler.fit_transform(both_sites_new)
scaled_features_df = pd.DataFrame(scaled_features, columns=both_sites_new.columns)
scaled_features_df.to_csv("scaled_features_all.csv", index=False)

# Parking site A
scaled_featuresA = scaler.fit_transform(site1_new)
scaled_featuresA_df = pd.DataFrame(scaled_featuresA, columns=site1_new.columns)
scaled_featuresA_df.to_csv("scaled_featuresA.csv", index=False)

# Parking site B
scaled_featuresB = scaler.fit_transform(site2_new)
scaled_featuresB_df = pd.DataFrame(scaled_featuresB, columns=site2_new.columns)
scaled_featuresB_df.to_csv("scaled_featuresB.csv", index=False)