In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

#for json convertion
import ast

In [68]:
df = pd.read_csv("charging_sessions.csv")

First, we take a look at the dataset

In [69]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
0,0,5e23b149f9af8b5fe4b973cf,2020-01-02 13:08:54+00:00,2020-01-02 19:11:15+00:00,2020-01-02 17:31:35+00:00,25.016,1_1_179_810_2020-01-02 13:08:53.870034,1,AG-3F30,1-1-179-810,America/Los_Angeles,194.0,"[{'WhPerMile': 250, 'kWhRequested': 25.0, 'mil..."
1,1,5e23b149f9af8b5fe4b973d0,2020-01-02 13:36:50+00:00,2020-01-02 22:38:21+00:00,2020-01-02 20:18:05+00:00,33.097,1_1_193_825_2020-01-02 13:36:49.599853,1,AG-1F01,1-1-193-825,America/Los_Angeles,4275.0,"[{'WhPerMile': 280, 'kWhRequested': 70.0, 'mil..."
2,2,5e23b149f9af8b5fe4b973d1,2020-01-02 13:56:35+00:00,2020-01-03 00:39:22+00:00,2020-01-02 16:35:06+00:00,6.521,1_1_193_829_2020-01-02 13:56:35.214993,1,AG-1F03,1-1-193-829,America/Los_Angeles,344.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
3,3,5e23b149f9af8b5fe4b973d2,2020-01-02 13:59:58+00:00,2020-01-02 16:38:39+00:00,2020-01-02 15:18:45+00:00,2.355,1_1_193_820_2020-01-02 13:59:58.309319,1,AG-1F04,1-1-193-820,America/Los_Angeles,1117.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
4,4,5e23b149f9af8b5fe4b973d3,2020-01-02 14:00:01+00:00,2020-01-02 22:08:40+00:00,2020-01-02 18:17:30+00:00,13.375,1_1_193_819_2020-01-02 14:00:00.779967,1,AG-1F06,1-1-193-819,America/Los_Angeles,334.0,"[{'WhPerMile': 400, 'kWhRequested': 16.0, 'mil..."


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66450 entries, 0 to 66449
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        66450 non-null  int64  
 1   id                66450 non-null  object 
 2   connectionTime    66450 non-null  object 
 3   disconnectTime    66450 non-null  object 
 4   doneChargingTime  62362 non-null  object 
 5   kWhDelivered      66450 non-null  float64
 6   sessionID         66450 non-null  object 
 7   siteID            66450 non-null  int64  
 8   spaceID           66450 non-null  object 
 9   stationID         66450 non-null  object 
 10  timezone          66450 non-null  object 
 11  userID            49187 non-null  float64
 12  userInputs        49187 non-null  object 
dtypes: float64(2), int64(2), object(9)
memory usage: 6.6+ MB


Also checking for missing values within the data

In [71]:
missing_values = df.isnull().sum()
missing_values

Unnamed: 0              0
id                      0
connectionTime          0
disconnectTime          0
doneChargingTime     4088
kWhDelivered            0
sessionID               0
siteID                  0
spaceID                 0
stationID               0
timezone                0
userID              17263
userInputs          17263
dtype: int64

Now we check if all the IDs are actually unique

In [72]:
duplicates = df["Unnamed: 0"].unique()
len(duplicates)

15292

In [73]:
duplicates = df["id"].unique()
len(duplicates)

65037

In [74]:
duplicates = df["sessionID"].unique()
len(duplicates)

65037

The column "Unnamed: 0" ranges from 0 to 15292 and then repeats. Therefore we remove it completly, since its just another unnecessary index, that is even incorrect. Furthermore we replace "id" with a continous integer from 1-66450 to make it easier

In [75]:
df.drop(columns="Unnamed: 0", inplace=True)
df["id"] = range(1, len(df) + 1)
df.head()

Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
0,1,2020-01-02 13:08:54+00:00,2020-01-02 19:11:15+00:00,2020-01-02 17:31:35+00:00,25.016,1_1_179_810_2020-01-02 13:08:53.870034,1,AG-3F30,1-1-179-810,America/Los_Angeles,194.0,"[{'WhPerMile': 250, 'kWhRequested': 25.0, 'mil..."
1,2,2020-01-02 13:36:50+00:00,2020-01-02 22:38:21+00:00,2020-01-02 20:18:05+00:00,33.097,1_1_193_825_2020-01-02 13:36:49.599853,1,AG-1F01,1-1-193-825,America/Los_Angeles,4275.0,"[{'WhPerMile': 280, 'kWhRequested': 70.0, 'mil..."
2,3,2020-01-02 13:56:35+00:00,2020-01-03 00:39:22+00:00,2020-01-02 16:35:06+00:00,6.521,1_1_193_829_2020-01-02 13:56:35.214993,1,AG-1F03,1-1-193-829,America/Los_Angeles,344.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
3,4,2020-01-02 13:59:58+00:00,2020-01-02 16:38:39+00:00,2020-01-02 15:18:45+00:00,2.355,1_1_193_820_2020-01-02 13:59:58.309319,1,AG-1F04,1-1-193-820,America/Los_Angeles,1117.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
4,5,2020-01-02 14:00:01+00:00,2020-01-02 22:08:40+00:00,2020-01-02 18:17:30+00:00,13.375,1_1_193_819_2020-01-02 14:00:00.779967,1,AG-1F06,1-1-193-819,America/Los_Angeles,334.0,"[{'WhPerMile': 400, 'kWhRequested': 16.0, 'mil..."


Transforming the times to datetime so we can actually calculate with those later

In [76]:
df["connectionTime"] = pd.to_datetime(df["connectionTime"])
df["disconnectTime"] = pd.to_datetime(df["disconnectTime"])
df["doneChargingTime"] = pd.to_datetime(df["doneChargingTime"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66450 entries, 0 to 66449
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   id                66450 non-null  int64              
 1   connectionTime    66450 non-null  datetime64[ns, UTC]
 2   disconnectTime    66450 non-null  datetime64[ns, UTC]
 3   doneChargingTime  62362 non-null  datetime64[ns, UTC]
 4   kWhDelivered      66450 non-null  float64            
 5   sessionID         66450 non-null  object             
 6   siteID            66450 non-null  int64              
 7   spaceID           66450 non-null  object             
 8   stationID         66450 non-null  object             
 9   timezone          66450 non-null  object             
 10  userID            49187 non-null  float64            
 11  userInputs        49187 non-null  object             
dtypes: datetime64[ns, UTC](3), float64(2), int64(2), object(5)
m

Since the time data is in GMT, we will convert it to GMT-8, the time in california

In [77]:
# temp = df['connectionTime'].dt.strftime('%Y-%m-%d %H:%M:%S%z')
# temp[0]

In [None]:
dest_timezone = "America/Los_Angeles"
date_columns = ["connectionTime", "disconnectTime", "doneChargingTime"]

for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column])
    df[date_column] = df[date_column].dt.tz_convert(dest_timezone)
    df[date_column] = df[date_column].dt.strftime('%Y-%m-%d %H:%M:%S%z')
    df[date_column] = df[date_column].str.replace(r'[+-]\d{2}\d{2}$', '', regex=True)

df['connectionTime']

yessir
yessir
yessir


0        2020-01-02 05:08:54
1        2020-01-02 05:36:50
2        2020-01-02 05:56:35
3        2020-01-02 05:59:58
4        2020-01-02 06:00:01
                ...         
66445    2019-07-31 11:08:04
66446    2019-07-31 11:40:41
66447    2019-07-31 12:04:40
66448    2019-07-31 12:19:47
66449    2019-07-31 12:21:47
Name: connectionTime, Length: 66450, dtype: object

Check for duplicate entries

In [12]:
same_user_duplicates = df[df.duplicated(subset=['connectionTime', 'stationID', 'siteID', "spaceID"])]
len(same_user_duplicates)

1414

There are 1414 entries where a car simultaneaously started charging at the same station on the same space. Since this is physically not possible, we delete the duplicates

In [13]:
df = df.drop_duplicates(subset=["connectionTime", "spaceID"], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65036 entries, 0 to 65035
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype                              
---  ------            --------------  -----                              
 0   id                65036 non-null  int64                              
 1   connectionTime    65036 non-null  datetime64[ns, America/Los_Angeles]
 2   disconnectTime    65036 non-null  datetime64[ns, America/Los_Angeles]
 3   doneChargingTime  60949 non-null  datetime64[ns, America/Los_Angeles]
 4   kWhDelivered      65036 non-null  float64                            
 5   sessionID         65036 non-null  object                             
 6   siteID            65036 non-null  int64                              
 7   spaceID           65036 non-null  object                             
 8   stationID         65036 non-null  object                             
 9   timezone          65036 non-null  object                     

Checking for invalid data

Possible invalid scenarios are:

    1. connectionTime is after disconnectTime
    2. connectionTime is after doneChargingTime
    4. The same stationID has two overlapping sessions

In [14]:
disAfterCon = df["connectionTime"] > df["disconnectTime"]
disAfterCon.value_counts()

False    65036
Name: count, dtype: int64

In [15]:
disAfterDone = df["connectionTime"] > df["doneChargingTime"]
disAfterDone.value_counts()

False    65009
True        27
Name: count, dtype: int64

Since this is not possible, we remove these 27 invalid entries

In [16]:
df.drop(df[df["connectionTime"] > df["doneChargingTime"]].index, inplace = True)

now checking for overlapping sessions on the same space

In [17]:
df.sort_values(by=["spaceID", "connectionTime"],inplace=True)
df["overlap"] = ((df["connectionTime"].shift(-1) < df["disconnectTime"]) & (df["spaceID"]==df["spaceID"].shift(-1))).fillna(False)
df["overlap"].value_counts()

overlap
False    65009
Name: count, dtype: int64

Seems like no entries are overlapping.

New temporary feature ParkDuration to check for outliers

In [18]:
df["parkDuration"] = df["disconnectTime"] - df["connectionTime"]
df.head()

Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs,overlap,parkDuration
51321,51323,2020-11-18 15:36:26-08:00,2020-11-18 16:02:37-08:00,NaT,4.816,2_39_81_4550_2020-11-18 23:36:26.012461,2,11900388,2-39-81-4550,America/Los_Angeles,7132.0,"[{'WhPerMile': 274, 'kWhRequested': 8.22, 'mil...",False,0 days 00:26:11
51322,51324,2020-11-18 16:35:54-08:00,2020-11-18 17:31:08-08:00,NaT,10.027,2_39_81_4550_2020-11-19 00:35:53.924922,2,11900388,2-39-81-4550,America/Los_Angeles,4903.0,"[{'WhPerMile': 258, 'kWhRequested': 51.6, 'mil...",False,0 days 00:55:14
51323,51325,2020-11-18 17:34:02-08:00,2020-11-18 18:45:14-08:00,NaT,24.486,2_39_81_4550_2020-11-19 01:33:46.845927,2,11900388,2-39-81-4550,America/Los_Angeles,4903.0,"[{'WhPerMile': 258, 'kWhRequested': 51.6, 'mil...",False,0 days 01:11:12
51326,51328,2020-11-18 19:52:00-08:00,2020-11-18 20:00:50-08:00,NaT,4.788,2_39_81_4550_2020-11-19 03:51:59.755295,2,11900388,2-39-81-4550,America/Los_Angeles,1085.0,"[{'WhPerMile': 283, 'kWhRequested': 56.6, 'mil...",False,0 days 00:08:50
51327,51329,2020-11-18 20:24:11-08:00,2020-11-18 21:07:15-08:00,NaT,30.849,2_39_81_4550_2020-11-19 04:24:10.706432,2,11900388,2-39-81-4550,America/Los_Angeles,9284.0,"[{'WhPerMile': 400, 'kWhRequested': 40.0, 'mil...",False,0 days 00:43:04


In [19]:
df["parkDuration"].describe()

count                        65009
mean     0 days 06:16:27.227506960
std      0 days 04:52:55.289136130
min                0 days 00:02:04
25%                0 days 02:49:58
50%                0 days 06:09:34
75%                0 days 09:12:22
max               10 days 05:16:09
Name: parkDuration, dtype: object

Since there is no immensly high value like 3000 days, we are keeping all of the data

Next, we convert the User Inputs into table columns. We start by only keeping the last userInput Json, as this contains the latest user request

In [22]:
df["userInputs"] = df["userInputs"].str.rsplit("{", n=1).str[1]
df["userInputs"] = "[{" + df["userInputs"]

In [23]:
#convert userInputs from string and float to lists

def safe_eval(x):
    try:
        # Check if it's a string and try parsing it
        if isinstance(x, str):
            return ast.literal_eval(x)
        # If it's NaN or None, return an empty list
        elif x is None or (isinstance(x, float) and np.isnan(x)):
            return []
        # If it's already a list, return as is
        elif isinstance(x, list):
            return x
        # If it's some other unexpected object, return as is or log a warning
        else:
            return []  # Default to empty list for unrecognized cases
    except (ValueError, SyntaxError, TypeError):
        # Handle parsing errors safely
        return []

# Apply the safe_eval function to the 'userInputs' column
df['userInputs'] = df['userInputs'].apply(safe_eval)

# Verify the data types after applying the function
print(df['userInputs'].apply(type).unique())

[<class 'list'>]


In [24]:
# Explode and normalize userInputs
exploded_user_inputs = df['userInputs'].explode()

# Normalize the dictionaries into columns
user_inputs_normalized = pd.json_normalize(exploded_user_inputs)

# Restore the original index for merging
user_inputs_normalized['original_index'] = exploded_user_inputs.index

# Merge normalized data back into the original DataFrame
df = df.merge(user_inputs_normalized, left_index=True, right_on='original_index', how='left')

# Drop the auxiliary column
df.drop(columns=['original_index'], inplace=True, errors='ignore')

# Inspect the updated DataFrame
df.head()

Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,...,overlap,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,modifiedAt,paymentRequired,requestedDeparture,userID_y
0,51323,2020-11-18 15:36:26-08:00,2020-11-18 16:02:37-08:00,NaT,4.816,2_39_81_4550_2020-11-18 23:36:26.012461,2,11900388,2-39-81-4550,America/Los_Angeles,...,False,0 days 00:26:11,274.0,8.22,30.0,480.0,"Wed, 18 Nov 2020 23:36:42 GMT",True,"Thu, 19 Nov 2020 07:36:26 GMT",7132.0
1,51324,2020-11-18 16:35:54-08:00,2020-11-18 17:31:08-08:00,NaT,10.027,2_39_81_4550_2020-11-19 00:35:53.924922,2,11900388,2-39-81-4550,America/Los_Angeles,...,False,0 days 00:55:14,258.0,51.6,200.0,576.0,"Thu, 19 Nov 2020 00:36:25 GMT",True,"Thu, 19 Nov 2020 10:11:54 GMT",4903.0
2,51325,2020-11-18 17:34:02-08:00,2020-11-18 18:45:14-08:00,NaT,24.486,2_39_81_4550_2020-11-19 01:33:46.845927,2,11900388,2-39-81-4550,America/Los_Angeles,...,False,0 days 01:11:12,258.0,51.6,200.0,576.0,"Thu, 19 Nov 2020 01:34:02 GMT",True,"Thu, 19 Nov 2020 11:10:02 GMT",4903.0
3,51328,2020-11-18 19:52:00-08:00,2020-11-18 20:00:50-08:00,NaT,4.788,2_39_81_4550_2020-11-19 03:51:59.755295,2,11900388,2-39-81-4550,America/Los_Angeles,...,False,0 days 00:08:50,283.0,56.6,200.0,589.0,"Thu, 19 Nov 2020 03:52:50 GMT",True,"Thu, 19 Nov 2020 13:41:00 GMT",1085.0
4,51329,2020-11-18 20:24:11-08:00,2020-11-18 21:07:15-08:00,NaT,30.849,2_39_81_4550_2020-11-19 04:24:10.706432,2,11900388,2-39-81-4550,America/Los_Angeles,...,False,0 days 00:43:04,400.0,40.0,100.0,30.0,"Thu, 19 Nov 2020 04:24:50 GMT",True,"Thu, 19 Nov 2020 04:54:11 GMT",9284.0


We remove temporary and unnecessary columns for the next tasks.

In [25]:
df = df.drop("sessionID", axis=1)
df = df.drop("timezone", axis=1)
df = df.drop("overlap", axis=1)
df = df.drop("modifiedAt", axis=1)
df = df.drop("userInputs", axis=1)
df = df.drop("userID_y", axis=1)

In [26]:
df

Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,siteID,spaceID,stationID,userID_x,parkDuration,WhPerMile,kWhRequested,milesRequested,minutesAvailable,paymentRequired,requestedDeparture
0,51323,2020-11-18 15:36:26-08:00,2020-11-18 16:02:37-08:00,NaT,4.816,2,11900388,2-39-81-4550,7132.0,0 days 00:26:11,274.0,8.22,30.0,480.0,True,"Thu, 19 Nov 2020 07:36:26 GMT"
1,51324,2020-11-18 16:35:54-08:00,2020-11-18 17:31:08-08:00,NaT,10.027,2,11900388,2-39-81-4550,4903.0,0 days 00:55:14,258.0,51.60,200.0,576.0,True,"Thu, 19 Nov 2020 10:11:54 GMT"
2,51325,2020-11-18 17:34:02-08:00,2020-11-18 18:45:14-08:00,NaT,24.486,2,11900388,2-39-81-4550,4903.0,0 days 01:11:12,258.0,51.60,200.0,576.0,True,"Thu, 19 Nov 2020 11:10:02 GMT"
3,51328,2020-11-18 19:52:00-08:00,2020-11-18 20:00:50-08:00,NaT,4.788,2,11900388,2-39-81-4550,1085.0,0 days 00:08:50,283.0,56.60,200.0,589.0,True,"Thu, 19 Nov 2020 13:41:00 GMT"
4,51329,2020-11-18 20:24:11-08:00,2020-11-18 21:07:15-08:00,NaT,30.849,2,11900388,2-39-81-4550,9284.0,0 days 00:43:04,400.0,40.00,100.0,30.0,True,"Thu, 19 Nov 2020 04:54:11 GMT"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65004,24487,2019-05-13 12:43:12-07:00,2019-05-13 14:54:33-07:00,2019-05-13 13:46:37-07:00,2.308,2,CA-513,2-39-139-567,560.0,0 days 02:11:21,273.0,5.46,20.0,133.0,True,"Mon, 13 May 2019 21:56:12 GMT"
65005,25429,2019-06-13 09:53:42-07:00,2019-06-13 10:35:12-07:00,2019-06-13 10:24:57-07:00,0.908,2,CA-513,2-39-139-567,,0 days 00:41:30,,,,,,
65006,28030,2019-09-16 06:55:08-07:00,2019-09-16 08:34:27-07:00,2019-09-16 07:25:44-07:00,0.893,2,CA-513,2-39-139-567,,0 days 01:39:19,,,,,,
65007,29515,2019-11-06 07:28:18-08:00,2019-11-06 08:41:56-08:00,2019-11-06 07:58:42-08:00,0.900,2,CA-513,2-39-139-567,,0 days 01:13:38,,,,,,


In [27]:
uniquePayment = df["paymentRequired"].unique()
uniquePayment

array([True, nan], dtype=object)

The column paymentRequired is always True or NaN, so it holds no information and we can delete it.

In [56]:
df = df.drop("paymentRequired", axis=1)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65009 entries, 0 to 65008
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype                              
---  ------              --------------  -----                              
 0   id                  65009 non-null  int64                              
 1   connectionTime      65009 non-null  datetime64[ns, America/Los_Angeles]
 2   disconnectTime      65009 non-null  datetime64[ns, America/Los_Angeles]
 3   doneChargingTime    60922 non-null  datetime64[ns, America/Los_Angeles]
 4   kWhDelivered        65009 non-null  float64                            
 5   siteID              65009 non-null  int64                              
 6   spaceID             65009 non-null  object                             
 7   stationID           65009 non-null  object                             
 8   userID_x            47816 non-null  float64                            
 9   parkDuration        65009 non-null  tim

Lastly, we map the weather data to the charging data.

In [58]:
weather = pd.read_csv("cleanWeatherData.csv")
weather["day"] = pd.to_datetime(weather["day"])
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Unnamed: 0     1096 non-null   int64         
 1   day            1096 non-null   datetime64[ns]
 2   temperature    1096 non-null   float64       
 3   cloud_cover    1096 non-null   float64       
 4   precipitation  1096 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 42.9 KB


We need the day from the charging data so that we can compare those and join them with the weather data

In [66]:
df["dayOnly"] = df["connectionTime"].dt.date
df["dayOnly"] = pd.to_datetime(df["dayOnly"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65009 entries, 0 to 65008
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype                              
---  ------              --------------  -----                              
 0   id                  65009 non-null  int64                              
 1   connectionTime      65009 non-null  datetime64[ns, America/Los_Angeles]
 2   disconnectTime      65009 non-null  datetime64[ns, America/Los_Angeles]
 3   doneChargingTime    60922 non-null  datetime64[ns, America/Los_Angeles]
 4   kWhDelivered        65009 non-null  float64                            
 5   siteID              65009 non-null  int64                              
 6   spaceID             65009 non-null  object                             
 7   stationID           65009 non-null  object                             
 8   userID_x            47816 non-null  float64                            
 9   parkDuration        65009 non-null  tim

Now we merge the charging and weather data on each day

In [60]:
merge = pd.merge(df, weather, left_on= "dayOnly", right_on="day", how="left")
merge = merge.drop("day", axis=1)
merge = merge.drop("dayOnly", axis=1)
merge = merge.drop("Unnamed: 0", axis=1)
merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65009 entries, 0 to 65008
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype                              
---  ------              --------------  -----                              
 0   id                  65009 non-null  int64                              
 1   connectionTime      65009 non-null  datetime64[ns, America/Los_Angeles]
 2   disconnectTime      65009 non-null  datetime64[ns, America/Los_Angeles]
 3   doneChargingTime    60922 non-null  datetime64[ns, America/Los_Angeles]
 4   kWhDelivered        65009 non-null  float64                            
 5   siteID              65009 non-null  int64                              
 6   spaceID             65009 non-null  object                             
 7   stationID           65009 non-null  object                             
 8   userID_x            47816 non-null  float64                            
 9   parkDuration        65009 non-null  tim

We create one csv with all of the cleaned data and one with all the doneChargingTime Null Values and rows without weather data removed.

In [61]:
merge.to_csv("cleanChargingDataFull.csv")
merge.dropna(subset=["doneChargingTime"], inplace=True)
merge.dropna(subset=["temperature"], inplace=True)
merge.info()
merge.to_csv("cleanChargingDataFewNull.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 53474 entries, 1149 to 65007
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype                              
---  ------              --------------  -----                              
 0   id                  53474 non-null  int64                              
 1   connectionTime      53474 non-null  datetime64[ns, America/Los_Angeles]
 2   disconnectTime      53474 non-null  datetime64[ns, America/Los_Angeles]
 3   doneChargingTime    53474 non-null  datetime64[ns, America/Los_Angeles]
 4   kWhDelivered        53474 non-null  float64                            
 5   siteID              53474 non-null  int64                              
 6   spaceID             53474 non-null  object                             
 7   stationID           53474 non-null  object                             
 8   userID_x            37605 non-null  float64                            
 9   parkDuration        53474 non-null  timed

Maybe even one with zero null values, so only the ones with user inputs.

In [62]:
merge.dropna(subset=["userID_x"], inplace=True)
merge.to_csv("cleanChargingDataNoNull.csv")

In [63]:
merge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37605 entries, 1151 to 65004
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype                              
---  ------              --------------  -----                              
 0   id                  37605 non-null  int64                              
 1   connectionTime      37605 non-null  datetime64[ns, America/Los_Angeles]
 2   disconnectTime      37605 non-null  datetime64[ns, America/Los_Angeles]
 3   doneChargingTime    37605 non-null  datetime64[ns, America/Los_Angeles]
 4   kWhDelivered        37605 non-null  float64                            
 5   siteID              37605 non-null  int64                              
 6   spaceID             37605 non-null  object                             
 7   stationID           37605 non-null  object                             
 8   userID_x            37605 non-null  float64                            
 9   parkDuration        37605 non-null  timed