# Analytics & Applications WS23/24 Project

The following notebook applies the steps of the CRISP-DM model to the
project exercise from "Analytics and Applications" by Prof. Ketter in WS23/24 at the University of Cologne.

The following steps will be done in the specified order:
1. Data Preparation
2. Modeling 
3. Evaluation

## Imports & Dependencies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## Import Data

In [None]:
# TODO: build these locations from a path object instead of plain strings
sessions_csv = "data/charging_sessions.csv"
weather_csv = "data/weather_modified.csv"

# Raw charging sessions and the accompanying weather observations.
df = pd.read_csv(sessions_csv)
df_weather = pd.read_csv(weather_csv)

## Data Exploration and Preparation

### Basic Data Exploration

In [None]:
# Preview the first rows. A bare last expression gets the notebook's rich
# table rendering instead of the plain-text dump produced by print().
df.head()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Summary statistics of the numeric columns, shown through the notebook's
# rich table display rather than a plain-text print().
df.describe()

### Set Data Types

In [None]:
# Set data types
# Parse the three session timestamps into pandas datetimes.
for time_col in ('connectionTime', 'disconnectTime', 'doneChargingTime'):
    df[time_col] = pd.to_datetime(df[time_col])

# Identifier columns are stored as strings.
# NOTE(review): astype(str) turns missing values into the literal text 'nan';
# check whether any of these ID columns contain gaps.
for id_col in ('sessionID', 'siteID', 'spaceID', 'stationID', 'userID'):
    df[id_col] = df[id_col].astype(str)

In [None]:
# The first CSV column contains an index; name it 'Index' and use it as the
# DataFrame index.
# The guard keeps this cell safe to re-run: once the index has been set,
# df.columns[0] is a real data column and must not be renamed and consumed
# as the index a second time.
if df.index.name != 'Index':
    df = df.rename(columns={df.columns[0]: 'Index'}).set_index('Index')

In [None]:
# sessionID is a composite of stationID and connectionTime and is therefore redundant.
# timezone contains only one value and can therefore be considered metadata.
redundant_columns = ['sessionID', 'timezone']
df = df.drop(columns=redundant_columns)

In [None]:


# Time between the end of charging and the disconnect, as a timedelta and in minutes.
idle_span = df['disconnectTime'] - df['doneChargingTime']
df['NoChargingTime'] = idle_span
df['NoChargingTimeMinutes'] = idle_span.dt.total_seconds().div(60.0)  # seconds -> minutes

# Time between the connection and the end of charging, as a timedelta and in minutes.
charge_span = df['doneChargingTime'] - df['connectionTime']
df['ChargingTime'] = charge_span
df['ChargingTimeMinutes'] = charge_span.dt.total_seconds().div(60.0)  # seconds -> minutes

# Month number of the connection timestamp.
df['month'] = df['connectionTime'].dt.month

In [None]:
# Index the sessions by connection time so they can be resampled by period.
exploration = df.set_index('connectionTime')
# 'M' resamples to month-end periods; the delivered energy is summed per month.
exploration_monthly = exploration['kWhDelivered'].resample('M').sum()

# Bar chart of the monthly totals.
fig, ax = plt.subplots(figsize=(12, 6))
exploration_monthly.plot.bar(ax=ax)
ax.set_title('Monthly Sum of kWhDelivered')
ax.set_xlabel('Month')
ax.set_ylabel('Sum of kWhDelivered')
plt.show()

In [None]:
# Rolling mean of the delivered energy over a 7-day time window.
# Each row is one charging session, so an integer window (window=7) would
# average 7 consecutive sessions rather than 7 days. The '7D' offset makes the
# window time-based; it requires a sorted datetime index, hence sort_index().
# NOTE(review): assumes connectionTime contains no missing timestamps (NaT),
# which a time-based window cannot handle -- confirm.
kwh_by_time = exploration['kWhDelivered'].sort_index()
rolling_mean = kwh_by_time.rolling(window='7D').mean()

# Plot the original time series and the rolling mean
plt.figure(figsize=(12, 6))
plt.plot(kwh_by_time, label='Original Data')
plt.plot(rolling_mean, label='Rolling Mean (7 days)', color='red')
plt.title('Original Time Series and Rolling Mean')
plt.xlabel('Date')
plt.ylabel('kWhDelivered')
plt.legend()
plt.show()

### Save adjusted dataframe as CSV

In [None]:
#convert to csv
#df.to_csv('data.csv')

### Handle Missing and Erroneous Data

In [None]:
# TODO:
# - Fill Gap From 2020
# - Handle too long charges / unrealistic charge volumes (remove / fill)
#   - we have to find a cut-off value; I tried something but I don't know if it's right
# - Handle Outliers for charging
# - Expand User Inputs column > drop user ID from user Inputs column, since its redundant

### Calculate the Time a Car used the Station without Charging

In [None]:
# Plot, per row, the minutes between the end of charging and the disconnect.
# The title previously read 'Time Charging' (copied from the charging-time
# plot), which mislabelled this figure.
plt.figure(figsize=(10, 6))
plt.plot(df['NoChargingTimeMinutes'], marker='o', linestyle='-', color='b')
plt.title('Time without Charging')
plt.xlabel('Row Index')
plt.ylabel('Time (minutes)')
plt.grid(True)
plt.show()


### Calculate the Charging Time

In [None]:
# Charging duration of every session in minutes, drawn against the row index.
charging_minutes = df['ChargingTimeMinutes']

plt.figure(figsize=(10, 6))
plt.plot(charging_minutes, color='b', linestyle='-', marker='o')
plt.grid(True)
plt.title('Time Charging')
plt.xlabel('Row Index')
plt.ylabel('Time (minutes)')
plt.show()

### Calculate Outliers with the IQR

In [None]:
# Quartiles and interquartile range (IQR) of the charging duration in minutes.
charging_minutes = df['ChargingTimeMinutes']
Q1, Q3 = charging_minutes.quantile([0.25, 0.75])
IQR = Q3 - Q1

# Multiplier applied to the IQR to place the outlier fence (e.g. 1.5 times IQR).
threshold = 1.5
upper_fence = Q3 + threshold * IQR

# Only the upper fence is applied: rows whose charging time exceeds it are kept.
outliers = df.loc[charging_minutes > upper_fence]

# Display the outliers for further analysis.
print("Outliers based on IQR:")
print(outliers)

### Split the Stations into a New Column `station` with the Entry 1 or 2

In [None]:
def _station_group(station_id):
    """Return '1' or '2' when the ID text starts with that digit, otherwise None."""
    if station_id.startswith('1'):
        return '1'
    if station_id.startswith('2'):
        return '2'
    return None


# New column 'station': group label derived from the leading character of stationID.
df['station'] = df['stationID'].astype(str).map(_station_group)

In [None]:
# Scatter the charging duration per row, station 1 in blue and station 2 in red.
plt.figure(figsize=(10, 6))

station_styles = (
    ('1', 'blue', 'Station 1'),
    ('2', 'red', 'Station 2'),
)
for station_label, point_color, legend_label in station_styles:
    station_rows = df[df['station'] == station_label]
    plt.scatter(station_rows.index, station_rows['ChargingTimeMinutes'], color=point_color, label=legend_label)

plt.title('Charging Times for Stations 1 and 2')
plt.xlabel('Row Index')
plt.ylabel('Time Difference (minutes)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Scatter the time without charging per row, station 1 in blue and station 2 in red.
plt.figure(figsize=(10, 6))

station_styles = (
    ('1', 'blue', 'Station 1'),
    ('2', 'red', 'Station 2'),
)
for station_label, point_color, legend_label in station_styles:
    station_rows = df[df['station'] == station_label]
    plt.scatter(station_rows.index, station_rows['NoChargingTimeMinutes'], color=point_color, label=legend_label)

plt.title('Time without Charging for Stations 1 and 2')
plt.xlabel('Row Index')
plt.ylabel('Time Difference (minutes)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Usage frequency of each station: number of rows per 'station' label.
# A histogram bins numeric ranges and is not meant for a two-valued label
# column; counting the rows and drawing one bar per station shows the same
# frequencies directly, centred on the station tick.
# reindex keeps both stations on the axis even if one has no rows.
station_counts = df['station'].value_counts().reindex(['1', '2'], fill_value=0)

plt.figure(figsize=(8, 5))

# Bar for station 1
plt.bar('1', station_counts['1'], color='blue', alpha=0.7, label='Station 1')

# Bar for station 2
plt.bar('2', station_counts['2'], color='red', alpha=0.7, label='Station 2')

plt.title('Usage Frequency of Stations 1 and 2')
plt.xlabel('Station ID')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Plot the usage frequency of the stations for every month.
# With bins=12 each hist() call derives its 12 bin edges from the min/max of
# the data it receives, so the bars of the two stations need not line up with
# each other or with the month numbers. Explicit edges give one bin per month,
# centred on the month number.
month_edges = [month - 0.5 for month in range(1, 14)]  # 0.5, 1.5, ..., 12.5

plt.figure(figsize=(12, 6))

# Histogram for station 1
plt.hist(df[df['station'] == '1']['month'], bins=month_edges, color='blue', alpha=0.7, label='Station 1')

# Histogram for station 2
plt.hist(df[df['station'] == '2']['month'], bins=month_edges, color='red', alpha=0.7, label='Station 2')

plt.title('Usage Frequency of Stations 1 and 2 for Each Month')
plt.xlabel('Month')
plt.ylabel('Frequency')
plt.xticks(range(1, 13))
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Parse the weather timestamps so the .dt accessor can be used on them.
df_weather['timestamp'] = pd.to_datetime(df_weather['timestamp'])

# Mean temperature per calendar day, returned as a two-column frame
# ('timestamp' holding the date, 'temperature' holding the daily mean).
weather_day = df_weather['timestamp'].dt.date
daily_temperature = df_weather.groupby(weather_day)['temperature'].mean()
df_weather_mean_per_day = daily_temperature.reset_index()

# Show the resulting DataFrame.
print(df_weather_mean_per_day)

In [None]:
df_weather_mean_per_day['timestamp'] = pd.to_datetime(df_weather_mean_per_day['timestamp'])

# Attach the daily mean temperature to each session by joining on the full
# calendar date. Joining on the day of the month (1-31) matched every session
# with the weather rows of every month and year that share that day number,
# which multiplied rows and mixed in temperatures from unrelated dates.
# NOTE(review): the dates are taken from the timestamps as stored; if
# connectionTime and the weather timestamps are in different time zones the
# day boundaries will not line up -- confirm.
session_date = df['connectionTime'].dt.date
weather_date = df_weather_mean_per_day['timestamp'].dt.date
merged_df = pd.merge(df, df_weather_mean_per_day, left_on=session_date, right_on=weather_date, how='inner')
print(merged_df[merged_df['station'] == '1'])

# Histogram of the daily mean temperature over the sessions of each station.
plt.figure(figsize=(12, 6))

plt.hist(merged_df[merged_df['station'] == '1']['temperature'], bins=20, color='blue', alpha=0.7, label='Station 1')

plt.hist(merged_df[merged_df['station'] == '2']['temperature'], bins=20, color='red', alpha=0.7, label='Station 2')

plt.title('Frequency of the stations at different temperatures')
plt.xlabel('temperature')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# TODO
# Check what the average values look like --> maybe the distributions follow from them
# Look at how often a station is used per day on average
