# Analytics & Applications WS23/24 Project

The following notebook contains the applied steps of the CRISP-DM model for the
project exercise from "Analytics and Applications" by Prof. Ketter in the WS23/24 at the University of Cologne.

The following steps will be done in the specified order:
1. Data Preparation
2. Modeling 
3. Evaluation

## Imports & Dependencies

In [220]:
import pandas as pd
import matplotlib.pyplot as plt

## Import Data

In [221]:
# Load the two raw inputs: weather observations and charging sessions
weather_csv_path = "data/weather_burbank_airport.csv"
charging_csv_path = "data/charging_sessions.csv"

df_weather, df_charging = (
    pd.read_csv(csv_path) for csv_path in (weather_csv_path, charging_csv_path)
)

In [222]:
# Limit the weather set to the relevant time frame of the charging sessions
start_date = df_charging['connectionTime'].min()
end_date = df_charging['connectionTime'].max()

# Both columns are still plain strings at this point (the CSVs are read without date
# parsing), so comparing them directly would be a lexicographic comparison between two
# differently formatted strings. Compare real datetimes instead; the 'timestamp' column
# itself is left untouched.
# NOTE(review): weather timestamps without an offset are interpreted as UTC here, which
# matches the later cast of 'timestamp' to datetime64[ns, UTC] - confirm for the raw data.
weather_times = pd.to_datetime(df_weather['timestamp'], utc=True)
window_start = pd.to_datetime(start_date, utc=True)
window_end = pd.to_datetime(end_date, utc=True)

df_weather = df_weather[(weather_times >= window_start) & (weather_times <= window_end)]

## Data Exploration and Preparation

### Basic Data Exploration

In [223]:
# Preview the first five rows of the time-filtered weather data
display(df_weather.head(n=5))

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
3037,Burbank,2018-04-25 11:53:00,12.0,27.0,Mostly Cloudy,989.11,6.0,0.0,12.0
3038,Burbank,2018-04-25 12:10:00,12.0,27.0,Mostly Cloudy,989.11,7.0,0.0,12.0
3039,Burbank,2018-04-25 12:53:00,12.0,28.0,Mostly Cloudy,989.11,9.0,0.0,12.0
3040,Burbank,2018-04-25 13:24:00,12.0,20.0,Fog,989.44,9.0,0.0,12.0
3041,Burbank,2018-04-25 13:53:00,12.0,20.0,Fog,989.44,7.0,0.0,12.0


### Data Types


In [224]:
# Column dtypes as read from the CSV
display(df_weather.dtypes)

# All measurement columns are stored as 64-bit floats
float_columns = [
    "temperature",
    "cloud_cover",
    "pressure",
    "windspeed",
    "precipitation",
    "felt_temperature",
]

# Target dtype for every column of the weather frame
weather_dtypes = {
    "city": "category",
    "timestamp": "datetime64[ns, UTC]",
    "cloud_cover_description": "string",
    **dict.fromkeys(float_columns, "float64"),
}

df_weather = df_weather.astype(weather_dtypes)

# Column dtypes after the conversion
display(df_weather.dtypes)

city                        object
timestamp                   object
temperature                float64
cloud_cover                float64
cloud_cover_description     object
pressure                   float64
windspeed                  float64
precipitation              float64
felt_temperature           float64
dtype: object

city                                  category
timestamp                  datetime64[ns, UTC]
temperature                            float64
cloud_cover                            float64
cloud_cover_description         string[python]
pressure                               float64
windspeed                              float64
precipitation                          float64
felt_temperature                       float64
dtype: object

In [233]:
# Show the first rows of the weather dataframe
display(df_weather.head())

# Show only the observations with a precipitation value above 0
has_precipitation = df_weather['precipitation'] > 0
display(df_weather.loc[has_precipitation])

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
3037,Burbank,2018-04-25 11:53:00+00:00,12.0,27.0,Mostly Cloudy,989.11,6.0,0.0,12.0
3038,Burbank,2018-04-25 12:10:00+00:00,12.0,27.0,Mostly Cloudy,989.11,7.0,0.0,12.0
3039,Burbank,2018-04-25 12:53:00+00:00,12.0,28.0,Mostly Cloudy,989.11,9.0,0.0,12.0
3040,Burbank,2018-04-25 13:24:00+00:00,12.0,20.0,Fog,989.44,9.0,0.0,12.0
3041,Burbank,2018-04-25 13:53:00+00:00,12.0,20.0,Fog,989.44,7.0,0.0,12.0


Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
3210,Burbank,2018-05-01 15:53:00+00:00,11.0,11.0,Light Rain,983.52,19.0,0.25,11.0
3495,Burbank,2018-05-12 12:11:00+00:00,13.0,11.0,Light Rain,984.83,13.0,0.25,13.0
3496,Burbank,2018-05-12 12:53:00+00:00,13.0,11.0,Light Rain,985.16,17.0,0.25,13.0
3498,Burbank,2018-05-12 13:53:00+00:00,12.0,26.0,Cloudy,985.49,15.0,0.25,12.0
3755,Burbank,2018-05-21 18:24:00+00:00,14.0,11.0,Light Rain,984.17,11.0,0.51,14.0
...,...,...,...,...,...,...,...,...,...
29159,Burbank,2020-12-28 23:17:00+00:00,8.0,40.0,Heavy Rain,982.86,17.0,0.51,6.0
29160,Burbank,2020-12-28 23:53:00+00:00,8.0,11.0,Light Rain,983.19,0.0,3.30,8.0
29161,Burbank,2020-12-29 00:00:00+00:00,8.0,12.0,Rain,983.19,0.0,0.25,8.0
29162,Burbank,2020-12-29 00:53:00+00:00,7.0,11.0,Light Rain,983.19,13.0,1.78,4.0


In [226]:
# Count the rows per city to see whether the dataset covers more than one city
city_counts = df_weather['city'].value_counts()
print(city_counts)

city
Burbank    26207
Name: count, dtype: int64


### Missing Values

In [227]:
# Number of missing values per column
null_counts = df_weather.isna().sum()
display(null_counts)

city                        0
timestamp                   0
temperature                25
cloud_cover                20
cloud_cover_description    20
pressure                    8
windspeed                  81
precipitation               0
felt_temperature           26
dtype: int64

In [228]:
# First and last observation timestamp; they bound the hourly grid that is used
# below to look for hours without an observation
weather_timestamps = df_weather['timestamp']
min_date, max_date = weather_timestamps.min(), weather_timestamps.max()

def get_missing_hours(df, min_date, max_date):
    """Return the hourly grid points between two dates that have no matching row in ``df``.

    The grid starts exactly at ``min_date`` and advances in one-hour steps up to
    ``max_date``, so every grid point carries the minute/second offset of ``min_date``.
    A grid point only counts as present if ``df['timestamp']`` contains exactly that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column.
    min_date, max_date
        Start and end passed to ``pandas.date_range``.

    Returns
    -------
    pandas.DatetimeIndex
        Grid timestamps without an exact match in ``df['timestamp']``.
    """
    # Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is deprecated in
    # recent pandas releases and emits a FutureWarning there.
    hourly_grid = pd.date_range(start=min_date, end=max_date, freq='h')
    return hourly_grid[~hourly_grid.isin(df['timestamp'])]

# Hourly grid points between min_date and max_date without an exactly matching observation
missing_hours = get_missing_hours(df_weather, min_date, max_date)
print(f"Missing hours: {len(missing_hours)}")

# Build placeholder rows for the missing hours. Only timestamp and city are known; all
# other columns are added empty and then cast to the dtypes of df_weather, so they hold
# typed missing values (NaN / <NA>) instead of object-dtype None.
# Concatenating all-None object columns makes pandas emit a FutureWarning, and with the
# announced future behaviour the numeric columns would be upcast to object, which would
# break the numeric interpolation of these columns.
missing_df = (
    pd.DataFrame({'timestamp': missing_hours, 'city': 'Burbank'})
    .reindex(columns=df_weather.columns)
    .astype(df_weather.dtypes.to_dict())
)

# Merge the placeholder rows with the original dataframe (nothing to merge if no hour
# is missing) and restore chronological order
if not missing_df.empty:
    df_weather = pd.concat([df_weather, missing_df])
df_weather = df_weather.sort_values(by='timestamp')
display(df_weather.isna().sum())

Missing hours: 93


  df_weather = pd.concat([df_weather, missing_df])


city                         0
timestamp                    0
temperature                118
cloud_cover                113
cloud_cover_description    113
pressure                   101
windspeed                  174
precipitation               93
felt_temperature           119
dtype: int64

In [229]:
# The numerical measurement columns are filled by interpolating between the
# neighbouring rows (pandas default interpolation settings)
numeric_columns = [
    'temperature',
    'cloud_cover',
    'pressure',
    'windspeed',
    'precipitation',
    'felt_temperature',
]
interpolated_values = df_weather[numeric_columns].interpolate()
df_weather[numeric_columns] = interpolated_values

# Remaining missing values per column
display(df_weather.isna().sum())

city                         0
timestamp                    0
temperature                  0
cloud_cover                  0
cloud_cover_description    113
pressure                     0
windspeed                    0
precipitation                0
felt_temperature             0
dtype: int64

In [230]:
# Textual columns cannot be interpolated; carry the last known value forward instead
categorical_columns = ['cloud_cover_description']
forward_filled = df_weather[categorical_columns].ffill()
df_weather[categorical_columns] = forward_filled

# Remaining missing values per column
display(df_weather.isna().sum())

city                       0
timestamp                  0
temperature                0
cloud_cover                0
cloud_cover_description    0
pressure                   0
windspeed                  0
precipitation              0
felt_temperature           0
dtype: int64

In [231]:
# Preview the first five rows of the cleaned weather data
display(df_weather.head(n=5))

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
3037,Burbank,2018-04-25 11:53:00+00:00,12.0,27.0,Mostly Cloudy,989.11,6.0,0.0,12.0
3038,Burbank,2018-04-25 12:10:00+00:00,12.0,27.0,Mostly Cloudy,989.11,7.0,0.0,12.0
3039,Burbank,2018-04-25 12:53:00+00:00,12.0,28.0,Mostly Cloudy,989.11,9.0,0.0,12.0
3040,Burbank,2018-04-25 13:24:00+00:00,12.0,20.0,Fog,989.44,9.0,0.0,12.0
3041,Burbank,2018-04-25 13:53:00+00:00,12.0,20.0,Fog,989.44,7.0,0.0,12.0


In [232]:
# Persist the prepared weather data both as a pickle and as a CSV file
pickle_path = 'data/weather_modified.pkl'
csv_path = 'data/weather_modified.csv'

df_weather.to_pickle(pickle_path)
df_weather.to_csv(csv_path)