# 2.2 Complex machine learning models - keras - CNN
### The following script contains the following:

#### 1. Import libraries, additional requirements
#### 2. Import Data
#### 3. Clean the data
#### 4.  Data preprocessing
#### 5. Build and run CNN keras model
#### 6. Run confusion matrix
        - check accuracy and loss
---------------------------------------------------------------------------------------------------------------------------
## 1. Import libraries, additional requirements
---------------------------------------------------------------------------------------------------------------------------

In [325]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import operator
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from numpy import unique
from numpy import reshape
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Conv1D, Conv2D, Dense, BatchNormalization, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.utils import to_categorical

In [327]:
# Turning off warning feature
import warnings
warnings.filterwarnings('ignore')

# Supress scientific notation for easier analysis profiling
pd.set_option('display.float_format', '{:.2f}'.format)

# Set display options to show all columns without truncation
pd.set_option('display.max_columns', None)

---------------------------------------------------------------------------------------------------------------------
# 2. Import Data

In [377]:
# Create a path for importing
path = r'C:\Users\shrav\_Data_Analysis_CF\ML_ClimateWins_Project'

In [379]:
#Read in the European weather data.
weather = pd.read_csv(os.path.join(path,'02 Data', 'Original_Data', 'Dataset-weather-prediction-dataset-processed.csv'))
weather

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,BELGRADE_humidity,BELGRADE_pressure,BELGRADE_global_radiation,BELGRADE_precipitation,BELGRADE_sunshine,BELGRADE_temp_mean,BELGRADE_temp_min,BELGRADE_temp_max,BUDAPEST_cloud_cover,BUDAPEST_humidity,BUDAPEST_pressure,BUDAPEST_global_radiation,BUDAPEST_precipitation,BUDAPEST_sunshine,BUDAPEST_temp_mean,BUDAPEST_temp_min,BUDAPEST_temp_max,DEBILT_cloud_cover,DEBILT_wind_speed,DEBILT_humidity,DEBILT_pressure,DEBILT_global_radiation,DEBILT_precipitation,DEBILT_sunshine,DEBILT_temp_mean,DEBILT_temp_min,DEBILT_temp_max,DUSSELDORF_cloud_cover,DUSSELDORF_wind_speed,DUSSELDORF_humidity,DUSSELDORF_pressure,DUSSELDORF_global_radiation,DUSSELDORF_precipitation,DUSSELDORF_snow_depth,DUSSELDORF_sunshine,DUSSELDORF_temp_mean,DUSSELDORF_temp_min,DUSSELDORF_temp_max,GDANSK_cloud_cover,GDANSK_humidity,GDANSK_precipitation,GDANSK_snow_depth,GDANSK_temp_mean,GDANSK_temp_min,GDANSK_temp_max,HEATHROW_cloud_cover,HEATHROW_humidity,HEATHROW_pressure,HEATHROW_global_radiation,HEATHROW_precipitation,HEATHROW_snow_depth,HEATHROW_sunshine,HEATHROW_temp_mean,HEATHROW_temp_min,HEATHROW_temp_max,KASSEL_wind_speed,KASSEL_humidity,KASSEL_pressure,KASSEL_global_radiation,KASSEL_precipitation,KASSEL_sunshine,KASSEL_temp_mean,KASSEL_temp_min,KASSEL_temp_max,LJUBLJANA_cloud_cover,LJUBLJANA_wind_speed,LJUBLJANA_humidity,LJUBLJANA_pressure,LJUBLJANA_global_radiation,LJUBLJANA_precipitation,LJUBLJANA_sunshine,LJUBLJANA_temp_mean,LJUBLJANA_temp_min,LJUBLJANA_temp_max,MAASTRICHT_cloud_cover,MAASTRICHT_wind_speed,MAASTRICHT_humidity,MAASTRICHT_pressure,MAASTRICHT_global_radiation,MAASTRICHT_precipitation,MAASTRICHT_sunshine,MAASTRICHT_temp_mean,MAASTRICHT_temp_min,MAASTRICHT_temp_max,MADRID_cloud_cover,MADRID_wind_speed,MADRID_humidity,MADRID_pressure,MADRID_global_radiation,MADRID_precipitation,MADRID_sunshine,MADRID_temp_mean,MADRID_temp_min,MADRID_temp_max,MUNCHENB_cloud_cover,MUNCHENB_humidity,MUNCHENB_global_radiation,MUNCHENB_precipitation,MUNCHENB_snow_depth,MUNCHENB_sunshine,MUNCHENB_temp_mean,MUNCHENB_temp_min,MUNCHENB_temp_max,OSLO_cloud_cover,OSLO_wind_speed,OSLO_humidity,OSLO_pressure,OSLO_global_radiation,OSLO_precipitation,OSLO_snow_depth,OSLO_sunshine,OSLO_temp_mean,OSLO_temp_min,OSLO_temp_max,ROMA_cloud_cover,ROMA_wind_speed,ROMA_humidity,ROMA_pressure,ROMA_sunshine,ROMA_temp_mean,SONNBLICK_cloud_cover,SONNBLICK_wind_speed,SONNBLICK_humidity,SONNBLICK_pressure,SONNBLICK_global_radiation,SONNBLICK_precipitation,SONNBLICK_sunshine,SONNBLICK_temp_mean,SONNBLICK_temp_min,SONNBLICK_temp_max,STOCKHOLM_cloud_cover,STOCKHOLM_pressure,STOCKHOLM_global_radiation,STOCKHOLM_precipitation,STOCKHOLM_sunshine,STOCKHOLM_temp_mean,STOCKHOLM_temp_min,STOCKHOLM_temp_max,TOURS_wind_speed,TOURS_humidity,TOURS_pressure,TOURS_global_radiation,TOURS_precipitation,TOURS_temp_mean,TOURS_temp_min,TOURS_temp_max,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.10,0.85,1.02,0.32,0.09,0,0.70,6.50,0.80,10.90,1,0.81,1.02,0.88,0.00,7.00,3.70,-0.90,7.90,4,0.67,1.02,0.44,0.01,2.30,2.40,-0.40,5.10,7,7.70,0.85,1.00,0.07,0.25,0.00,9.30,7.40,11.00,8,5.40,0.83,1.02,0.12,0.08,0,0.00,10.00,7.00,11.50,8,0.91,0.00,0,0.80,-0.30,1.60,7,0.91,1.00,0.13,0.22,0,0.00,10.60,9.40,8.30,2.90,0.82,1.01,0.28,0.48,1.60,7.90,3.90,9.40,8,1.40,1.00,1.02,0.20,0.00,0.00,-0.60,-1.90,0.50,7,8.70,0.83,1.01,0.22,0.32,1.00,9.50,8.50,11.10,6,0.00,0.92,1.03,0.53,0.00,1.40,7.60,4.40,10.80,5,0.67,0.20,0.10,0,0.00,6.90,1.10,10.40,8,4.00,0.98,1.00,0.04,1.14,0,0.00,4.90,3.80,5.90,3,2.60,0.73,1.02,7.10,7.80,4,4.50,0.73,1.03,0.48,0.01,2.30,-5.90,-8.50,-3.20,5,1.01,0.05,0.32,0.00,4.20,2.20,4.90,3.80,0.76,1.02,1.54,0.44,10.00,7.80,12.20,5,0.88,1.00,0.45,0.34,0,4.70,8.50,6.00,10.90
1,19600102,1,6,2.10,0.84,1.02,0.36,1.05,0,1.10,6.10,3.30,10.10,6,0.84,1.02,0.25,0.00,0.00,2.90,2.20,4.40,4,0.67,1.02,0.18,0.31,0.00,2.30,1.40,3.10,8,4.10,0.90,1.01,0.14,0.06,0.10,7.70,6.40,8.30,8,3.60,0.89,1.02,0.18,0.66,0,0.50,8.20,7.40,11.00,8,0.93,0.08,0,1.60,0.90,2.20,7,0.98,1.01,0.13,0.23,0,0.00,6.10,3.90,10.60,1.90,0.86,1.01,0.12,0.27,0.00,7.70,6.80,9.10,6,1.40,0.94,1.02,0.56,0.13,3.20,2.10,-1.30,5.50,8,5.70,0.92,1.01,0.17,1.34,0.40,8.60,7.50,9.90,7,0.80,0.86,1.03,0.46,0.00,0.90,9.80,7.40,12.20,6,0.72,0.61,0.30,0,5.10,6.20,4.20,10.20,8,5.10,0.62,1.01,0.04,0.00,0,0.00,3.40,2.80,4.90,3,2.60,0.73,1.02,7.10,12.20,6,6.70,0.97,1.03,0.21,0.61,0.00,-9.50,-10.50,-8.50,5,1.01,0.05,0.06,0.00,4.00,3.00,5.00,3.80,0.76,1.02,1.54,0.71,9.50,7.00,12.00,7,0.91,1.00,0.25,0.84,0,0.70,8.90,5.60,12.10
2,19600103,1,8,2.10,0.90,1.02,0.18,0.30,0,0.00,8.50,5.10,9.90,6,0.77,1.02,0.67,0.00,3.50,3.10,-0.50,6.40,4,0.67,1.02,0.30,0.00,0.60,2.70,1.70,5.30,6,3.10,0.92,1.02,0.28,0.01,3.00,6.80,4.60,9.90,7,3.10,0.95,1.02,0.12,0.07,0,0.00,7.10,6.90,9.10,8,0.94,0.15,0,0.70,0.40,1.70,8,0.96,1.02,0.15,0.07,0,0.10,8.40,6.10,12.20,1.30,0.91,1.01,0.12,0.60,0.00,6.50,6.00,8.00,8,1.40,0.96,1.02,0.20,0.12,0.00,4.60,0.90,6.30,7,3.60,0.97,1.02,0.12,0.46,0.00,6.90,5.50,9.90,5,1.90,0.90,1.03,0.63,0.00,2.30,8.60,6.40,10.80,6,0.91,0.20,0.30,0,0.00,5.80,4.00,8.00,8,2.30,0.69,1.02,0.04,0.08,0,0.00,1.90,0.60,3.10,3,2.60,0.73,1.02,7.10,10.20,8,7.50,0.93,1.03,0.21,3.20,0.00,-9.50,-10.00,-8.90,5,1.01,0.05,0.02,0.00,2.40,1.30,4.10,3.80,0.76,1.02,1.54,0.10,10.30,9.00,11.60,7,0.91,1.01,0.17,0.08,0,0.10,10.50,8.10,12.90
3,19600104,1,3,2.10,0.92,1.02,0.58,0.00,0,4.10,6.30,3.80,10.60,8,0.93,1.03,0.25,0.00,0.00,2.00,-2.00,3.00,4,0.67,1.02,0.19,0.00,0.00,2.00,0.40,4.40,8,5.70,0.95,1.03,0.08,0.09,0.00,6.70,3.60,10.10,8,2.70,0.86,1.02,0.12,0.02,0,0.00,6.80,3.60,8.00,7,0.91,0.00,0,-0.10,-0.90,2.00,8,0.98,1.02,0.13,0.00,0,0.00,9.40,6.70,8.90,1.40,0.87,1.03,0.12,0.00,0.00,5.80,5.20,6.50,6,1.40,0.94,1.02,0.49,0.00,2.20,3.20,1.00,7.00,7,5.10,0.89,1.03,0.16,0.00,0.30,7.00,3.00,10.00,0,1.10,0.75,1.03,1.16,0.00,8.70,10.30,4.50,16.10,6,0.90,0.20,0.01,0,0.00,3.90,3.20,5.40,8,3.90,0.98,1.02,0.04,0.35,0,0.00,3.00,0.40,4.90,3,2.60,0.73,1.02,7.10,10.80,5,7.50,0.93,1.04,0.22,1.10,0.00,-11.50,-12.90,-10.00,5,1.01,0.05,0.00,0.00,1.20,0.40,2.30,3.80,0.76,1.02,1.54,0.10,11.20,9.90,12.60,7,0.86,1.02,0.13,0.98,0,0.00,7.40,7.30,10.60
4,19600105,1,6,2.10,0.95,1.02,0.65,0.14,0,5.40,3.00,-0.70,6.00,8,0.99,1.03,0.25,0.06,0.00,2.00,0.70,2.80,4,0.67,1.02,0.19,0.00,0.00,2.50,1.10,5.30,6,6.70,0.90,1.02,0.04,0.39,0.00,8.00,2.40,11.20,7,4.50,0.92,1.02,0.12,0.62,0,0.00,7.70,6.20,11.00,7,0.94,0.17,0,0.40,-1.70,1.50,5,0.84,1.03,0.30,0.00,0,2.10,8.90,8.90,7.20,2.90,0.86,1.03,0.13,0.71,0.00,5.40,3.70,6.00,7,1.40,0.94,1.02,0.20,0.00,0.00,3.60,0.40,4.80,7,6.20,0.92,1.03,0.12,0.56,0.00,8.10,2.50,11.10,2,1.10,0.64,1.03,1.10,0.00,7.80,12.10,8.20,16.00,5,0.85,0.65,0.96,0,5.60,1.80,-3.00,6.00,8,1.50,0.96,1.01,0.05,0.26,0,0.00,3.70,2.90,4.90,3,2.60,0.73,1.02,7.10,9.90,2,10.20,0.75,1.04,0.72,0.01,6.10,-9.30,-12.00,-6.50,5,1.01,0.05,1.32,0.00,3.30,0.80,4.30,3.80,0.76,1.02,1.54,0.00,11.40,10.70,12.00,3,0.80,1.03,0.46,0.00,0,5.70,5.70,3.00,8.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,20221027,10,1,2.10,0.79,1.02,1.34,0.22,0,7.70,15.90,11.40,21.40,2,0.68,1.03,1.57,0.18,5.70,18.20,12.10,24.40,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,3.00,0.84,1.02,1.13,0.22,2.80,15.70,12.80,19.40,8,6.00,0.75,1.02,1.13,0.20,0,6.40,17.80,13.60,21.40,6,0.70,0.12,0,11.50,8.00,15.00,5,0.87,1.01,1.18,0.16,0,1.90,16.40,11.90,18.90,2.40,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,4,0.10,0.80,1.03,1.35,0.37,5.90,14.70,12.10,21.10,8,3.80,0.67,1.02,1.17,0.20,5.30,18.60,14.10,22.60,8,1.70,0.52,1.02,1.89,0.12,5.30,20.00,16.20,23.90,2,0.76,1.37,0.26,0,9.70,14.30,8.30,22.20,8,1.10,0.98,1.01,1.06,0.21,0,0.10,9.70,5.80,12.00,3,1.00,0.73,1.02,7.10,15.40,2,5.70,0.84,1.03,1.56,0.47,4.70,0.60,-1.40,2.60,5,1.02,1.11,0.14,3.20,11.50,8.20,14.20,3.70,0.00,1.02,1.54,0.18,19.90,14.20,25.70,5,0.82,1.01,1.13,0.41,0,3.40,10.70,7.90,13.50
22946,20221028,10,6,2.10,0.77,1.02,1.34,0.22,0,5.40,16.70,14.30,21.90,0,0.68,1.03,1.57,0.18,5.70,15.90,10.60,21.20,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,2.90,0.84,1.02,1.13,0.22,3.50,16.00,10.30,20.50,7,5.20,0.71,1.02,1.13,0.20,0,4.90,19.40,15.40,23.90,6,0.70,0.12,0,11.70,7.90,15.50,4,0.82,1.02,1.18,0.16,0,4.20,15.80,12.70,21.80,2.40,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,3,0.10,0.82,1.03,1.35,0.37,4.50,12.90,9.80,19.80,7,3.90,0.70,1.02,1.17,0.20,5.00,18.90,15.80,23.50,8,2.20,0.51,1.02,1.89,0.12,3.90,19.10,14.70,23.50,6,0.70,1.37,0.26,0,7.70,16.10,8.90,26.10,8,1.50,1.00,1.01,1.06,0.21,0,0.00,10.90,8.80,11.70,3,1.20,0.73,1.02,7.10,15.40,5,4.50,0.84,1.03,1.56,0.47,4.70,2.30,0.60,4.00,5,1.01,1.11,0.14,0.80,12.50,11.00,14.30,3.20,0.00,1.02,1.54,0.18,20.30,16.50,24.20,5,0.82,1.01,1.13,0.41,0,3.40,10.70,7.90,13.50
22947,20221029,10,4,2.10,0.76,1.02,1.34,0.22,0,6.10,16.70,13.10,22.40,2,0.68,1.03,1.57,0.18,5.70,13.40,8.60,18.20,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,2.20,0.86,1.02,1.13,0.22,3.30,15.80,9.30,21.10,8,4.40,0.73,1.02,1.13,0.20,0,4.00,18.20,13.40,22.00,7,0.70,0.12,0,14.20,11.50,16.90,7,0.85,1.01,1.18,0.16,0,4.20,16.50,11.20,17.00,2.40,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,3,0.10,0.81,1.03,1.35,0.37,5.10,13.20,10.20,20.70,8,2.50,0.69,1.02,1.17,0.20,3.20,18.20,13.70,24.30,8,1.90,0.46,1.02,1.89,0.12,8.10,19.00,15.40,22.60,7,0.64,1.37,0.26,0,6.80,17.40,11.20,26.20,3,2.10,0.85,1.01,1.06,0.21,0,6.80,9.70,7.70,14.20,3,1.50,0.73,1.02,7.10,15.40,3,3.70,0.84,1.03,1.56,0.47,4.70,3.30,2.10,4.50,5,1.01,1.11,0.14,6.90,13.10,12.10,14.40,3.70,0.00,1.02,1.54,0.18,20.60,16.70,24.50,5,0.82,1.01,1.13,0.41,0,3.40,10.70,7.90,13.50
22948,20221030,10,5,2.10,0.80,1.02,1.34,0.22,0,5.80,15.40,11.60,21.10,1,0.68,1.02,1.57,0.18,5.70,15.00,9.10,20.90,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,1.80,0.87,1.02,1.13,0.22,6.00,14.40,10.30,20.20,7,3.90,0.73,1.02,1.13,0.20,0,6.90,16.70,11.90,21.10,6,0.70,0.12,0,11.00,7.50,14.60,5,0.86,1.01,1.18,0.16,0,0.60,15.20,13.40,17.50,2.40,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,3,0.00,0.77,1.02,1.35,0.37,5.70,14.00,10.00,23.10,8,2.80,0.73,1.02,1.17,0.20,6.80,16.30,12.80,21.40,5,1.10,0.66,1.02,1.89,0.12,3.10,15.70,13.10,18.30,6,0.75,1.37,0.26,0,8.30,14.50,9.20,23.50,5,1.20,0.94,1.01,1.06,0.21,0,2.90,5.90,2.10,8.10,3,1.20,0.73,1.02,7.10,15.40,3,7.20,0.84,1.03,1.56,0.47,4.70,3.40,2.70,4.10,5,1.02,1.11,0.14,8.40,7.50,5.10,12.40,2.00,0.00,1.02,1.54,0.18,15.90,12.40,19.40,5,0.82,1.01,1.13,0.41,0,3.40,10.70,7.90,13.50


In [381]:
outcomes = pd.read_csv(os.path.join(path, '02 Data', 'Original_Data','Dataset-Answers-Weather_Prediction_Pleasant_Weather.csv'))
outcomes

Unnamed: 0,DATE,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,19600101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,19600102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,19600103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,19600104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,19600105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,20221027,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22946,20221028,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22947,20221029,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22948,20221030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [383]:
# Choosing CNN model over RNN keras model

#Even though we wont be looking at any data with images, the CNN model works better than RNN because
#it also has advantages with analyzing numerical data,
#while the RNN model has advantages in text, writing, and speech.

---------------------------------------------------------------------------------------------------------------------------
## 3. Clean the data

In [386]:
# Find 2 observations that need to be dropped
#Two types of observations are missing multiple years for most weather stations. Remove them.
#Visualized in Tableau, cleaned in jupyter notebook.

In [388]:
# DATE and MONTH columns, 3 weather stations, #remove date and month too
df = weather.drop([
                         # GDANSK features
                         'GDANSK_cloud_cover', 'GDANSK_humidity',
                         'GDANSK_precipitation', 'GDANSK_snow_depth',
                         'GDANSK_temp_mean', 'GDANSK_temp_min', 'GDANSK_temp_max',
                         # ROMA features
                         'ROMA_cloud_cover', 'ROMA_humidity', 'ROMA_pressure',
                         'ROMA_sunshine', 'ROMA_temp_mean', 'ROMA_wind_speed',
                         # TOURS features
                         'TOURS_humidity', 'TOURS_pressure', 'TOURS_wind_speed',
                         'TOURS_global_radiation', 'TOURS_precipitation',
                         'TOURS_temp_mean', 'TOURS_temp_min', 'TOURS_temp_max'], axis=1)

In [390]:
df.shape

(22950, 149)

In [392]:
# Extract the different observation types

observation_types = ['cloud_cover', 'wind_speed', 'humidity', 'pressure',
                     'global_radiation', 'precipitation', 'snow_depth', 
                     'sunshine', 'temp_mean', 'temp_min', 'temp_max']

In [394]:
# Create a dictionary to store the count of stations for each observation type
station_counts = {}

for obs in observation_types:
    # Select columns related to the current observation type
    columns = [col for col in df.columns if col.endswith(obs)]
    
    # Count the number of stations (i.e., the number of columns) for the current observation type
    station_counts[obs] = len(columns)

# Print the count of stations for each observation type
print("Number of stations covered by each observation type:")
for obs, count in station_counts.items():
    print(f"{obs}: {count} stations")

Number of stations covered by each observation type:
cloud_cover: 14 stations
wind_speed: 9 stations
humidity: 14 stations
pressure: 14 stations
global_radiation: 15 stations
precipitation: 15 stations
snow_depth: 6 stations
sunshine: 15 stations
temp_mean: 15 stations
temp_min: 15 stations
temp_max: 15 stations


#### The two columns missing multiple entries for most stations are: wind_speed (only 9 stations) and snow_depth (only 6 stations).

In [397]:
# Remove 2 observations with the missing years and inconsistencies; 

#Drop columns ending with '_snow_depth'
df = df.drop(columns=df.filter(like='_snow_depth').columns)

#Drop columns ending with '_wind_speed'
df = df.drop(columns=df.filter(like='_wind_speed').columns)

In [399]:
df

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,BELGRADE_humidity,BELGRADE_pressure,BELGRADE_global_radiation,BELGRADE_precipitation,BELGRADE_sunshine,BELGRADE_temp_mean,BELGRADE_temp_min,BELGRADE_temp_max,BUDAPEST_cloud_cover,BUDAPEST_humidity,BUDAPEST_pressure,BUDAPEST_global_radiation,BUDAPEST_precipitation,BUDAPEST_sunshine,BUDAPEST_temp_mean,BUDAPEST_temp_min,BUDAPEST_temp_max,DEBILT_cloud_cover,DEBILT_humidity,DEBILT_pressure,DEBILT_global_radiation,DEBILT_precipitation,DEBILT_sunshine,DEBILT_temp_mean,DEBILT_temp_min,DEBILT_temp_max,DUSSELDORF_cloud_cover,DUSSELDORF_humidity,DUSSELDORF_pressure,DUSSELDORF_global_radiation,DUSSELDORF_precipitation,DUSSELDORF_sunshine,DUSSELDORF_temp_mean,DUSSELDORF_temp_min,DUSSELDORF_temp_max,HEATHROW_cloud_cover,HEATHROW_humidity,HEATHROW_pressure,HEATHROW_global_radiation,HEATHROW_precipitation,HEATHROW_sunshine,HEATHROW_temp_mean,HEATHROW_temp_min,HEATHROW_temp_max,KASSEL_humidity,KASSEL_pressure,KASSEL_global_radiation,KASSEL_precipitation,KASSEL_sunshine,KASSEL_temp_mean,KASSEL_temp_min,KASSEL_temp_max,LJUBLJANA_cloud_cover,LJUBLJANA_humidity,LJUBLJANA_pressure,LJUBLJANA_global_radiation,LJUBLJANA_precipitation,LJUBLJANA_sunshine,LJUBLJANA_temp_mean,LJUBLJANA_temp_min,LJUBLJANA_temp_max,MAASTRICHT_cloud_cover,MAASTRICHT_humidity,MAASTRICHT_pressure,MAASTRICHT_global_radiation,MAASTRICHT_precipitation,MAASTRICHT_sunshine,MAASTRICHT_temp_mean,MAASTRICHT_temp_min,MAASTRICHT_temp_max,MADRID_cloud_cover,MADRID_humidity,MADRID_pressure,MADRID_global_radiation,MADRID_precipitation,MADRID_sunshine,MADRID_temp_mean,MADRID_temp_min,MADRID_temp_max,MUNCHENB_cloud_cover,MUNCHENB_humidity,MUNCHENB_global_radiation,MUNCHENB_precipitation,MUNCHENB_sunshine,MUNCHENB_temp_mean,MUNCHENB_temp_min,MUNCHENB_temp_max,OSLO_cloud_cover,OSLO_humidity,OSLO_pressure,OSLO_global_radiation,OSLO_precipitation,OSLO_sunshine,OSLO_temp_mean,OSLO_temp_min,OSLO_temp_max,SONNBLICK_cloud_cover,SONNBLICK_humidity,SONNBLICK_pressure,SONNBLICK_global_radiation,SONNBLICK_precipitation,SONNBLICK_sunshine,SONNBLICK_temp_mean,SONNBLICK_temp_min,SONNBLICK_temp_max,STOCKHOLM_cloud_cover,STOCKHOLM_pressure,STOCKHOLM_global_radiation,STOCKHOLM_precipitation,STOCKHOLM_sunshine,STOCKHOLM_temp_mean,STOCKHOLM_temp_min,STOCKHOLM_temp_max,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,0.85,1.02,0.32,0.09,0.70,6.50,0.80,10.90,1,0.81,1.02,0.88,0.00,7.00,3.70,-0.90,7.90,4,0.67,1.02,0.44,0.01,2.30,2.40,-0.40,5.10,7,0.85,1.00,0.07,0.25,0.00,9.30,7.40,11.00,8,0.83,1.02,0.12,0.08,0.00,10.00,7.00,11.50,7,0.91,1.00,0.13,0.22,0.00,10.60,9.40,8.30,0.82,1.01,0.28,0.48,1.60,7.90,3.90,9.40,8,1.00,1.02,0.20,0.00,0.00,-0.60,-1.90,0.50,7,0.83,1.01,0.22,0.32,1.00,9.50,8.50,11.10,6,0.92,1.03,0.53,0.00,1.40,7.60,4.40,10.80,5,0.67,0.20,0.10,0.00,6.90,1.10,10.40,8,0.98,1.00,0.04,1.14,0.00,4.90,3.80,5.90,4,0.73,1.03,0.48,0.01,2.30,-5.90,-8.50,-3.20,5,1.01,0.05,0.32,0.00,4.20,2.20,4.90,5,0.88,1.00,0.45,0.34,4.70,8.50,6.00,10.90
1,19600102,1,6,0.84,1.02,0.36,1.05,1.10,6.10,3.30,10.10,6,0.84,1.02,0.25,0.00,0.00,2.90,2.20,4.40,4,0.67,1.02,0.18,0.31,0.00,2.30,1.40,3.10,8,0.90,1.01,0.14,0.06,0.10,7.70,6.40,8.30,8,0.89,1.02,0.18,0.66,0.50,8.20,7.40,11.00,7,0.98,1.01,0.13,0.23,0.00,6.10,3.90,10.60,0.86,1.01,0.12,0.27,0.00,7.70,6.80,9.10,6,0.94,1.02,0.56,0.13,3.20,2.10,-1.30,5.50,8,0.92,1.01,0.17,1.34,0.40,8.60,7.50,9.90,7,0.86,1.03,0.46,0.00,0.90,9.80,7.40,12.20,6,0.72,0.61,0.30,5.10,6.20,4.20,10.20,8,0.62,1.01,0.04,0.00,0.00,3.40,2.80,4.90,6,0.97,1.03,0.21,0.61,0.00,-9.50,-10.50,-8.50,5,1.01,0.05,0.06,0.00,4.00,3.00,5.00,7,0.91,1.00,0.25,0.84,0.70,8.90,5.60,12.10
2,19600103,1,8,0.90,1.02,0.18,0.30,0.00,8.50,5.10,9.90,6,0.77,1.02,0.67,0.00,3.50,3.10,-0.50,6.40,4,0.67,1.02,0.30,0.00,0.60,2.70,1.70,5.30,6,0.92,1.02,0.28,0.01,3.00,6.80,4.60,9.90,7,0.95,1.02,0.12,0.07,0.00,7.10,6.90,9.10,8,0.96,1.02,0.15,0.07,0.10,8.40,6.10,12.20,0.91,1.01,0.12,0.60,0.00,6.50,6.00,8.00,8,0.96,1.02,0.20,0.12,0.00,4.60,0.90,6.30,7,0.97,1.02,0.12,0.46,0.00,6.90,5.50,9.90,5,0.90,1.03,0.63,0.00,2.30,8.60,6.40,10.80,6,0.91,0.20,0.30,0.00,5.80,4.00,8.00,8,0.69,1.02,0.04,0.08,0.00,1.90,0.60,3.10,8,0.93,1.03,0.21,3.20,0.00,-9.50,-10.00,-8.90,5,1.01,0.05,0.02,0.00,2.40,1.30,4.10,7,0.91,1.01,0.17,0.08,0.10,10.50,8.10,12.90
3,19600104,1,3,0.92,1.02,0.58,0.00,4.10,6.30,3.80,10.60,8,0.93,1.03,0.25,0.00,0.00,2.00,-2.00,3.00,4,0.67,1.02,0.19,0.00,0.00,2.00,0.40,4.40,8,0.95,1.03,0.08,0.09,0.00,6.70,3.60,10.10,8,0.86,1.02,0.12,0.02,0.00,6.80,3.60,8.00,8,0.98,1.02,0.13,0.00,0.00,9.40,6.70,8.90,0.87,1.03,0.12,0.00,0.00,5.80,5.20,6.50,6,0.94,1.02,0.49,0.00,2.20,3.20,1.00,7.00,7,0.89,1.03,0.16,0.00,0.30,7.00,3.00,10.00,0,0.75,1.03,1.16,0.00,8.70,10.30,4.50,16.10,6,0.90,0.20,0.01,0.00,3.90,3.20,5.40,8,0.98,1.02,0.04,0.35,0.00,3.00,0.40,4.90,5,0.93,1.04,0.22,1.10,0.00,-11.50,-12.90,-10.00,5,1.01,0.05,0.00,0.00,1.20,0.40,2.30,7,0.86,1.02,0.13,0.98,0.00,7.40,7.30,10.60
4,19600105,1,6,0.95,1.02,0.65,0.14,5.40,3.00,-0.70,6.00,8,0.99,1.03,0.25,0.06,0.00,2.00,0.70,2.80,4,0.67,1.02,0.19,0.00,0.00,2.50,1.10,5.30,6,0.90,1.02,0.04,0.39,0.00,8.00,2.40,11.20,7,0.92,1.02,0.12,0.62,0.00,7.70,6.20,11.00,5,0.84,1.03,0.30,0.00,2.10,8.90,8.90,7.20,0.86,1.03,0.13,0.71,0.00,5.40,3.70,6.00,7,0.94,1.02,0.20,0.00,0.00,3.60,0.40,4.80,7,0.92,1.03,0.12,0.56,0.00,8.10,2.50,11.10,2,0.64,1.03,1.10,0.00,7.80,12.10,8.20,16.00,5,0.85,0.65,0.96,5.60,1.80,-3.00,6.00,8,0.96,1.01,0.05,0.26,0.00,3.70,2.90,4.90,2,0.75,1.04,0.72,0.01,6.10,-9.30,-12.00,-6.50,5,1.01,0.05,1.32,0.00,3.30,0.80,4.30,3,0.80,1.03,0.46,0.00,5.70,5.70,3.00,8.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,20221027,10,1,0.79,1.02,1.34,0.22,7.70,15.90,11.40,21.40,2,0.68,1.03,1.57,0.18,5.70,18.20,12.10,24.40,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,0.84,1.02,1.13,0.22,2.80,15.70,12.80,19.40,8,0.75,1.02,1.13,0.20,6.40,17.80,13.60,21.40,5,0.87,1.01,1.18,0.16,1.90,16.40,11.90,18.90,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,4,0.80,1.03,1.35,0.37,5.90,14.70,12.10,21.10,8,0.67,1.02,1.17,0.20,5.30,18.60,14.10,22.60,8,0.52,1.02,1.89,0.12,5.30,20.00,16.20,23.90,2,0.76,1.37,0.26,9.70,14.30,8.30,22.20,8,0.98,1.01,1.06,0.21,0.10,9.70,5.80,12.00,2,0.84,1.03,1.56,0.47,4.70,0.60,-1.40,2.60,5,1.02,1.11,0.14,3.20,11.50,8.20,14.20,5,0.82,1.01,1.13,0.41,3.40,10.70,7.90,13.50
22946,20221028,10,6,0.77,1.02,1.34,0.22,5.40,16.70,14.30,21.90,0,0.68,1.03,1.57,0.18,5.70,15.90,10.60,21.20,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,0.84,1.02,1.13,0.22,3.50,16.00,10.30,20.50,7,0.71,1.02,1.13,0.20,4.90,19.40,15.40,23.90,4,0.82,1.02,1.18,0.16,4.20,15.80,12.70,21.80,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,3,0.82,1.03,1.35,0.37,4.50,12.90,9.80,19.80,7,0.70,1.02,1.17,0.20,5.00,18.90,15.80,23.50,8,0.51,1.02,1.89,0.12,3.90,19.10,14.70,23.50,6,0.70,1.37,0.26,7.70,16.10,8.90,26.10,8,1.00,1.01,1.06,0.21,0.00,10.90,8.80,11.70,5,0.84,1.03,1.56,0.47,4.70,2.30,0.60,4.00,5,1.01,1.11,0.14,0.80,12.50,11.00,14.30,5,0.82,1.01,1.13,0.41,3.40,10.70,7.90,13.50
22947,20221029,10,4,0.76,1.02,1.34,0.22,6.10,16.70,13.10,22.40,2,0.68,1.03,1.57,0.18,5.70,13.40,8.60,18.20,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,0.86,1.02,1.13,0.22,3.30,15.80,9.30,21.10,8,0.73,1.02,1.13,0.20,4.00,18.20,13.40,22.00,7,0.85,1.01,1.18,0.16,4.20,16.50,11.20,17.00,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,3,0.81,1.03,1.35,0.37,5.10,13.20,10.20,20.70,8,0.69,1.02,1.17,0.20,3.20,18.20,13.70,24.30,8,0.46,1.02,1.89,0.12,8.10,19.00,15.40,22.60,7,0.64,1.37,0.26,6.80,17.40,11.20,26.20,3,0.85,1.01,1.06,0.21,6.80,9.70,7.70,14.20,3,0.84,1.03,1.56,0.47,4.70,3.30,2.10,4.50,5,1.01,1.11,0.14,6.90,13.10,12.10,14.40,5,0.82,1.01,1.13,0.41,3.40,10.70,7.90,13.50
22948,20221030,10,5,0.80,1.02,1.34,0.22,5.80,15.40,11.60,21.10,1,0.68,1.02,1.57,0.18,5.70,15.00,9.10,20.90,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,0.87,1.02,1.13,0.22,6.00,14.40,10.30,20.20,7,0.73,1.02,1.13,0.20,6.90,16.70,11.90,21.10,5,0.86,1.01,1.18,0.16,0.60,15.20,13.40,17.50,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,3,0.77,1.02,1.35,0.37,5.70,14.00,10.00,23.10,8,0.73,1.02,1.17,0.20,6.80,16.30,12.80,21.40,5,0.66,1.02,1.89,0.12,3.10,15.70,13.10,18.30,6,0.75,1.37,0.26,8.30,14.50,9.20,23.50,5,0.94,1.01,1.06,0.21,2.90,5.90,2.10,8.10,3,0.84,1.03,1.56,0.47,4.70,3.40,2.70,4.10,5,1.02,1.11,0.14,8.40,7.50,5.10,12.40,5,0.82,1.01,1.13,0.41,3.40,10.70,7.90,13.50


In [401]:
# Check for missing values 

#(Ljubljana is near Kassel, Sonnblick is near Munchen, and Olso is close enough to Stockholm)

#(all weather stations should have 9 observations in final 'weather-data-cleaned')

#### We still have 1 missing entry for each the following observations: cloud_cover, humidity, and pressure. Let's find which specific stations are missing.

In [404]:
# Create a list of all unique station names in the dataset

all_stations = set([col.split('_')[0] for col in df.columns if '_' in col])
all_stations

{'BASEL',
 'BELGRADE',
 'BUDAPEST',
 'DEBILT',
 'DUSSELDORF',
 'HEATHROW',
 'KASSEL',
 'LJUBLJANA',
 'MAASTRICHT',
 'MADRID',
 'MUNCHENB',
 'OSLO',
 'SONNBLICK',
 'STOCKHOLM',
 'VALENTIA'}

In [406]:
observation_types = ['cloud_cover', 'humidity', 'pressure']

missing_stations_by_observation = {}

for obs in observation_types:
    # Select columns related to the current observation type
    columns = [col for col in df.columns if col.endswith(obs)]
    
    # Extract station names by removing the observation type from the column names
    station_names = set([col.replace(f'_{obs}', '') for col in columns])
    
    # Identify stations that are in all_stations but missing from the current observation type
    missing_stations = all_stations - station_names
    
    # Store the missing station names in the dictionary
    missing_stations_by_observation[obs] = missing_stations

# Print the missing station names for each observation type
for obs, missing_stations in missing_stations_by_observation.items():
    print(f"\nStations missing from {obs}:")
    if missing_stations:
        for station in missing_stations:
            print(station)
    else:
        print("None")


Stations missing from cloud_cover:
KASSEL

Stations missing from humidity:
STOCKHOLM

Stations missing from pressure:
MUNCHENB


In [408]:
# Fill in missing values with nearby stations. (#KASSEL_cloud_cover, MUNCHENB_pressure, and STOCKHOLM_humidity)

# Deriving and filling the new columns
df['KASSEL_cloud_cover'] = df['LJUBLJANA_cloud_cover']
df['MUNCHENB_pressure'] = df['SONNBLICK_pressure']
df['STOCKHOLM_humidity'] = df['OSLO_humidity']

In [410]:
print(df.columns)


Index(['DATE', 'MONTH', 'BASEL_cloud_cover', 'BASEL_humidity',
       'BASEL_pressure', 'BASEL_global_radiation', 'BASEL_precipitation',
       'BASEL_sunshine', 'BASEL_temp_mean', 'BASEL_temp_min',
       ...
       'VALENTIA_pressure', 'VALENTIA_global_radiation',
       'VALENTIA_precipitation', 'VALENTIA_sunshine', 'VALENTIA_temp_mean',
       'VALENTIA_temp_min', 'VALENTIA_temp_max', 'KASSEL_cloud_cover',
       'MUNCHENB_pressure', 'STOCKHOLM_humidity'],
      dtype='object', length=137)


In [412]:
# Export cleaned dataset with DATE column
df.to_csv(os.path.join(path, '02 Data', 'Unsupervised', 'weather-data-cleaned-date.csv'), index=False)

In [413]:
# Drop DATE
df = df.drop(['DATE', 'MONTH'], axis=1)

df #as preferred, the data is now ready for keras modeling with (22950 rows x 135 columns).

Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,BELGRADE_humidity,BELGRADE_pressure,BELGRADE_global_radiation,BELGRADE_precipitation,BELGRADE_sunshine,BELGRADE_temp_mean,BELGRADE_temp_min,BELGRADE_temp_max,BUDAPEST_cloud_cover,BUDAPEST_humidity,BUDAPEST_pressure,BUDAPEST_global_radiation,BUDAPEST_precipitation,BUDAPEST_sunshine,BUDAPEST_temp_mean,BUDAPEST_temp_min,BUDAPEST_temp_max,DEBILT_cloud_cover,DEBILT_humidity,DEBILT_pressure,DEBILT_global_radiation,DEBILT_precipitation,DEBILT_sunshine,DEBILT_temp_mean,DEBILT_temp_min,DEBILT_temp_max,DUSSELDORF_cloud_cover,DUSSELDORF_humidity,DUSSELDORF_pressure,DUSSELDORF_global_radiation,DUSSELDORF_precipitation,DUSSELDORF_sunshine,DUSSELDORF_temp_mean,DUSSELDORF_temp_min,DUSSELDORF_temp_max,HEATHROW_cloud_cover,HEATHROW_humidity,HEATHROW_pressure,HEATHROW_global_radiation,HEATHROW_precipitation,HEATHROW_sunshine,HEATHROW_temp_mean,HEATHROW_temp_min,HEATHROW_temp_max,KASSEL_humidity,KASSEL_pressure,KASSEL_global_radiation,KASSEL_precipitation,KASSEL_sunshine,KASSEL_temp_mean,KASSEL_temp_min,KASSEL_temp_max,LJUBLJANA_cloud_cover,LJUBLJANA_humidity,LJUBLJANA_pressure,LJUBLJANA_global_radiation,LJUBLJANA_precipitation,LJUBLJANA_sunshine,LJUBLJANA_temp_mean,LJUBLJANA_temp_min,LJUBLJANA_temp_max,MAASTRICHT_cloud_cover,MAASTRICHT_humidity,MAASTRICHT_pressure,MAASTRICHT_global_radiation,MAASTRICHT_precipitation,MAASTRICHT_sunshine,MAASTRICHT_temp_mean,MAASTRICHT_temp_min,MAASTRICHT_temp_max,MADRID_cloud_cover,MADRID_humidity,MADRID_pressure,MADRID_global_radiation,MADRID_precipitation,MADRID_sunshine,MADRID_temp_mean,MADRID_temp_min,MADRID_temp_max,MUNCHENB_cloud_cover,MUNCHENB_humidity,MUNCHENB_global_radiation,MUNCHENB_precipitation,MUNCHENB_sunshine,MUNCHENB_temp_mean,MUNCHENB_temp_min,MUNCHENB_temp_max,OSLO_cloud_cover,OSLO_humidity,OSLO_pressure,OSLO_global_radiation,OSLO_precipitation,OSLO_sunshine,OSLO_temp_mean,OSLO_temp_min,OSLO_temp_max,SONNBLICK_cloud_cover,SONNBLICK_humidity,SONNBLICK_pressure,SONNBLICK_global_radiation,SONNBLICK_precipitation,SONNBLICK_sunshine,SONNBLICK_temp_mean,SONNBLICK_temp_min,SONNBLICK_temp_max,STOCKHOLM_cloud_cover,STOCKHOLM_pressure,STOCKHOLM_global_radiation,STOCKHOLM_precipitation,STOCKHOLM_sunshine,STOCKHOLM_temp_mean,STOCKHOLM_temp_min,STOCKHOLM_temp_max,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max,KASSEL_cloud_cover,MUNCHENB_pressure,STOCKHOLM_humidity
0,7,0.85,1.02,0.32,0.09,0.70,6.50,0.80,10.90,1,0.81,1.02,0.88,0.00,7.00,3.70,-0.90,7.90,4,0.67,1.02,0.44,0.01,2.30,2.40,-0.40,5.10,7,0.85,1.00,0.07,0.25,0.00,9.30,7.40,11.00,8,0.83,1.02,0.12,0.08,0.00,10.00,7.00,11.50,7,0.91,1.00,0.13,0.22,0.00,10.60,9.40,8.30,0.82,1.01,0.28,0.48,1.60,7.90,3.90,9.40,8,1.00,1.02,0.20,0.00,0.00,-0.60,-1.90,0.50,7,0.83,1.01,0.22,0.32,1.00,9.50,8.50,11.10,6,0.92,1.03,0.53,0.00,1.40,7.60,4.40,10.80,5,0.67,0.20,0.10,0.00,6.90,1.10,10.40,8,0.98,1.00,0.04,1.14,0.00,4.90,3.80,5.90,4,0.73,1.03,0.48,0.01,2.30,-5.90,-8.50,-3.20,5,1.01,0.05,0.32,0.00,4.20,2.20,4.90,5,0.88,1.00,0.45,0.34,4.70,8.50,6.00,10.90,8,1.03,0.98
1,6,0.84,1.02,0.36,1.05,1.10,6.10,3.30,10.10,6,0.84,1.02,0.25,0.00,0.00,2.90,2.20,4.40,4,0.67,1.02,0.18,0.31,0.00,2.30,1.40,3.10,8,0.90,1.01,0.14,0.06,0.10,7.70,6.40,8.30,8,0.89,1.02,0.18,0.66,0.50,8.20,7.40,11.00,7,0.98,1.01,0.13,0.23,0.00,6.10,3.90,10.60,0.86,1.01,0.12,0.27,0.00,7.70,6.80,9.10,6,0.94,1.02,0.56,0.13,3.20,2.10,-1.30,5.50,8,0.92,1.01,0.17,1.34,0.40,8.60,7.50,9.90,7,0.86,1.03,0.46,0.00,0.90,9.80,7.40,12.20,6,0.72,0.61,0.30,5.10,6.20,4.20,10.20,8,0.62,1.01,0.04,0.00,0.00,3.40,2.80,4.90,6,0.97,1.03,0.21,0.61,0.00,-9.50,-10.50,-8.50,5,1.01,0.05,0.06,0.00,4.00,3.00,5.00,7,0.91,1.00,0.25,0.84,0.70,8.90,5.60,12.10,6,1.03,0.62
2,8,0.90,1.02,0.18,0.30,0.00,8.50,5.10,9.90,6,0.77,1.02,0.67,0.00,3.50,3.10,-0.50,6.40,4,0.67,1.02,0.30,0.00,0.60,2.70,1.70,5.30,6,0.92,1.02,0.28,0.01,3.00,6.80,4.60,9.90,7,0.95,1.02,0.12,0.07,0.00,7.10,6.90,9.10,8,0.96,1.02,0.15,0.07,0.10,8.40,6.10,12.20,0.91,1.01,0.12,0.60,0.00,6.50,6.00,8.00,8,0.96,1.02,0.20,0.12,0.00,4.60,0.90,6.30,7,0.97,1.02,0.12,0.46,0.00,6.90,5.50,9.90,5,0.90,1.03,0.63,0.00,2.30,8.60,6.40,10.80,6,0.91,0.20,0.30,0.00,5.80,4.00,8.00,8,0.69,1.02,0.04,0.08,0.00,1.90,0.60,3.10,8,0.93,1.03,0.21,3.20,0.00,-9.50,-10.00,-8.90,5,1.01,0.05,0.02,0.00,2.40,1.30,4.10,7,0.91,1.01,0.17,0.08,0.10,10.50,8.10,12.90,8,1.03,0.69
3,3,0.92,1.02,0.58,0.00,4.10,6.30,3.80,10.60,8,0.93,1.03,0.25,0.00,0.00,2.00,-2.00,3.00,4,0.67,1.02,0.19,0.00,0.00,2.00,0.40,4.40,8,0.95,1.03,0.08,0.09,0.00,6.70,3.60,10.10,8,0.86,1.02,0.12,0.02,0.00,6.80,3.60,8.00,8,0.98,1.02,0.13,0.00,0.00,9.40,6.70,8.90,0.87,1.03,0.12,0.00,0.00,5.80,5.20,6.50,6,0.94,1.02,0.49,0.00,2.20,3.20,1.00,7.00,7,0.89,1.03,0.16,0.00,0.30,7.00,3.00,10.00,0,0.75,1.03,1.16,0.00,8.70,10.30,4.50,16.10,6,0.90,0.20,0.01,0.00,3.90,3.20,5.40,8,0.98,1.02,0.04,0.35,0.00,3.00,0.40,4.90,5,0.93,1.04,0.22,1.10,0.00,-11.50,-12.90,-10.00,5,1.01,0.05,0.00,0.00,1.20,0.40,2.30,7,0.86,1.02,0.13,0.98,0.00,7.40,7.30,10.60,6,1.04,0.98
4,6,0.95,1.02,0.65,0.14,5.40,3.00,-0.70,6.00,8,0.99,1.03,0.25,0.06,0.00,2.00,0.70,2.80,4,0.67,1.02,0.19,0.00,0.00,2.50,1.10,5.30,6,0.90,1.02,0.04,0.39,0.00,8.00,2.40,11.20,7,0.92,1.02,0.12,0.62,0.00,7.70,6.20,11.00,5,0.84,1.03,0.30,0.00,2.10,8.90,8.90,7.20,0.86,1.03,0.13,0.71,0.00,5.40,3.70,6.00,7,0.94,1.02,0.20,0.00,0.00,3.60,0.40,4.80,7,0.92,1.03,0.12,0.56,0.00,8.10,2.50,11.10,2,0.64,1.03,1.10,0.00,7.80,12.10,8.20,16.00,5,0.85,0.65,0.96,5.60,1.80,-3.00,6.00,8,0.96,1.01,0.05,0.26,0.00,3.70,2.90,4.90,2,0.75,1.04,0.72,0.01,6.10,-9.30,-12.00,-6.50,5,1.01,0.05,1.32,0.00,3.30,0.80,4.30,3,0.80,1.03,0.46,0.00,5.70,5.70,3.00,8.40,7,1.04,0.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,1,0.79,1.02,1.34,0.22,7.70,15.90,11.40,21.40,2,0.68,1.03,1.57,0.18,5.70,18.20,12.10,24.40,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,0.84,1.02,1.13,0.22,2.80,15.70,12.80,19.40,8,0.75,1.02,1.13,0.20,6.40,17.80,13.60,21.40,5,0.87,1.01,1.18,0.16,1.90,16.40,11.90,18.90,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,4,0.80,1.03,1.35,0.37,5.90,14.70,12.10,21.10,8,0.67,1.02,1.17,0.20,5.30,18.60,14.10,22.60,8,0.52,1.02,1.89,0.12,5.30,20.00,16.20,23.90,2,0.76,1.37,0.26,9.70,14.30,8.30,22.20,8,0.98,1.01,1.06,0.21,0.10,9.70,5.80,12.00,2,0.84,1.03,1.56,0.47,4.70,0.60,-1.40,2.60,5,1.02,1.11,0.14,3.20,11.50,8.20,14.20,5,0.82,1.01,1.13,0.41,3.40,10.70,7.90,13.50,4,1.03,0.98
22946,6,0.77,1.02,1.34,0.22,5.40,16.70,14.30,21.90,0,0.68,1.03,1.57,0.18,5.70,15.90,10.60,21.20,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,0.84,1.02,1.13,0.22,3.50,16.00,10.30,20.50,7,0.71,1.02,1.13,0.20,4.90,19.40,15.40,23.90,4,0.82,1.02,1.18,0.16,4.20,15.80,12.70,21.80,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,3,0.82,1.03,1.35,0.37,4.50,12.90,9.80,19.80,7,0.70,1.02,1.17,0.20,5.00,18.90,15.80,23.50,8,0.51,1.02,1.89,0.12,3.90,19.10,14.70,23.50,6,0.70,1.37,0.26,7.70,16.10,8.90,26.10,8,1.00,1.01,1.06,0.21,0.00,10.90,8.80,11.70,5,0.84,1.03,1.56,0.47,4.70,2.30,0.60,4.00,5,1.01,1.11,0.14,0.80,12.50,11.00,14.30,5,0.82,1.01,1.13,0.41,3.40,10.70,7.90,13.50,3,1.03,1.00
22947,4,0.76,1.02,1.34,0.22,6.10,16.70,13.10,22.40,2,0.68,1.03,1.57,0.18,5.70,13.40,8.60,18.20,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,0.86,1.02,1.13,0.22,3.30,15.80,9.30,21.10,8,0.73,1.02,1.13,0.20,4.00,18.20,13.40,22.00,7,0.85,1.01,1.18,0.16,4.20,16.50,11.20,17.00,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,3,0.81,1.03,1.35,0.37,5.10,13.20,10.20,20.70,8,0.69,1.02,1.17,0.20,3.20,18.20,13.70,24.30,8,0.46,1.02,1.89,0.12,8.10,19.00,15.40,22.60,7,0.64,1.37,0.26,6.80,17.40,11.20,26.20,3,0.85,1.01,1.06,0.21,6.80,9.70,7.70,14.20,3,0.84,1.03,1.56,0.47,4.70,3.30,2.10,4.50,5,1.01,1.11,0.14,6.90,13.10,12.10,14.40,5,0.82,1.01,1.13,0.41,3.40,10.70,7.90,13.50,3,1.03,0.85
22948,5,0.80,1.02,1.34,0.22,5.80,15.40,11.60,21.10,1,0.68,1.02,1.57,0.18,5.70,15.00,9.10,20.90,4,0.67,1.02,1.41,0.14,5.40,11.70,7.90,16.20,8,0.87,1.02,1.13,0.22,6.00,14.40,10.30,20.20,7,0.73,1.02,1.13,0.20,6.90,16.70,11.90,21.10,5,0.86,1.01,1.18,0.16,0.60,15.20,13.40,17.50,0.77,1.02,1.14,0.19,4.00,9.10,5.40,13.10,3,0.77,1.02,1.35,0.37,5.70,14.00,10.00,23.10,8,0.73,1.02,1.17,0.20,6.80,16.30,12.80,21.40,5,0.66,1.02,1.89,0.12,3.10,15.70,13.10,18.30,6,0.75,1.37,0.26,8.30,14.50,9.20,23.50,5,0.94,1.01,1.06,0.21,2.90,5.90,2.10,8.10,3,0.84,1.03,1.56,0.47,4.70,3.40,2.70,4.10,5,1.02,1.11,0.14,8.40,7.50,5.10,12.40,5,0.82,1.01,1.13,0.41,3.40,10.70,7.90,13.50,3,1.03,0.94


In [416]:
# Export cleaned data set; 3 weather stations, time data, and 2 observations dropped;
df.to_csv(os.path.join(path, '02 Data', 'Unsupervised', 'weather-data-cleaned.csv'), index=False)

In [418]:
# Create a new 'outcomes' DataFrame without the 'DATE' column
outcomes = outcomes.drop(columns='DATE')
outcomes

Unnamed: 0,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22946,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22947,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [420]:
# Export pleasant weather outcomes without date;
outcomes.to_csv(os.path.join(path, '02 Data', 'Unsupervised', 'outcomes-cleaned.csv'), index=False)

---------------------------------------------------------------------------------------------------------------------------
## 4. Data preprocessing

In [423]:
# List of weather stations prefixes/ need help defining these as we have 135 total observations of 15 weather station groups
#by sets of 9 observations per group.
weather = [
    'BASEL_',
    'BELGRADE_',
    'BUDAPEST_',
    'DEBILT_',
    'DUSSELDORF_',
    'HEATHROW_',
    'KASSEL_',
    'LJUBLJANA_',
    'MAASTRICHT_',
    'MADRID_',
    'MUNCHENB_',
    'OSLO_',
    'SONNBLICK_',
    'STOCKHOLM_',
    'VALENTIA_'
]

In [425]:
# Dictionary mapping outcomes to weather stations for confusion matrix
weather_outcomes = {
    0: 'BASEL_pleasant_weather',
    1: 'BELGRADE_pleasant_weather',
    2: 'BUDAPEST_pleasant_weather',
    3: 'DEBILT_pleasant_weather',
    4: 'DUSSELDORF_pleasant_weather',
    5: 'HEATHROW_pleasant_weather',
    6: 'KASSEL_pleasant_weather',
    7: 'LJUBLJANA_pleasant_weather',
    8: 'MAASTRICHT_pleasant_weather',
    9: 'MADRID_pleasant_weather',
    10: 'MUNCHENB_pleasant_weather',
    11: 'OSLO_pleasant_weather',
    12: 'SONNBLICK_pleasant_weather',
    13: 'STOCKHOLM_pleasant_weather',
    14: 'VALENTIA_pleasant_weather'
}

In [427]:
outcomes.shape


(22950, 15)

In [429]:
df.shape

(22950, 135)

In [431]:
# Load data, process it, and format it appropriately for training a machine learning model.

In [433]:
# Reshape df
X = df.values.reshape(-1, 15, 9)

# Ensure the labels are in the same shape as X
y = outcomes.values.reshape(-1, 15)

def _count_classes(y):
    return len(set([tuple(category) for category in y]))

In [435]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [437]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(18360, 15, 9) (18360, 15)
(4590, 15, 9) (4590, 15)


In [439]:
X_train

array([[[ 7.0000e+00,  7.2000e-01,  1.0204e+00, ...,  2.1200e+01,
          1.7600e+01,  2.5800e+01],
        [ 3.0000e+00,  4.9000e-01,  1.0182e+00, ...,  2.3000e+01,
          1.3800e+01,  2.8300e+01],
        [ 7.0000e+00,  6.4000e-01,  1.0173e+00, ...,  1.9500e+01,
          1.5500e+01,  2.3200e+01],
        ...,
        [ 1.0329e+00,  1.6500e+00,  8.2000e-01, ...,  5.3000e+00,
          6.0000e+00,  1.0090e+00],
        [ 1.8000e+00,  1.1000e-01,  5.6000e+00, ...,  7.0000e+00,
          8.8000e-01,  1.0138e+00],
        [ 1.3900e+00,  1.0000e-02,  1.7000e+00, ...,  3.0000e+00,
          1.0329e+00,  6.5000e-01]],

       [[ 2.0000e+00,  6.2000e-01,  1.0248e+00, ...,  1.8500e+01,
          1.0700e+01,  2.5600e+01],
        [ 0.0000e+00,  4.8000e-01,  1.0259e+00, ...,  1.9000e+01,
          1.4000e+01,  2.4000e+01],
        [ 3.0000e+00,  5.2000e-01,  1.0256e+00, ...,  1.8300e+01,
          1.2800e+01,  2.3200e+01],
        ...,
        [ 1.0404e+00,  3.7600e+00,  0.0000e+00, ...,  

In [441]:
len(X_train[0])

15

In [443]:
len(X_train[0][0])

9

In [445]:
len(y_train[0])

15

---------------------------------------------------------------------------------------------------------------------------
## 5. Build and run CNN keras model

In [448]:
# Adjust model hyperparameters
epochs = 50
batch_size = 256
n_hidden = 256
kernel_size = 4 # Adjust the kernel size here

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

# Implement complex layers
model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=kernel_size, activation='relu', input_shape=(timesteps, input_dim)))  # Use the adjusted kernel size
model.add(Dense(256, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='softmax')) #relu #sigmoid #tanh #softmax

# Build model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Run model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)

Epoch 1/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.1325 - loss: 36.0365
Epoch 2/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.1073 - loss: 2167.7600
Epoch 3/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.0997 - loss: 19381.2754
Epoch 4/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.0915 - loss: 32101.5723
Epoch 5/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.0822 - loss: 65302.5781
Epoch 6/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.0891 - loss: 125746.1641
Epoch 7/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.1051 - loss: 154311.7188
Epoch 8/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.1097 - loss: 168397.9219
Epoch 9/50
[1m72

<keras.src.callbacks.history.History at 0x2a7680f0590>

- **Notes:** relatively low accuracy and exceptionally high loss, the model may be too simple to capture complex data patterns, preprocessing issues?, unscaled data approach, possible data architectural issues. 

---------------------------------------------------------------------------------------------------------------------------
## 6. Run confusion matrix
        - check accuracy and loss

In [452]:
def confusion_matrix(Y_true, Y_pred):
    Y_true = pd.Series([weather_outcomes[y] for y in np.argmax(Y_true, axis=1)])
    Y_pred = pd.Series([weather_outcomes[y] for y in np.argmax(Y_pred, axis=1)])

    return pd.crosstab(Y_true, Y_pred, rownames=['True'], colnames=['Pred'])

In [454]:
# Evaluate
print(confusion_matrix(y_test, model.predict(X_test))) #relatively low accuracy and high loss

[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Pred                         BASEL_pleasant_weather  \
True                                                  
BASEL_pleasant_weather                           15   
BELGRADE_pleasant_weather                         0   
BUDAPEST_pleasant_weather                         0   
DEBILT_pleasant_weather                           0   
DUSSELDORF_pleasant_weather                       0   
HEATHROW_pleasant_weather                         0   
KASSEL_pleasant_weather                           0   
LJUBLJANA_pleasant_weather                        0   
MAASTRICHT_pleasant_weather                       0   
MADRID_pleasant_weather                           1   
MUNCHENB_pleasant_weather                         0   
OSLO_pleasant_weather                             0   
STOCKHOLM_pleasant_weather                        0   
VALENTIA_pleasant_weather                         0   

Pred                         BELGRADE_pleasa

In [456]:
model.summary()

---------------------------------------------------------------------------------------------------------------------------