In [1]:
#Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from statsmodels.graphics.tsaplots import plot_acf

In [2]:
# Names of the weather variables that are plotted, normalised and analysed below.
feature_columns = [
    'CLOUDINESS',
    'HUMIDITY',
    'MAX_TEMP',
    'MIN_TEMP',
    'MEAN_TEMP',
    'PRECIPITATION',
]

In [3]:
#Create time-series graphs
# Number of leading rows that are plotted for each supported timeframe.
_TIMEFRAME_ROWS = {
    '2_years': 733,
    '20_years': 7305,
    '60_years': 21917,
}

# y-axis label for every (preprocessing stage, feature) combination.
_Y_LABELS = {
    'before': {
        'CLOUDINESS': 'Cloud cover in oktas',
        'HUMIDITY': 'Humidity in %',
        'MAX_TEMP': 'Maximum temperature in 0.1 °C',
        'MIN_TEMP': 'Minimum temperature in 0.1 °C',
        'MEAN_TEMP': 'Mean temperature in 0.1 °C',
        'PRECIPITATION': 'Precipitation in 0.1 mm',
    },
    'after': {
        'CLOUDINESS': 'Normalized cloud cover',
        'HUMIDITY': 'Normalized humidity',
        'MAX_TEMP': 'Normalized maximum temperature',
        'MIN_TEMP': 'Normalized minimum temperature',
        'MEAN_TEMP': 'Normalized mean temperature',
        'PRECIPITATION': 'Normalized precipitation',
    },
}


def show_save_graph(dataset, feature, extra, timeframe = '2_years', preprocessing = 'before'):
    """Plot one feature against DATE and save the figure as a .jpg file.

    The figure is written to
    ``graphs/{extra}_{feature}_timeseries_{timeframe}_{preprocessing}.jpg``
    and closed afterwards; nothing is displayed.

    Parameters
    ----------
    dataset : DataFrame-like
        Must provide a 'DATE' column and a column named ``feature``.
    feature : str
        Column to plot; one of the six known weather features.
    extra : str
        Free-form tag placed at the start of the output file name.
    timeframe : str
        '2_years', '20_years' or '60_years'; selects how many leading rows
        are plotted.
    preprocessing : str
        'before' or 'after'; selects the y-axis label wording.

    Returns
    -------
    None
        For an unknown timeframe, preprocessing stage or feature a message
        is printed and the function returns without creating a figure or a
        file.
    """
    # Validate every argument before touching matplotlib so that a rejected
    # call neither leaves an open figure behind nor writes a file.
    time_var = _TIMEFRAME_ROWS.get(timeframe)
    if time_var is None:
        print('No correct timeframe given in')
        return

    stage_labels = _Y_LABELS.get(preprocessing)
    if stage_labels is None:
        print('Not indicated if before or after preprocessing')
        return

    y_label = stage_labels.get(feature)
    if y_label is None:
        print('No correct feature given in')
        return

    fig, ax = plt.subplots(figsize = (10,3))
    try:
        # Positional slice: the first `time_var` rows, whatever the index labels are.
        ax.plot(dataset['DATE'].iloc[:time_var], dataset[feature].iloc[:time_var])
        ax.set_title(f'{feature} timeseries')
        ax.set_xlabel('Date')
        ax.set_ylabel(y_label)
        ax.grid(True, linestyle = 'dotted')
        fig.savefig(f'graphs/{extra}_{feature}_timeseries_{timeframe}_{preprocessing}.jpg', bbox_inches = "tight")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

In [4]:
#Creating necessary folders
# makedirs(exist_ok=True) does nothing when the folder is already there, so no
# separate "does it exist?" lookup is needed and the cell can be re-run safely.
for folder in ('graphs', 'dataset'):
    os.makedirs(folder, exist_ok = True)

In [5]:
#Read in txt files
# All six files are parsed with the same options: the first 17 lines are
# skipped, and of the remaining lines the one at position 1 supplies the
# column names.
source_files = (
    'cloudiness.txt',
    'humidity.txt',
    'max_temp.txt',
    'min_temp.txt',
    'mean_temp.txt',
    'precipitation.txt',
)
cloudiness, humidity, max_temp, min_temp, mean_temp, precipitation = [
    pd.read_csv(path, skiprows = 17, header = 1) for path in source_files
]

In [6]:
# The first observation is missing in every file, so each frame starts at its second row.
cloudiness = cloudiness.iloc[1:]
humidity = humidity.iloc[1:]
max_temp = max_temp.iloc[1:]
min_temp = min_temp.iloc[1:]
mean_temp = mean_temp.iloc[1:]
precipitation = precipitation.iloc[1:]

In [7]:
# Keep only the date and the measurement: the STAID, SOUID and Q_* columns are discarded.
# The leading blank in each name is part of the header text as read from the files.
cloudiness = cloudiness.drop(columns = [' STAID', ' SOUID', ' Q_CC'])
humidity = humidity.drop(columns = [' STAID', ' SOUID', ' Q_HU'])
max_temp = max_temp.drop(columns = [' STAID', ' SOUID', ' Q_TX'])
min_temp = min_temp.drop(columns = [' STAID', ' SOUID', ' Q_TN'])
mean_temp = mean_temp.drop(columns = [' STAID', ' SOUID', ' Q_TG'])
precipitation = precipitation.drop(columns = [' STAID', ' SOUID', ' Q_RR'])

In [8]:
# Give the columns readable names; the padded keys are the raw header strings.
date_label = {'    DATE': 'DATE'}

cloudiness = cloudiness.rename(columns = {**date_label, '   CC': 'CLOUDINESS'})
humidity = humidity.rename(columns = {**date_label, '   HU': 'HUMIDITY'})
max_temp = max_temp.rename(columns = {**date_label, '   TX': 'MAX_TEMP'})
min_temp = min_temp.rename(columns = {**date_label, '   TN': 'MIN_TEMP'})
mean_temp = mean_temp.rename(columns = {**date_label, '   TG': 'MEAN_TEMP'})
precipitation = precipitation.rename(columns = {**date_label, '   RR': 'PRECIPITATION'})

In [9]:
# Combine the six single-feature frames into one table, joined on DATE.
# The first join keeps its explicit suffixes; the remaining frames are joined one after another.
weather_df = cloudiness.merge(humidity, on = 'DATE', suffixes = ('_z', '_y'))
for next_frame in (max_temp, min_temp, mean_temp, precipitation):
    weather_df = weather_df.merge(next_frame, on = 'DATE')

In [10]:
# Convert the DATE column to datetime64.
# NOTE(review): the string cast suggests DATE is read as an integer such as
# 19510101 (YYYYMMDD) — confirm against the raw files. A format containing
# '/' does not match such text under strict format matching, so every
# non-digit character is removed first and the digits are parsed as %Y%m%d.
# This accepts '19510101', '1951/01/01' and, when the cell is re-run on an
# already converted column, '1951-01-01'.
date_digits = weather_df['DATE'].astype('str').str.replace(r'\D', '', regex = True)
weather_df['DATE'] = pd.to_datetime(date_digits, format = '%Y%m%d')

In [11]:
# Write the merged table to disk; the row index is left out of the file.
weather_df.to_csv(path_or_buf = 'dataset/dutch_weather.csv', index = False)

In [12]:
# Save one time-series graph per feature for the data as it was read in (before preprocessing).
time_frames = ['60_years']

for span in time_frames:
    for name in feature_columns:
        show_save_graph(
            dataset = weather_df,
            feature = name,
            extra = 'raw',
            timeframe = span,
            preprocessing = 'before',
        )

In [13]:
# Count number of missing values per column
# A missing observation is stored as -9999. The comparison is made on the same
# slice that is counted, so the mask and the data always have the same rows.
# NOTE(review): the row limit 21924 is kept from the original cell — confirm
# why it differs from the 21916 rows kept later.
missing_mask = weather_df.iloc[:21924] == -9999
print(missing_mask.sum(axis = 0))

#Use linear interpolation to fill in missing values
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
weather_df = weather_df.replace(-9999, np.nan)
weather_df['HUMIDITY'] = weather_df['HUMIDITY'].interpolate(method = 'linear')
weather_df['CLOUDINESS'] = weather_df['CLOUDINESS'].interpolate(method = 'linear')

DATE              0
CLOUDINESS        1
HUMIDITY         28
MAX_TEMP          0
MIN_TEMP          0
MEAN_TEMP         0
PRECIPITATION     0
dtype: int64


In [14]:
# One independent, empty list per feature to collect the windowed values.
(cloudiness_weekly, humidity_weekly, max_temp_weekly,
 min_temp_weekly, mean_temp_weekly, precipitation_weekly) = ([] for _ in range(6))

In [15]:
# Window weekly to amplify signal
# Mean over every run of 7 consecutive rows, one value per complete window.
# The lists are assigned rather than appended to, so re-running this cell
# cannot make them longer than the data.
# min_periods=1 averages the available values when a window contains NaN,
# which is what Series.mean() does on a slice.
window_length = 7
weekly_means = (
    weather_df[feature_columns]
    .rolling(window_length, min_periods = 1)
    .mean()
    .iloc[window_length - 1:]   # the first 6 rows have no complete window yet
)

cloudiness_weekly = weekly_means['CLOUDINESS'].tolist()
humidity_weekly = weekly_means['HUMIDITY'].tolist()
max_temp_weekly = weekly_means['MAX_TEMP'].tolist()
min_temp_weekly = weekly_means['MIN_TEMP'].tolist()
mean_temp_weekly = weekly_means['MEAN_TEMP'].tolist()
precipitation_weekly = weekly_means['PRECIPITATION'].tolist()

In [16]:
# Assemble the windowed values into a new frame.
# DATE starts at the 7th row, so every weekly mean is paired with the last day of its window.
weekly_values = {
    'CLOUDINESS': cloudiness_weekly,
    'HUMIDITY': humidity_weekly,
    'MAX_TEMP': max_temp_weekly,
    'MIN_TEMP': min_temp_weekly,
    'MEAN_TEMP': mean_temp_weekly,
    'PRECIPITATION': precipitation_weekly,
}
weather_dict = {'DATE': weather_df['DATE'].iloc[6:], **weekly_values}
weather_df_denoised = pd.DataFrame(weather_dict)

In [17]:
# Keep only the leading 21916 rows of both the raw and the denoised frame.
weather_df = weather_df.iloc[:21916]
weather_df_denoised = weather_df_denoised.iloc[:21916]

In [18]:
# Min-max normalisation: every feature column is rescaled with its own minimum and maximum.
def _min_max_scale(values):
    """Return (values - min) / (max - min) for one column."""
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest)


# assign() works on a copy, so the unnormalised frames stay untouched.
weather_df_norm = weather_df.assign(
    **{name: _min_max_scale(weather_df[name]) for name in feature_columns}
)
weather_df_norm_denoised = weather_df_denoised.assign(
    **{name: _min_max_scale(weather_df_denoised[name]) for name in feature_columns}
)

In [19]:
# Save the time-series graphs of the normalised data, once for the raw and once for the denoised frame.
time_frames = ['60_years']
tagged_frames = (('raw', weather_df_norm), ('denoised', weather_df_norm_denoised))

for span in time_frames:
    for name in feature_columns:
        for tag, frame in tagged_frames:
            show_save_graph(
                dataset = frame,
                feature = name,
                extra = tag,
                timeframe = span,
                preprocessing = 'after',
            )

In [20]:
# Autocorrelation plot per feature for the normalised, non-windowed data.
# The file name carries a 'raw' tag: without it these files have exactly the
# same names as the autocorrelation plots of the denoised data and get overwritten.
# NOTE(review): the file name mentions 50 timesteps, but no `lags` argument is
# passed to plot_acf — confirm the intended number of lags.
for column in feature_columns:
    fig = plot_acf(weather_df_norm[column])
    plt.title(f'Autocorrelation {column}')
    plt.xlabel('Lag')
    plt.ylabel('Autocorrelation')
    plt.savefig(f'graphs/autocorrelation_raw_{column}_50timesteps')
    plt.close(fig)

In [21]:
# Autocorrelation plot per feature for the normalised, weekly-windowed (denoised) data.
for column, series in weather_df_norm_denoised[feature_columns].items():
    plot_acf(series)
    plt.title(f'Autocorrelation {column}')
    plt.xlabel('Lag')
    plt.ylabel('Autocorrelation')
    target = f'graphs/autocorrelation_{column}_50timesteps'
    plt.savefig(target)
    plt.close()

In [22]:
# Write the normalised, non-windowed table to disk without the row index.
weather_df_norm.to_csv(path_or_buf = 'dataset/dutch_weather_norm_raw.csv', index = False)

In [23]:
# Write the normalised, weekly-windowed table to disk without the row index.
weather_df_norm_denoised.to_csv(path_or_buf = 'dataset/dutch_weather_norm_denoised.csv', index = False)