# Initialisation

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Create instances with all stations

In [2]:
daily_trains_demand_pre_covid = pd.read_csv('../data/curated/train_demand/daily_trains_demand_pre_covid.csv')
daily_trains_demand_post_covid = pd.read_csv('../data/curated/train_demand/daily_trains_demand_post_covid.csv')

In [3]:
daily_trains_demand_post_covid_weekday = daily_trains_demand_post_covid[(daily_trains_demand_post_covid['Weekday'] == 1)]
daily_trains_demand_post_covid_weekend = daily_trains_demand_post_covid[(daily_trains_demand_post_covid['Weekday'] == 0)]

In [4]:
# add rows together
mean_daily_trains_demand_post_covid_weekday = pd.DataFrame()

for id, station_data in daily_trains_demand_post_covid_weekday.drop(['Business_Date', 'Weekday', 'PublicHoliday', 'Unnamed: 0'], axis=1).groupby('Station_Name'):
    station_data = station_data.drop('Station_Name', axis=1)
    station_data_mean = station_data.mean()
    station_data_mean = pd.DataFrame(station_data_mean).T
    station_data_mean['Station_Name'] = id
    mean_daily_trains_demand_post_covid_weekday = pd.concat([mean_daily_trains_demand_post_covid_weekday, station_data_mean], axis=0)


mean_daily_trains_demand_post_covid_weekend = pd.DataFrame()

for id, station_data in daily_trains_demand_post_covid_weekend.drop(['Business_Date', 'Weekday', 'PublicHoliday', 'Unnamed: 0'], axis=1).groupby('Station_Name'):
    station_data = station_data.drop('Station_Name', axis=1)
    station_data_mean = station_data.mean()
    station_data_mean = pd.DataFrame(station_data_mean).T
    station_data_mean['Station_Name'] = id
    mean_daily_trains_demand_post_covid_weekend = pd.concat([mean_daily_trains_demand_post_covid_weekend, station_data_mean], axis=0)

In [5]:
mean_daily_trains_demand_post_covid_weekday['log_Total_Demand'] = np.log(mean_daily_trains_demand_post_covid_weekday['Total_Demand'])
mean_daily_trains_demand_post_covid_weekend['log_Total_Demand'] = np.log(mean_daily_trains_demand_post_covid_weekend['Total_Demand'])
mean_daily_trains_demand_post_covid_weekday['log_Passenger_Alightings'] = np.log(mean_daily_trains_demand_post_covid_weekday['Passenger_Alightings'])
mean_daily_trains_demand_post_covid_weekend['log_Passenger_Alightings'] = np.log(mean_daily_trains_demand_post_covid_weekend['Passenger_Alightings'])
mean_daily_trains_demand_post_covid_weekday['log_Passenger_Boardings'] = np.log(mean_daily_trains_demand_post_covid_weekday['Passenger_Boardings'])
mean_daily_trains_demand_post_covid_weekend['log_Passenger_Boardings'] = np.log(mean_daily_trains_demand_post_covid_weekend['Passenger_Boardings'])

In [6]:
def get_daily_demand_in_rows(demand_df: pd.DataFrame, feature: str):

    try:
        df_with_feature = demand_df[['Station_Name', 'Business_Date', feature]]
    except:
        df_with_feature = demand_df[['Station_Name', feature]]

    station_df_list = []

    for id, station_df in tqdm(df_with_feature.groupby('Station_Name')):

        station_df = station_df.rename({feature: f'{feature}_{id}'}, axis=1)
        station_df = station_df.drop('Station_Name', axis=1)
        station_df_list.append(station_df)

    for i, station_df in enumerate(station_df_list):
        if i == 0:
            merged_df = station_df
        else:
            try:
                merged_df = pd.merge(merged_df, station_df, on='Business_Date', how='outer')
            except:
                merged_df = pd.concat([merged_df, station_df], axis=0)

    merged_df = merged_df.fillna(0)
    return merged_df

In [7]:
log_demand_precovid = get_daily_demand_in_rows(daily_trains_demand_pre_covid, 'log_Total_Demand')
log_demand_postcovid = get_daily_demand_in_rows(daily_trains_demand_post_covid, 'log_Total_Demand')
log_alighting_precovid = get_daily_demand_in_rows(daily_trains_demand_pre_covid, 'log_Passenger_Alightings')
log_alighting_postcovid = get_daily_demand_in_rows(daily_trains_demand_post_covid, 'log_Passenger_Alightings')
log_boarding_precovid = get_daily_demand_in_rows(daily_trains_demand_pre_covid, 'log_Passenger_Boardings')
log_boarding_postcovid = get_daily_demand_in_rows(daily_trains_demand_post_covid, 'log_Passenger_Boardings')

100%|██████████| 222/222 [00:00<00:00, 2289.11it/s]
100%|██████████| 223/223 [00:00<00:00, 2556.96it/s]
100%|██████████| 222/222 [00:00<00:00, 2527.64it/s]
100%|██████████| 223/223 [00:00<00:00, 2573.19it/s]
100%|██████████| 222/222 [00:00<00:00, 2620.83it/s]
100%|██████████| 223/223 [00:00<00:00, 1718.73it/s]


In [8]:
mean_daily_trains_demand_post_covid_weekday = mean_daily_trains_demand_post_covid_weekday.sort_values('Station_Name')
mean_daily_trains_demand_post_covid_weekday = mean_daily_trains_demand_post_covid_weekday.set_index('Station_Name')

mean_daily_trains_demand_post_covid_weekend = mean_daily_trains_demand_post_covid_weekend.sort_values('Station_Name')
mean_daily_trains_demand_post_covid_weekend = mean_daily_trains_demand_post_covid_weekend.set_index('Station_Name')

In [9]:
def get_daily_demand_in_rows_inference(df, feature):
    return df[[feature]].T

In [10]:
log_mean_demand_postcovid_weekday = get_daily_demand_in_rows_inference(mean_daily_trains_demand_post_covid_weekday, 'log_Total_Demand')
log_mean_alighting_postcovid_weekday = get_daily_demand_in_rows_inference(mean_daily_trains_demand_post_covid_weekday, 'log_Passenger_Alightings')
log_mean_boarding_postcovid_weekday = get_daily_demand_in_rows_inference(mean_daily_trains_demand_post_covid_weekday, 'log_Passenger_Boardings')

log_mean_demand_postcovid_weekend = get_daily_demand_in_rows_inference(mean_daily_trains_demand_post_covid_weekend, 'log_Total_Demand')
log_mean_alighting_postcovid_weekend = get_daily_demand_in_rows_inference(mean_daily_trains_demand_post_covid_weekend, 'log_Passenger_Alightings')
log_mean_boarding_postcovid_weekend = get_daily_demand_in_rows_inference(mean_daily_trains_demand_post_covid_weekend, 'log_Passenger_Boardings')

In [11]:
os.makedirs('../data/curated/ML_features', exist_ok=True)

In [12]:
# log_demand_precovid.to_csv('../data/curated/ML_features/log_demand_precovid.csv', index=False)
# log_demand_postcovid.to_csv('../data/curated/ML_features/log_demand_postcovid.csv', index=False)
# log_alighting_precovid.to_csv('../data/curated/ML_features/log_alighting_precovid.csv', index=False)
# log_alighting_postcovid.to_csv('../data/curated/ML_features/log_alighting_postcovid.csv', index=False)
# log_boarding_precovid.to_csv('../data/curated/ML_features/log_boarding_precovid.csv', index=False)
# log_boarding_postcovid.to_csv('../data/curated/ML_features/log_boarding_postcovid.csv', index=False)

# log_mean_demand_postcovid_weekday.to_csv('../data/curated/ML_features/log_mean_demand_postcovid_weekday.csv', index=False)
# log_mean_alighting_postcovid_weekday.to_csv('../data/curated/ML_features/log_mean_alighting_postcovid_weekday.csv', index=False)
# log_mean_boarding_postcovid_weekday.to_csv('../data/curated/ML_features/log_mean_boarding_postcovid_weekday.csv', index=False)
# log_mean_alighting_postcovid_weekend.to_csv('../data/curated/ML_features/log_mean_alighting_postcovid_weekend.csv', index=False)
# log_mean_boarding_postcovid_weekend.to_csv('../data/curated/ML_features/log_mean_boarding_postcovid_weekend.csv', index=False)
# log_mean_demand_postcovid_weekend.to_csv('../data/curated/ML_features/log_mean_demand_postcovid_weekend.csv', index=False)

# Weight Matrix

In [13]:
station_weights_withSA2 = pd.read_csv('../data/curated/ML_features/station_weights_withSA2.csv')
station_weights = pd.read_csv('../data/curated/ML_features/station_weights.csv')

station_with_sa2_list_dict = {k:i for i, k in enumerate(station_weights_withSA2['Unnamed: 0'])}
station_list_dict = {k:i for i, k in enumerate(station_weights['Unnamed: 0'])}
reverse_station_with_sa2_list_dict = {i:k for i, k in enumerate(station_weights_withSA2['Unnamed: 0'])}
reverse_station_list_dict = {i:k for i, k in enumerate(station_weights['Unnamed: 0'])}

station_weights_withSA2.set_index('Unnamed: 0', inplace=True)
station_weights.set_index('Unnamed: 0', inplace=True)

In [14]:
# save this numpy
np.save('../data/curated/ML_data/station_weights_matrix.npy', station_weights.values)
np.save('../data/curated/ML_data/station_weights_withSA2_matrix.npy', station_weights_withSA2.values)

# Join up data

In [15]:
daily_trains_demand_post_covid.drop(['Unnamed: 0', 'Passenger_Boardings', 'Passenger_Alightings', 'Total_Demand', 'log_Passenger_Boardings', 'log_Passenger_Alightings'], axis = 1, inplace=True)

In [16]:
rainfall_df = pd.read_csv('../data/curated/ML_features/rainfall_Station_SA2.csv')
rainfall_df_stations = rainfall_df[~rainfall_df['Station_Na'].isna()][['mean_rainfall_value', 'Station_Na']]
rainfall_df_sa2 = rainfall_df[rainfall_df['Station_Na'].isna()][['mean_rainfall_value', 'SA2_NAME21']]
rainfall_df_sa2.rename({'SA2_NAME21': 'Station'}, axis=1, inplace=True)

census_and_buildings_postcovid = pd.read_csv('../data/curated/ML_features/census_and_buildings_postcovid.csv')

In [17]:
station_ML_data = daily_trains_demand_post_covid.merge(rainfall_df_stations, left_on='Station_Name', right_on='Station_Na', how='left')
station_ML_data.drop(['Station_Na'], axis=1, inplace=True)
station_ML_data = station_ML_data.merge(census_and_buildings_postcovid[census_and_buildings_postcovid['point_type'] == 'station'], left_on = 'Station_Name', right_on='Point Name', how = 'left')

station_ML_data.drop(['Point Name', 'point_type'], inplace=True, axis = 1)

In [18]:
# create inference mean data
station_ML_data_weekday = rainfall_df_stations.copy()
station_ML_data_weekday = station_ML_data_weekday.merge(log_mean_demand_postcovid_weekday.T.reset_index(), left_on='Station_Na', right_on='Station_Name', how='left')
station_ML_data_weekday = station_ML_data_weekday.merge(census_and_buildings_postcovid[census_and_buildings_postcovid['point_type'] == 'station'], left_on = 'Station_Na', right_on='Point Name', how = 'left')

station_ML_data_weekday.drop(['Point Name', 'point_type'], axis=1, inplace=True)
station_ML_data_weekday.rename({'Station': 'Station Name'}, axis=1, inplace=True)

station_ML_data_weekend = rainfall_df_stations.copy()
station_ML_data_weekend = station_ML_data_weekend.merge(log_mean_demand_postcovid_weekend.T.reset_index(), left_on='Station_Na', right_on='Station_Name', how='left')
station_ML_data_weekend = station_ML_data_weekend.merge(census_and_buildings_postcovid[census_and_buildings_postcovid['point_type'] == 'station'], left_on = 'Station_Na', right_on='Point Name', how = 'left')

station_ML_data_weekend.drop(['Point Name', 'point_type'], axis=1, inplace=True)
station_ML_data_weekend.rename({'Station': 'Station Name'}, axis=1, inplace=True)

station_ML_data_weekday['Weekday'] = 1
station_ML_data_weekend['Weekday'] = 0

station_inference_ML_data = pd.concat([station_ML_data_weekday, station_ML_data_weekend], axis=0)

In [19]:
# create inference SA2 data
SA2_ML_data_weekday = rainfall_df_sa2.copy()
SA2_ML_data_weekday = SA2_ML_data_weekday.merge(census_and_buildings_postcovid[census_and_buildings_postcovid['point_type'] == 'suburb'], left_on = 'Station', right_on='Point Name', how = 'left')

SA2_ML_data_weekday.drop(['Point Name', 'point_type'], axis=1, inplace=True)
SA2_ML_data_weekday.rename({'Station': 'Station Name'}, axis=1, inplace=True)

SA2_ML_data_weekend = rainfall_df_sa2.copy()
SA2_ML_data_weekend = SA2_ML_data_weekend.merge(census_and_buildings_postcovid[census_and_buildings_postcovid['point_type'] == 'suburb'], left_on = 'Station', right_on='Point Name', how = 'left')

SA2_ML_data_weekend.drop(['Point Name', 'point_type'], axis=1, inplace=True)
SA2_ML_data_weekend.rename({'Station': 'Station Name'}, axis=1, inplace=True)

SA2_ML_data_weekday['Weekday'] = 1
SA2_ML_data_weekend['Weekday'] = 0

SA2_ML_data = pd.concat([SA2_ML_data_weekday, SA2_ML_data_weekend], axis=0)

In [20]:
os.makedirs('../data/curated/ML_data', exist_ok=True)

In [21]:
station_ML_data.to_parquet('../data/curated/ML_data/station_gnn_data.parquet', index=False)
SA2_ML_data.to_parquet('../data/curated/ML_data/SA2_gnn_data.parquet', index=False)
station_inference_ML_data.to_parquet('../data/curated/ML_data/station_inference_gnn_data.parquet', index=False)

# Train Test Split, Feature Selection and Normalisation

In [22]:
import pandas as pd
station_ML_data = pd.read_parquet('../data/curated/ML_data/station_gnn_data.parquet')
SA2_ML_data = pd.read_parquet('../data/curated/ML_data/SA2_gnn_data.parquet')
station_inference_ML_data = pd.read_parquet('../data/curated/ML_data/station_inference_gnn_data.parquet')

In [23]:
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [24]:
drop_columns = [
 'average_hh_size_c2021',
 ' med_tot_hh_inc_wee_c2021',
 ' avg_num_p_per_brm_c2021',
 ' med_age_persns_c2021',]
for column in station_ML_data.columns:
    if 'boarding' in column or 'alighting' in column:
        drop_columns.append(column)

In [25]:
station_ML_data = station_ML_data.drop(drop_columns, axis = 1)
SA2_ML_data = SA2_ML_data.drop(drop_columns, axis = 1)
station_inference_ML_data = station_inference_ML_data.drop(drop_columns, axis = 1)

In [26]:
business_dates = station_ML_data['Business_Date'].unique()

train_dates, val_test_dates = train_test_split(business_dates, test_size=0.3, shuffle = False)
val_dates, test_dates = train_test_split(val_test_dates, test_size=0.5, shuffle = False)

ML_train_data = station_ML_data[station_ML_data['Business_Date'].isin(train_dates)]
ML_val_data = station_ML_data[station_ML_data['Business_Date'].isin(val_dates)]
ML_test_data = station_ML_data[station_ML_data['Business_Date'].isin(test_dates)]

In [27]:
numerical_columns = [col for col in station_ML_data if col not in  ['Station_Name',
                                                                    'Business_Date',
                                                                    'Weekday',
                                                                    'PublicHoliday',
                                                                    'has_school', 
                                                                    'has_sport_facility',
                                                                    'has_shopping_centre',
                                                                    'has_hospital', 
                                                                    'Date',
                                                                    'log_Total_Demand']]

In [28]:
# Initialize the StandardScaler
scaler = StandardScaler()
y_scaler = StandardScaler()

# Fit and transform the numerical columns
ML_train_data[numerical_columns] = scaler.fit_transform(ML_train_data[numerical_columns])
ML_val_data[numerical_columns] = scaler.transform(ML_val_data[numerical_columns])
ML_test_data[numerical_columns] = scaler.transform(ML_test_data[numerical_columns])

ML_train_data[['log_Total_Demand']] = y_scaler.fit_transform(ML_train_data[['log_Total_Demand']])
ML_val_data[['log_Total_Demand']] = y_scaler.transform(ML_val_data[['log_Total_Demand']])
ML_test_data[['log_Total_Demand']] = y_scaler.transform(ML_test_data[['log_Total_Demand']])

SA2_ML_data[numerical_columns] = scaler.transform(SA2_ML_data[numerical_columns])
station_inference_ML_data[numerical_columns] = scaler.transform(station_inference_ML_data[numerical_columns])

with open('../data/curated/ML_data/scaler_gnn.pickle', 'wb') as f:
    pickle.dump(scaler, f) 

with open('../data/curated/ML_data/y_scaler_gnn.pickle', 'wb') as f:
    pickle.dump(y_scaler, f) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ML_train_data[numerical_columns] = scaler.fit_transform(ML_train_data[numerical_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ML_val_data[numerical_columns] = scaler.transform(ML_val_data[numerical_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ML_test_data[numerical_columns] =

In [29]:
ML_train_data.to_parquet('../data/curated/ML_data/gnn_train_data.parquet', index=False)
ML_val_data.to_parquet('../data/curated/ML_data/gnn_val_data.parquet', index=False)
ML_test_data.to_parquet('../data/curated/ML_data/gnn_test_data.parquet', index=False)

station_inference_ML_data.to_parquet('../data/curated/ML_data/station_inference_gnn_data.parquet', index=False)
SA2_ML_data.to_parquet('../data/curated/ML_data/SA2_gnn_data.parquet')