In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the node-feature array stored under the default "arr_0" key.
# np.load on an .npz file returns an archive handle; the context manager closes it
# as soon as the array has been read into memory.
with np.load("./continents_new_features.npz") as features_archive:
    features_matrix = features_archive["arr_0"]
features_matrix.shape

In [None]:
# Load the flight (adjacency) array stored under the default "arr_0" key.
# np.load on an .npz file returns an archive handle; the context manager closes it
# as soon as the array has been read into memory.
with np.load("./continents_flight_zero_diag.npz") as flights_archive:
    flights_matrix = flights_archive["arr_0"]
flights_matrix.shape

In [None]:
# Spot-check: top-left 5x5 corner of the flight matrix at index 600 along axis 0.
flights_matrix[600][:5, :5]

In [None]:
# Display names used to label the plots further down; the position in this list is
# used as the node index when plotting the flight matrix.
continents = [
    "Africa",
    "North America",
    "South America",
    "Oceania",
    "Eastern Europe",
    "Western Europe",
    "Middle East",
    "South Asia",
    "Southeast-East Asia",
    "Central Asia",
]

# Sync two matrices
March 1st, 2020 to September 30th, 2021

In [None]:
# Spot-check feature 1 for every node at raw index 616, the end bound used for the
# feature slice in the sync cell below.
features_matrix[616][:, 1]

In [None]:
# Spot-check the first 10 rows x 5 columns of the flight matrix at raw index 424,
# the start bound used for the flight slice in the sync cell below.
flights_matrix[424][:10, :5]

In [None]:
# Row ranges that line the two raw matrices up on the same dates.
# Python slices are end-exclusive: the last rows kept are 615 and 997.
feature_rows = slice(38, 616)   # Indexes in features matrix: 38 to 616
flight_rows = slice(424, 998)   # Indexes in flights matrix: 424 to 998

# Basic slicing returns views, not copies, of the loaded arrays.
synced_feature_matrix = features_matrix[feature_rows]
synced_flights_matrix = flights_matrix[flight_rows]

for synced in (synced_feature_matrix, synced_flights_matrix):
    print(synced.shape)

In [None]:
# Days missing from the flight matrix  -  index of that day in the synced features matrix (these rows must be deleted)
# 3/14/2021 - 378
# 5/31/2021 - 456
# 9/4/2021 - 552
# 9/29/2021 - 577

In [None]:
# Spot-check feature 1 for every node at synced index 577, one of the rows removed
# in the next cell.
synced_feature_matrix[577][:, 1]

In [None]:
# Rows of the synced feature matrix whose dates have no counterpart in the flight
# matrix. np.delete returns a new array, so this cell must be run exactly once per
# fresh slice (re-running it would remove four more rows).
missing_flight_day_rows = [378, 456, 552, 577]
synced_feature_matrix = np.delete(synced_feature_matrix, missing_flight_day_rows, axis=0)

In [None]:
# After the deletion both matrices should have the same length along axis 0.
for synced in (synced_feature_matrix, synced_flights_matrix):
    print(synced.shape)

# Check Node Features for nans, negatives, and prevalence of 0s

In [None]:
# NaN never compares equal to anything (including itself), so `== np.nan` is always
# False and would report "no NaNs" even when some exist. np.isnan is the correct test.
np.where(np.isnan(synced_feature_matrix))

In [None]:
# Index arrays (one per axis) of every negative entry in the node features.
np.nonzero(synced_feature_matrix < 0)

In [None]:
# Index arrays (one per axis) of every zero entry in the node features.
# A few regions reported 0 overall cases in March 2020
np.nonzero(synced_feature_matrix == 0)

# Check Flight Matrix For Zeros

In [None]:
# Locate every zero entry in the synced flight matrix, then show how many there are.
flight_zeros_loc = np.nonzero(synced_flights_matrix == 0)
len(flight_zeros_loc[0])

In [None]:
# Display the index arrays (one per axis) of the zero entries found in the previous cell.
flight_zeros_loc

If you want to test a fully connected adjacency matrix, uncomment and run this cell

In [None]:
# synced_flights_matrix += 10

In [None]:
# flight_zeros_loc = np.where(synced_flights_matrix == 0)
# flight_zeros_loc

# Remove self-connections in flight dataset

In [None]:
# Confirm the dimensions of the flight matrix before zeroing its diagonals.
np.shape(synced_flights_matrix)

In [None]:
# Zero the diagonal of every per-day adjacency matrix so that no node keeps an edge
# to itself. The node count is read from the array instead of being hard-coded to 10,
# and the assignment is done in one vectorised step (in place, as before).
node_idx = np.arange(min(synced_flights_matrix.shape[1:3]))
synced_flights_matrix[:, node_idx, node_idx] = 0

In [None]:
# Recount the zero entries now that the diagonals have been cleared.
flight_zeros_loc = np.nonzero(synced_flights_matrix == 0)
print(flight_zeros_loc[0].shape[0])

In [None]:
# Spot-check: top-left 5x5 corner at synced index 100; the diagonal should now be 0.
synced_flights_matrix[100][:5, :5]

Reset variable names

In [None]:
# Shorter working names for the aligned matrices. These are aliases, not copies.
# NOTE: `flights_matrix` previously held the raw loaded array; after this cell it
# refers to the synced slice, so the sync cell must not be re-run without reloading.
feature_matrix, flights_matrix = synced_feature_matrix, synced_flights_matrix

for aligned in (feature_matrix, flights_matrix):
    print(aligned.shape)

# Split into Train/Validation/Test Splits

In [None]:
# The first 80% of the rows go to train+validation, the rest to test;
# the first 80% of that leading portion is train, the rest validation.
n_days = len(feature_matrix)
val_test_split_idx = int(n_days * 0.8)
train_val_split_idx = int(val_test_split_idx * 0.8)

print(train_val_split_idx, val_test_split_idx, sep="\n")

In [None]:
# Chronological split along axis 0: rows before train_val_split_idx are training data,
# rows from there up to val_test_split_idx are validation data, the remainder is test data.
train_rows = slice(None, train_val_split_idx)
val_rows = slice(train_val_split_idx, val_test_split_idx)
test_rows = slice(val_test_split_idx, None)

# Each split is copied rather than kept as a view: later cells overwrite the splits
# in place (smoothing, log10 scaling), and writing through a view would silently
# modify feature_matrix / flights_matrix and make re-running those cells compound.
train_feat_matrix = feature_matrix[train_rows, :, :].copy()
train_flight_matrix = flights_matrix[train_rows, :, :].copy()
val_feat_matrix = feature_matrix[val_rows, :, :].copy()
val_flight_matrix = flights_matrix[val_rows, :, :].copy()
test_feat_matrix = feature_matrix[test_rows, :, :].copy()
test_flight_matrix = flights_matrix[test_rows, :, :].copy()

for split_matrix in (
    train_feat_matrix,
    train_flight_matrix,
    val_feat_matrix,
    val_flight_matrix,
    test_feat_matrix,
    test_flight_matrix,
):
    print(split_matrix.shape)

# Smoothen Covid Cases Feature

In [None]:
# Number of leading rows dropped from every split after smoothing.
# NOTE: the smoothing cells below use a rolling window of `smoothening_window + 1`
# rows, so each smoothed value is the mean of 8 consecutive rows when this is 7.
smoothening_window: int = 7

In [None]:
# Smoothen Training Dataset
# Rolling mean of feature 1 over `smoothening_window + 1` consecutive rows. The first
# `smoothening_window` rows have no complete window, come back as NaN and are dropped.
train_feature_matrix_df = pd.DataFrame(train_feat_matrix[:, :, 1])
train_rolling_win_df = (
    train_feature_matrix_df.rolling(window=smoothening_window + 1).mean().dropna()
)
train_rolling_win_df.plot()
train_rolling_win_df_np = train_rolling_win_df.values

# Drop the first `smoothening_window` rows (those without a smoothed value) from the
# feature and flight datasets so they stay aligned with the rolling means.
# The feature slice is copied so the overlay below does not write through to the
# matrix this split was sliced from.
train_feat_matrix = train_feat_matrix[smoothening_window:, :, :].copy()
train_flight_matrix = train_flight_matrix[smoothening_window:, :, :]

# Overlay moving averages onto matrix that will be saved
train_feat_matrix[:, :, 1] = train_rolling_win_df_np
print(train_feat_matrix.shape)

In [None]:
# Smoothen Validation Dataset
# Rolling mean of feature 1 over `smoothening_window + 1` consecutive rows. The first
# `smoothening_window` rows have no complete window, come back as NaN and are dropped.
val_feature_matrix_df = pd.DataFrame(val_feat_matrix[:, :, 1])
val_rolling_win_df = (
    val_feature_matrix_df.rolling(window=smoothening_window + 1).mean().dropna()
)
val_rolling_win_df.plot()
val_rolling_win_df_np = val_rolling_win_df.values

# Drop the first `smoothening_window` rows (those without a smoothed value) from the
# feature and flight datasets so they stay aligned with the rolling means.
# The feature slice is copied so the overlay below does not write through to the
# matrix this split was sliced from.
val_feat_matrix = val_feat_matrix[smoothening_window:, :, :].copy()
val_flight_matrix = val_flight_matrix[smoothening_window:, :, :]

# Overlay moving averages onto matrix that will be saved
val_feat_matrix[:, :, 1] = val_rolling_win_df_np
print(val_feat_matrix.shape)

In [None]:
# Smoothen Test Dataset
# Rolling mean of feature 1 over `smoothening_window + 1` consecutive rows. The first
# `smoothening_window` rows have no complete window, come back as NaN and are dropped.
test_feature_matrix_df = pd.DataFrame(test_feat_matrix[:, :, 1])
test_rolling_win_df = (
    test_feature_matrix_df.rolling(window=smoothening_window + 1).mean().dropna()
)
test_rolling_win_df.plot()
test_rolling_win_df_np = test_rolling_win_df.values

# Drop the first `smoothening_window` rows (those without a smoothed value) from the
# feature and flight datasets so they stay aligned with the rolling means.
# The unsmoothed feature slice is copied so the in-place log transform applied to it
# later does not write through to the matrix this split was sliced from.
test_feat_matrix = test_feat_matrix[smoothening_window:, :, :].copy()
test_flight_matrix = test_flight_matrix[smoothening_window:, :, :]

# Overlay moving averages onto a separate copy; test_feat_matrix itself stays
# unsmoothed so that both variants can be saved.
test_feat_matrix_smooth = np.copy(test_feat_matrix)
test_feat_matrix_smooth[:, :, 1] = test_rolling_win_df_np
print(test_feat_matrix_smooth.shape)

# Scale Adjacency Matrix

In [None]:
# Snapshot the flight matrices before they are log10-scaled in place further down.
train_unscaled_flight_matrix, val_unscaled_flight_matrix, test_unscaled_flight_matrix = (
    np.copy(flight_split)
    for flight_split in (train_flight_matrix, val_flight_matrix, test_flight_matrix)
)

for unscaled in (
    train_unscaled_flight_matrix,
    val_unscaled_flight_matrix,
    test_unscaled_flight_matrix,
):
    print(unscaled.shape)

In [None]:
# Value range (max, then min) of each unscaled flight split: train, val, test.
for unscaled in (
    train_unscaled_flight_matrix,
    val_unscaled_flight_matrix,
    test_unscaled_flight_matrix,
):
    print(unscaled.max())
    print(unscaled.min())

In [None]:
# Number of zero entries per flight split before scaling (train, val, test).
for flight_split in (train_flight_matrix, val_flight_matrix, test_flight_matrix):
    zero_positions = np.nonzero(flight_split == 0)
    print(len(zero_positions[0]))

In [None]:
# Log10-scale the positive flight weights of every split, in place.
#
# Important: entries equal to 1 are first replaced with 1.1 so that the log10
# transformation (log10(1) == 0) does not introduce new zeros. That would affect edge
# creation in graph networks. Entries that are 0 (or negative) are left untouched.
#
# NOTE(review): this assumes a floating-point dtype; with an integer array the 1.1 and
# the log10 results would be truncated on assignment — confirm the dtype of the input.
for flight_split in (train_flight_matrix, val_flight_matrix, test_flight_matrix):
    flight_split[flight_split == 1] = 1.1
    # Vectorised equivalent of visiting every element and transforming the ones > 0.
    positive = flight_split > 0
    flight_split[positive] = np.log10(flight_split[positive])

In [None]:
# Number of zero entries per flight split after scaling (train, val, test);
# comparing with the counts printed before scaling shows whether new zeros appeared.
for flight_split in (train_flight_matrix, val_flight_matrix, test_flight_matrix):
    zero_positions = np.nonzero(flight_split == 0)
    print(len(zero_positions[0]))

In [None]:
# Value range (max, then min) of each log10-scaled flight split: train, val, test.
for flight_split in (train_flight_matrix, val_flight_matrix, test_flight_matrix):
    print(flight_split.max())
    print(flight_split.min())

Check that outgoing flights are log10 scaled

In [None]:
# One panel per origin continent (5 x 2 grid), one line per destination continent.
sns.reset_orig()
fig, axes = plt.subplots(5, 2, figsize=(30, 15))
n_plot_days = len(test_flight_matrix)

for origin, ax in enumerate(axes.flat):
    for dest in range(10):
        # To inspect another split, swap test_flight_matrix for one of:
        # train_flight_matrix val_flight_matrix test_flight_matrix
        # train_unscaled_flight_matrix val_unscaled_flight_matrix test_unscaled_flight_matrix
        if dest == origin:
            # A continent has no flights to itself: draw a flat zero line so the
            # legend and colour order stay the same in every panel.
            series = np.zeros((n_plot_days))
        else:
            series = test_flight_matrix[:, origin, dest]
        ax.plot(series, label=continents[dest])
    ax.set_title(continents[origin] + " to Other Continents")
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # ax.set_ylim(0.0, 3.0)
    ax.set_xlabel('Day Index')
    ax.set_ylabel('Number of Flights (Log10 Scale)')

fig.suptitle("Continents Dataset v19 Test Set Flight Trends After Log10 Scaling")
fig.tight_layout()
# fig.savefig("./continents_v19_test_flight_trends_log10_7day_smoothened.png", facecolor="white", bbox_inches="tight")
plt.show()

# Log Transform Covid Cases and Containment Index Features

In [None]:
# Range (max, then min) of feature 1 in every split before the log transform:
# train, val, unsmoothed test, smoothed test.
# The original cell printed the training split twice and never the unsmoothed test
# split; the order now matches the post-transform check further down.
for feat_split in (
    train_feat_matrix,
    val_feat_matrix,
    test_feat_matrix,
    test_feat_matrix_smooth,
):
    cases = feat_split[:, :, 1]
    print(cases.max())
    print(cases.min())

In [None]:
# Log10-transform features 0 and 1 of every split, in place.
#   * feature 1: only strictly positive entries are transformed; zeros stay zero.
#   * feature 0: every entry is transformed.
# NOTE(review): feature 0 has no > 0 guard, so a zero becomes -inf and a negative
# value becomes NaN — confirm feature 0 is strictly positive in the input data.
for feat_split in (
    train_feat_matrix,
    val_feat_matrix,
    test_feat_matrix,
    test_feat_matrix_smooth,
):
    # Basic slicing returns a view, so the masked assignment writes into feat_split.
    cases = feat_split[:, :, 1]
    positive = cases > 0
    cases[positive] = np.log10(cases[positive])

    feat_split[:, :, 0] = np.log10(feat_split[:, :, 0])

In [None]:
# Range (max, then min) of feature 1 in every split after the log transform:
# train, val, unsmoothed test, smoothed test.
for feat_split in (
    train_feat_matrix,
    val_feat_matrix,
    test_feat_matrix,
    test_feat_matrix_smooth,
):
    cases = feat_split[:, :, 1]
    print(cases.max())
    print(cases.min())

# Save Dataset

In [None]:
# Final sanity check before saving: maximum of each scaled flight split followed by
# the maximum of its unscaled snapshot (train, val, test).
for scaled, unscaled in (
    (train_flight_matrix, train_unscaled_flight_matrix),
    (val_flight_matrix, val_unscaled_flight_matrix),
    (test_flight_matrix, test_unscaled_flight_matrix),
):
    print(scaled.max())
    print(unscaled.max())

In [None]:
# Spot-check: top-left 5x5 corner of the scaled test flight matrix at index 72.
test_flight_matrix[72][:5, :5]

In [None]:
# Arrays written to the archive, keyed by the name they are stored under.
dataset_arrays = {
    "train_features_log10": train_feat_matrix,
    "train_log10_scaled_flight_matrix": train_flight_matrix,
    "train_unscaled_flight_matrix": train_unscaled_flight_matrix,
    "val_features_log10": val_feat_matrix,
    "val_log10_scaled_flight_matrix": val_flight_matrix,
    "val_unscaled_flight_matrix": val_unscaled_flight_matrix,
    "test_features_log10_unsmooth": test_feat_matrix,
    "test_features_log10_smooth": test_feat_matrix_smooth,
    "test_log10_scaled_flight_matrix": test_flight_matrix,
    "test_unscaled_flight_matrix": test_unscaled_flight_matrix,
}

# np.savez appends the ".npz" extension to the given path.
np.savez('./10_continents_dataset_v19_training', **dataset_arrays)