In [34]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM

In [35]:
import os
import pandas as pd
import numpy as np

# Directory containing the CSV files
directory = '../draft-final-data'

# Dictionary to store the dataframes
dataframes = {}

# Iterate over all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Extract the file name without extension and convert it to int
        key = int(os.path.splitext(filename)[0])
        
        # Read the CSV file into a dataframe
        df = pd.read_csv(os.path.join(directory, filename))
        # Drop columns that contain non-numerical data
        df = df.select_dtypes(include=[np.number])
        
        # Check if the dataframe has 13 columns
        if df.shape[1] == 13:
            # Store the dataframe in the dictionary
            dataframes[key] = df

# Print the dictionary keys to verify
print(dataframes.keys())

dict_keys([72790024141, 72785524114, 72789094197, 72793024233, 72785794129, 72220804224, 72788594266, 72797624217, 72797094240, 72027294282, 72798594276, 72792424223, 74207124201, 72792894263, 72781024243, 72698824219, 74206024207, 72782724110, 72793724222, 72782594239, 72794504205, 72792024227, 72025400119])


In [36]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNNWithEdgeFeatures(torch.nn.Module):
    def __init__(self, in_channels, edge_in_channels, hidden_channels, out_channels):
        super(GNNWithEdgeFeatures, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)  # Additional hidden layer
        self.edge_mlp = torch.nn.Linear(edge_in_channels, hidden_channels)
        self.output_layer = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_attr):
        # Apply the edge MLP to the edge features
        edge_features = self.edge_mlp(edge_attr)
        
        # Apply the first GCN convolution
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        
        # Apply the second GCN convolution
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        # Combine node and edge features (this is a simple example, you might want to use a more complex method)
        x = x + edge_features.mean(dim=0)
        
        # Apply the output layer
        x = self.output_layer(x)
        
        return x

In [37]:
for key, df in dataframes.items():
    print(f"DataFrame with key {key} has {df.shape[1]} columns.")

DataFrame with key 72790024141 has 13 columns.
DataFrame with key 72785524114 has 13 columns.
DataFrame with key 72789094197 has 13 columns.
DataFrame with key 72793024233 has 13 columns.
DataFrame with key 72785794129 has 13 columns.
DataFrame with key 72220804224 has 13 columns.
DataFrame with key 72788594266 has 13 columns.
DataFrame with key 72797624217 has 13 columns.
DataFrame with key 72797094240 has 13 columns.
DataFrame with key 72027294282 has 13 columns.
DataFrame with key 72798594276 has 13 columns.
DataFrame with key 72792424223 has 13 columns.
DataFrame with key 74207124201 has 13 columns.
DataFrame with key 72792894263 has 13 columns.
DataFrame with key 72781024243 has 13 columns.
DataFrame with key 72698824219 has 13 columns.
DataFrame with key 74206024207 has 13 columns.
DataFrame with key 72782724110 has 13 columns.
DataFrame with key 72793724222 has 13 columns.
DataFrame with key 72782594239 has 13 columns.
DataFrame with key 72794504205 has 13 columns.
DataFrame wit

In [38]:
# Check for null values in each dataframe
for key, df in dataframes.items():
    null_values = df.isnull().sum()
    print(f"DataFrame with key {key} null values:\n{null_values}\n")

DataFrame with key 72790024141 null values:
STATION                     0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailyStationPressure        0
DailyVisibility             0
DailyWetBulbTemperature     0
DailyWindGustSpeed          0
DailyWindSpeed              0
dtype: int64

DataFrame with key 72785524114 null values:
STATION                     0
DailyAltimeterSetting       0
DailyDewPointTemperature    0
DailyDryBulbTemperature     0
DailyPressureChange         0
DailyPressureTendency       0
DailyRelativeHumidity       0
DailySeaLevelPressure       0
DailyStationPressure        0
DailyWetBulbTemperature     0
DailyWindDirection          0
DailyWindGustSpeed          0
DailyWindSpeed              0
dtype: int64

DataFrame with key 72789094197 null values:
STATION                     0
DailyAltimeterSetting       0


In [39]:
# Drop any dataframe with null values
dataframes = {key: df for key, df in dataframes.items() if not df.isnull().values.any()}

# Print the keys of the remaining dataframes to verify
print(dataframes.keys())

dict_keys([72790024141, 72785524114, 72793024233, 72785794129, 72788594266, 72797624217, 72797094240, 72792424223, 74207124201, 72781024243, 74206024207, 72782724110, 72793724222, 72782594239, 72794504205, 72792024227])


In [40]:
num_nodes = len(dataframes)
print(f"Number of nodes: {num_nodes}")
# Calculate the number of edges if every node is connected to every other node
num_edges = num_nodes * (num_nodes - 1) // 2
print(f"Number of edges: {num_edges}")

Number of nodes: 16
Number of edges: 120


In [41]:
import itertools
import torch

# Generate all possible pairs of nodes
edge_index = torch.tensor(list(itertools.combinations(range(num_nodes), 2)), dtype=torch.long).t()

print(edge_index)

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,
          2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
          4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,
          5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,
          7,  7,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10, 10, 10,
         10, 10, 11, 11, 11, 11, 12, 12, 12, 13, 13, 14],
        [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  2,  3,  4,
          5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  3,  4,  5,  6,  7,  8,  9,
         10, 11, 12, 13, 14, 15,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
          5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  6,  7,  8,  9, 10, 11, 12,
         13, 14, 15,  7,  8,  9, 10, 11, 12, 13, 14, 15,  8,  9, 10, 11, 12, 13,
         14, 15,  9, 10, 11, 12, 13, 14, 15, 10, 11

In [42]:
# Dictionary to store the split dataframes
split_dataframes = {}

for key, df in dataframes.items():
    # Calculate the split index
    split_index = int(len(df) * 0.8)
    
    # Split the dataframe
    train_df = df.iloc[:split_index]
    test_df = df.iloc[split_index:]
    
    # Store the split dataframes in the dictionary
    split_dataframes[key] = {'train': train_df, 'test': test_df}

# Print the keys of the split dataframes to verify
print(split_dataframes.keys())

dict_keys([72790024141, 72785524114, 72793024233, 72785794129, 72788594266, 72797624217, 72797094240, 72792424223, 74207124201, 72781024243, 74206024207, 72782724110, 72793724222, 72782594239, 72794504205, 72792024227])


In [43]:
# Dictionary to store the train dataframes for each node
node_train_dataframes = {}

# Iterate over the split_dataframes and assign the train_df to each node
for key, split_df in split_dataframes.items():
    node_train_dataframes[key] = split_df['train']

# Print the keys of the node_train_dataframes to verify
print(node_train_dataframes.keys())

dict_keys([72790024141, 72785524114, 72793024233, 72785794129, 72788594266, 72797624217, 72797094240, 72792424223, 74207124201, 72781024243, 74206024207, 72782724110, 72793724222, 72782594239, 72794504205, 72792024227])


In [44]:
# Assign in_channels and out_channels as the number of columns in the dataframe
in_channels = 13
# Assign edge_in_channels as the number of connections to every other dataframe
edge_in_channels = 1
hidden_channels = 128  # Number of hidden features
out_channels = in_channels  # Number of output features per node

print(f"in_channels: {in_channels}, out_channels: {out_channels}, edge_in_channels: {edge_in_channels}")
# Initialize the GNN model with edge features
gnn = GNNWithEdgeFeatures(in_channels, edge_in_channels, hidden_channels, out_channels)

in_channels: 13, out_channels: 13, edge_in_channels: 1


In [45]:
from math import radians, sin, cos, sqrt, atan2

# Import the location-datamap.csv file as a dataframe
location_datamap_df = pd.read_csv('../location-datamap.csv')

# Print the first few rows of the dataframe to verify
print(location_datamap_df.head())

        STATION  LONGITUDE  LATITUDE  ELEVATION
0  7.279372e+10 -122.28308  47.92322      167.1
1  7.278462e+10 -118.28572  46.09456      356.7
2  7.420712e+10 -122.58333  47.08333       91.4
3  7.279762e+10 -122.54069  48.79910       45.9
4  7.279239e+10 -123.93074  46.97288        4.5


In [46]:
def haversine_distance(lat1, lon1, lat2, lon2, el1=0, el2=0):
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Difference in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    # Distance in kilometers
    distance = R * c

    # Elevation difference
    height = el2 - el1

    # Calculate the total distance considering elevation
    total_distance = sqrt(distance**2 + height**2)

    return total_distance

In [47]:
# Create a dictionary to store the distances between nodes
distances = {}

# Iterate over each pair of nodes
for i, j in itertools.combinations(range(num_nodes), 2):
    # Get the station IDs for the nodes
    station_i = list(dataframes.keys())[i]
    station_j = list(dataframes.keys())[j]
    
    # Get the location data for the stations
    location_i = location_datamap_df[location_datamap_df['STATION'] == station_i].iloc[0]
    location_j = location_datamap_df[location_datamap_df['STATION'] == station_j].iloc[0]
    
    # Calculate the distance between the stations
    distance = haversine_distance(location_i['LATITUDE'], location_i['LONGITUDE'], location_j['LATITUDE'], location_j['LONGITUDE'], location_i['ELEVATION'], location_j['ELEVATION'])
    
    # Store the distance in the dictionary
    distances[(i, j)] = distance

# Print the distances to verify
print(distances)

{(0, 1): 395.4679457314314, (0, 2): 342.5330553210191, (0, 3): 438.10430559873856, (0, 4): 432.0504921333856, (0, 5): 437.11321302131137, (0, 6): 503.7031148557305, (0, 7): 474.59654599988056, (0, 8): 372.6269101788848, (0, 9): 128.6980383337319, (0, 10): 362.1843030623904, (0, 11): 32.53657446575969, (0, 12): 306.4908073667999, (0, 13): 52.81240627430448, (0, 14): 404.8085057270694, (0, 15): 412.6227567328569, (1, 2): 727.6917541369755, (1, 3): 109.85904921372176, (1, 4): 798.995937782049, (1, 5): 802.4581375032445, (1, 6): 865.1737430163511, (1, 7): 860.6728795779842, (1, 8): 758.7243205326865, (1, 9): 495.92270395996894, (1, 10): 748.5486591666476, (1, 11): 416.12981588895354, (1, 12): 678.7815012654146, (1, 13): 419.79175595055284, (1, 14): 786.18443118649, (1, 15): 798.318691116012, (2, 3): 775.0588941479449, (2, 4): 120.26611512741678, (2, 5): 165.52848353942377, (2, 6): 185.28412975619443, (2, 7): 187.27787071696568, (2, 8): 49.712417563568934, (2, 9): 266.9618608270944, (2, 10)

In [48]:
import numpy as np


# Create edge index and edge attributes from distances dictionary
edge_index = []
edge_attr = []

for (i, j), distance in distances.items():
    edge_index.append([i, j])
    edge_index.append([j, i])  # Assuming undirected graph
    edge_attr.append([distance])
    edge_attr.append([distance])  # Assuming undirected graph

edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
edge_attr = torch.tensor(edge_attr, dtype=torch.float)


In [None]:
# Prepare the list of dataframes from node_train_dataframes
dataframe_list = list(node_train_dataframes.values())

# Find the minimum length among all dataframes to align time steps
min_length = min(df.shape[0] for df in dataframe_list)

# Truncate dataframes to the minimum length
truncated_dataframes = [df.iloc[:min_length] for df in dataframe_list]

# Convert dataframes to tensors, selecting only numerical columns and filling NaN values with 0
node_features_sequence = torch.stack(
    [torch.tensor(df.select_dtypes(include=[np.number]).fillna(0).values, dtype=torch.float) for df in truncated_dataframes],
    dim=1  # Stack along a new dimension for nodes
)  # Shape: [time_steps, num_nodes, num_features]

# Prepare the target by shifting node_features_sequence by one time step
node_features_future_sequence = node_features_sequence[1:]  # Shape: [time_steps - 1, num_nodes, num_features]
node_features_sequence = node_features_sequence[:-1]  # Align with target

# Define the optimizer and loss function
optimizer = torch.optim.Adam(gnn.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()

# Training loop
num_epochs = 200  # Adjust the number of epochs as needed
time_steps = node_features_sequence.shape[0]

for epoch in range(num_epochs):
    total_loss = 0.0

    for t in range(time_steps):
        gnn.train()
        optimizer.zero_grad()
        
        # Get the node features at current time step
        node_features = node_features_sequence[t]  # Shape: [num_nodes, num_features]
        target = node_features_future_sequence[t]  # Shape: [num_nodes, num_features]
        
        # Forward pass
        output = gnn(node_features, edge_index, edge_attr)
        
        # Compute loss
        loss = loss_fn(output, target)
        total_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    average_loss = total_loss / time_steps
    print(f"Epoch {epoch+1}, Average Loss: {average_loss}")

# After training, you can make predictions
gnn.eval()

Epoch 1, Average Loss: 4.058255900562151e+20
Epoch 2, Average Loss: 4.058255900562151e+20
Epoch 3, Average Loss: 4.058255900562151e+20
Epoch 4, Average Loss: 4.058255900562151e+20
Epoch 5, Average Loss: 4.058255900562151e+20
Epoch 6, Average Loss: 4.058255900562151e+20
Epoch 7, Average Loss: 4.058255900562151e+20
Epoch 8, Average Loss: 4.058255900562151e+20
Epoch 9, Average Loss: 4.058255900562151e+20
Epoch 10, Average Loss: 4.058255900562151e+20
Epoch 11, Average Loss: 4.058255900562151e+20
Epoch 12, Average Loss: 4.058255900562151e+20
Epoch 13, Average Loss: 4.058255900562151e+20
Epoch 14, Average Loss: 4.058255900562151e+20
Epoch 15, Average Loss: 4.058255900562151e+20
Epoch 16, Average Loss: 4.058255900562151e+20
Epoch 17, Average Loss: 4.058255900562151e+20
Epoch 18, Average Loss: 4.058255900562151e+20
Epoch 19, Average Loss: 4.058255900562151e+20
Epoch 20, Average Loss: 4.058255900562151e+20
Epoch 21, Average Loss: 4.058255900562151e+20
Epoch 22, Average Loss: 4.058255900562151e+

KeyboardInterrupt: 

In [None]:
# Print the feature of each node
for i, features in enumerate(output):
    print(f"Node {i} features:\n{features}\n")

Node 0 features:
tensor([ 7.2820e+10, -4.8418e+07, -3.1947e+07,  2.0847e+07,  4.2840e+07,
         8.8175e+05, -2.7285e+07, -5.3300e+07,  4.3184e+07,  6.4392e+07,
        -2.8386e+06, -3.6995e+07,  3.9368e+07], grad_fn=<UnbindBackward0>)

Node 1 features:
tensor([ 7.2820e+10, -4.8417e+07, -3.1947e+07,  2.0847e+07,  4.2839e+07,
         8.8226e+05, -2.7284e+07, -5.3300e+07,  4.3183e+07,  6.4392e+07,
        -2.8388e+06, -3.6994e+07,  3.9369e+07], grad_fn=<UnbindBackward0>)

Node 2 features:
tensor([ 7.2820e+10, -4.8417e+07, -3.1947e+07,  2.0847e+07,  4.2839e+07,
         8.8244e+05, -2.7285e+07, -5.3301e+07,  4.3183e+07,  6.4392e+07,
        -2.8385e+06, -3.6995e+07,  3.9368e+07], grad_fn=<UnbindBackward0>)

Node 3 features:
tensor([ 7.2820e+10, -4.8417e+07, -3.1947e+07,  2.0847e+07,  4.2839e+07,
         8.8123e+05, -2.7285e+07, -5.3300e+07,  4.3183e+07,  6.4392e+07,
        -2.8391e+06, -3.6995e+07,  3.9368e+07], grad_fn=<UnbindBackward0>)

Node 4 features:
tensor([ 7.2820e+10, -4.841