In [1]:
'''# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
#from stellargraph import StellarGraph
#from stellargraph.layer import GAT, GCN


# Load the STEAD dataset
dataset_url = "D:\APURAV\K. K. Wagh\Study\BE\Semester VII\Final Year Project Sem VII\dataset\STEAD\merge.csv"
stead_data = pd.read_csv(dataset_url)


# Data Preprocessing
selected_columns = ['network_code', 'receiver_code', 'receiver_latitude', 'receiver_longitude',
                    'receiver_elevation_m', 'source_latitude', 'source_longitude', 'source_depth_km',
                    'source_magnitude', 'trace_start_time', 'trace_category']
data = stead_data[selected_columns]


# Convert trace_start_time to datetime format
data['trace_start_time'] = pd.to_datetime(data['trace_start_time'])


# Extract features from the timestamp
data['year'] = data['trace_start_time'].dt.year
data['month'] = data['trace_start_time'].dt.month
data['day'] = data['trace_start_time'].dt.day
data['hour'] = data['trace_start_time'].dt.hour
data['minute'] = data['trace_start_time'].dt.minute
data['second'] = data['trace_start_time'].dt.second


# Drop unnecessary columns
data = data.drop(['trace_start_time'], axis=1)


# Create a binary target variable indicating earthquake or non-earthquake
data['target'] = np.where(data['trace_category'] == 'earthquake', 1, 0)


# Feature engineering
# Feature 1: Time of Day (morning, afternoon, evening, night)
data['time_of_day'] = pd.cut(data['hour'], bins=[0, 6, 12, 18, 24], labels=['night', 'morning', 'afternoon', 'evening'])

# Feature 2: Distance from the earthquake source
data['distance_from_source'] = np.sqrt((data['receiver_latitude'] - data['source_latitude'])**2 +
                                       (data['receiver_longitude'] - data['source_longitude'])**2)

# Feature 3: Magnitude-weighted distance
data['weighted_distance'] = data['distance_from_source'] * data['source_magnitude']

# Feature 4: Duration of the seismic signal
data['signal_duration'] = data['minute'] * 60 + data['second']


# Drop the original columns used for feature engineering
data = data.drop(['hour', 'minute', 'second'], axis=1)



# Convert categorical columns to numerical representations
categorical_columns = ['network_code', 'receiver_code', 'time_of_day']
for column in categorical_columns:
    data[column] = pd.Categorical(data[column])
    data[column] = data[column].cat.codes

    
# Create a graph from the data
graph = StellarGraph.from_pandas(data, node_features=["receiver_latitude", "receiver_longitude",
                                                       "receiver_elevation_m", "source_latitude",
                                                       "source_longitude", "source_depth_km",
                                                       "source_magnitude"],
                                 edge_features=["distance_from_source", "weighted_distance"],
                                 node_type_default="receiver_code", edge_type_default="trace_category")

# Train-test split
X = data.drop(['trace_category', 'target'], axis=1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Convert data to StellarGraph instances
G_train = graph.node_features(X_train_scaled)
G_test = graph.node_features(X_test_scaled)


# Build the GNN model
model = models.Sequential()
model.add(GCN(layer_sizes=[32], activations=["relu"], generator=graph, dropout=0.5))
model.add(layers.Dense(units=16, activation="relu"))
model.add(layers.Dense(units=1, activation="sigmoid"))


# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


# Define callbacks (e.g., early stopping to prevent overfitting)
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


# Train the model
model.fit(G_train, y_train, epochs=50, batch_size=64, validation_split=0.2, callbacks=[early_stopping])


# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(G_test, y_test)
print(f'Test Accuracy: {test_accuracy}')


# Make predictions for user input
def predict_earthquake_probability(user_input):
    # Process user input (similar to preprocessing steps above)
    user_input = pd.DataFrame(user_input, index=[0])
    
    # Feature engineering for user input
    
    # Scaling
    user_input_scaled = scaler.transform(user_input)
    
    # Convert to StellarGraph instances
    G_user_input = graph.node_features(user_input_scaled)
    
    # Make prediction
    probability = model.predict(G_user_input)
    
    return probability[0][0]


# Example usage
user_location_input = {
    'network_code': 'XYZ',
    'receiver_code': 'ABC',
    'receiver_latitude': 37.7749,
    'receiver_longitude': -122.4194,
    'receiver_elevation_m': 10.0,
    'source_latitude': 34.0522,
    'source_longitude': -118.2437,
    'source_depth_km': 10.0,
    'source_magnitude': 5.0,
    'year': 2024,
    'month': 2,
    'day': 5,
    'time_of_day': 'morning'
}

predicted_probability = predict_earthquake_probability(user_location_input)
print(f'Predicted Probability of Earthquake: {predicted_probability}')
'''

'# Import necessary libraries\nimport numpy as np\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom tensorflow import keras\nfrom tensorflow.keras import layers, models, callbacks\n#from stellargraph import StellarGraph\n#from stellargraph.layer import GAT, GCN\n\n\n# Load the STEAD dataset\ndataset_url = "D:\\APURAV\\K. K. Wagh\\Study\\BE\\Semester VII\\Final Year Project Sem VII\\dataset\\STEAD\\merge.csv"\nstead_data = pd.read_csv(dataset_url)\n\n\n# Data Preprocessing\nselected_columns = [\'network_code\', \'receiver_code\', \'receiver_latitude\', \'receiver_longitude\',\n                    \'receiver_elevation_m\', \'source_latitude\', \'source_longitude\', \'source_depth_km\',\n                    \'source_magnitude\', \'trace_start_time\', \'trace_category\']\ndata = stead_data[selected_columns]\n\n\n# Convert trace_start_time to datetime format\ndata[\'trace_start_time\'] = pd.to_datetime(data[\'t

2nd try

In [2]:
'''import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
data = pd.read_csv("D:\APURAV\K. K. Wagh\Study\BE\Semester VII\Final Year Project Sem VII\dataset\STEAD\merge.csv")

# Explore the dataset
print(data.head())
print(data.info())
print(data.describe())

# Handle missing values
data.dropna(inplace=True)

# Handle outliers (if necessary)
# Perform data consistency checks and corrections

# Feature engineering
# Example:
# Convert source_origin_time to datetime
data['source_origin_time'] = pd.to_datetime(data['source_origin_time'])

# Scaling, encoding, dimensionality reduction (if necessary)
# Example:
# Feature scaling using Min-Max normalization
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data[['source_latitude', 'source_longitude']])

# Convert scaled_data back to a DataFrame
scaled_data_df = pd.DataFrame(scaled_data, columns=['source_latitude', 'source_longitude'])

# Dimensionality reduction using PCA (Principal Component Analysis) or other methods if necessary

# Save preprocessed data
scaled_data_df.to_csv("preprocessed_data.csv", index=False)
'''

'import pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\n\n# Load the dataset\ndata = pd.read_csv("D:\\APURAV\\K. K. Wagh\\Study\\BE\\Semester VII\\Final Year Project Sem VII\\dataset\\STEAD\\merge.csv")\n\n# Explore the dataset\nprint(data.head())\nprint(data.info())\nprint(data.describe())\n\n# Handle missing values\ndata.dropna(inplace=True)\n\n# Handle outliers (if necessary)\n# Perform data consistency checks and corrections\n\n# Feature engineering\n# Example:\n# Convert source_origin_time to datetime\ndata[\'source_origin_time\'] = pd.to_datetime(data[\'source_origin_time\'])\n\n# Scaling, encoding, dimensionality reduction (if necessary)\n# Example:\n# Feature scaling using Min-Max normalization\nscaler = MinMaxScaler()\nscaled_data = scaler.fit_transform(data[[\'source_latitude\', \'source_longitude\']])\n\n# Convert scaled_data back to a DataFrame\nscaled_data_df = pd.DataFrame(scaled_data, columns=[\'source_latitude\', \'source_longitude\'])\n\n# Dimensionality 

In [3]:
'''# Implement GNN architecture using TensorFlow or PyTorch
# Example:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        # Define GNN layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Define forward pass
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Instantiate GNN model
input_dim = 2  # Example: latitude and longitude
hidden_dim = 64
output_dim = 1  # Probability of earthquake occurrence
model = GNN(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
'''

'# Implement GNN architecture using TensorFlow or PyTorch\n# Example:\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass GNN(nn.Module):\n    def __init__(self, input_dim, hidden_dim, output_dim):\n        super(GNN, self).__init__()\n        # Define GNN layers\n        self.fc1 = nn.Linear(input_dim, hidden_dim)\n        self.fc2 = nn.Linear(hidden_dim, output_dim)\n\n    def forward(self, x):\n        # Define forward pass\n        x = F.relu(self.fc1(x))\n        x = self.fc2(x)\n        return x\n\n# Instantiate GNN model\ninput_dim = 2  # Example: latitude and longitude\nhidden_dim = 64\noutput_dim = 1  # Probability of earthquake occurrence\nmodel = GNN(input_dim, hidden_dim, output_dim)\n\n# Define loss function and optimizer\ncriterion = nn.MSELoss()\noptimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n'

In [4]:
'''import matplotlib.pyplot as plt
import seaborn as sns

# Visualize seismic data distributions
sns.pairplot(data[['source_latitude', 'source_longitude', 'source_depth_km']])
plt.show()

# Display GNN architecture diagrams (if necessary)

# Use interactive plots for seismic waveforms and earthquake characteristics (if necessary)
# Example: Plot seismic waveforms over time using Plotly or Bokeh
'''

"import matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Visualize seismic data distributions\nsns.pairplot(data[['source_latitude', 'source_longitude', 'source_depth_km']])\nplt.show()\n\n# Display GNN architecture diagrams (if necessary)\n\n# Use interactive plots for seismic waveforms and earthquake characteristics (if necessary)\n# Example: Plot seismic waveforms over time using Plotly or Bokeh\n"

3rd try

In [5]:
'''import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv("D:\APURAV\K. K. Wagh\Study\BE\Semester VII\Final Year Project Sem VII\dataset\STEAD\merge.csv")

# Drop unnecessary columns
data.drop(['network_code', 'receiver_code', 'receiver_type', 'receiver_elevation_m',
           'p_status', 'p_weight', 'p_travel_sec', 's_status', 's_weight', 'source_id',
           'source_origin_uncertainty_sec', 'source_error_sec', 'source_gap_deg',
           'source_horizontal_uncertainty_km', 'source_depth_uncertainty_km',
           'source_magnitude_type', 'source_magnitude_author', 'source_mechanism_strike_dip_rake',
           'source_distance_deg', 'trace_start_time', 'trace_category', 'trace_name'], axis=1, inplace=True)

# Convert timestamp columns to datetime objects
data['source_origin_time'] = pd.to_datetime(data['source_origin_time'])

# Extract features from timestamp columns
data['hour'] = data['source_origin_time'].dt.hour
data['minute'] = data['source_origin_time'].dt.minute
data['second'] = data['source_origin_time'].dt.second

# Feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data[['hour', 'minute', 'second']])
data[['hour', 'minute', 'second']] = scaled_features

# Handle missing values
data.fillna(0, inplace=True)

# Encode categorical variables if needed

# Final processed dataset
processed_data = data.copy()
'''

'import pandas as pd\nimport numpy as np\n\n# Load the dataset\ndata = pd.read_csv("D:\\APURAV\\K. K. Wagh\\Study\\BE\\Semester VII\\Final Year Project Sem VII\\dataset\\STEAD\\merge.csv")\n\n# Drop unnecessary columns\ndata.drop([\'network_code\', \'receiver_code\', \'receiver_type\', \'receiver_elevation_m\',\n           \'p_status\', \'p_weight\', \'p_travel_sec\', \'s_status\', \'s_weight\', \'source_id\',\n           \'source_origin_uncertainty_sec\', \'source_error_sec\', \'source_gap_deg\',\n           \'source_horizontal_uncertainty_km\', \'source_depth_uncertainty_km\',\n           \'source_magnitude_type\', \'source_magnitude_author\', \'source_mechanism_strike_dip_rake\',\n           \'source_distance_deg\', \'trace_start_time\', \'trace_category\', \'trace_name\'], axis=1, inplace=True)\n\n# Convert timestamp columns to datetime objects\ndata[\'source_origin_time\'] = pd.to_datetime(data[\'source_origin_time\'])\n\n# Extract features from timestamp columns\ndata[\'hour\'] =

In [6]:
'''import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GraphConvolutionalNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GraphConvolutionalNetwork, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        return x

# Initialize the GNN model
input_dim = len(processed_data.columns)  # Adjust based on your processed data
hidden_dim = 64
output_dim = 1  # Adjust based on your prediction task
model = GraphConvolutionalNetwork(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
'''

'import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch_geometric.nn import GCNConv\n\nclass GraphConvolutionalNetwork(nn.Module):\n    def __init__(self, input_dim, hidden_dim, output_dim):\n        super(GraphConvolutionalNetwork, self).__init__()\n        self.conv1 = GCNConv(input_dim, hidden_dim)\n        self.conv2 = GCNConv(hidden_dim, output_dim)\n\n    def forward(self, data):\n        x, edge_index = data.x, data.edge_index\n        x = F.relu(self.conv1(x, edge_index))\n        x = F.relu(self.conv2(x, edge_index))\n        return x\n\n# Initialize the GNN model\ninput_dim = len(processed_data.columns)  # Adjust based on your processed data\nhidden_dim = 64\noutput_dim = 1  # Adjust based on your prediction task\nmodel = GraphConvolutionalNetwork(input_dim, hidden_dim, output_dim)\n\n# Define loss function and optimizer\ncriterion = nn.MSELoss()\noptimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n'

4th try

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Step 1: Data Understanding and Preprocessing

# Load the dataset.
# The Windows path is written as a raw string: in an ordinary literal the
# sequences "\A", "\K", "\S", "\B", "\F", "\d" and "\m" are invalid escapes and
# make Python emit a warning whenever the cell is compiled. The resulting
# path value is unchanged.
data = pd.read_csv(r"D:\APURAV\K. K. Wagh\Study\BE\Semester VII\Final Year Project Sem VII\dataset\STEAD\merge.csv")

# Review the dataset
print(data.head())

# Handle missing values
# NOTE(review): dropna() without `subset=` discards every row that has a NaN in
# ANY column. The preview printed by this cell shows rows whose p_* fields are
# all NaN, so those rows are removed here -- confirm that this is intended.
data.dropna(inplace=True)

# Handle outliers
# You may use techniques like Z-score, IQR, or domain-specific knowledge

# Feature engineering
# Keep the trace identification columns (start time, category, name) in their
# own frame. These are metadata columns, not waveform samples.
seismic_waveforms = data[['trace_start_time', 'trace_category', 'trace_name']]

# Data encoding and scaling
# Standardise the two numeric source features in place (zero mean, unit variance).
scaler = StandardScaler()
data[['source_magnitude', 'source_depth_km']] = scaler.fit_transform(data[['source_magnitude', 'source_depth_km']])

# Dimensionality reduction
# You may use techniques like PCA if needed


  exec(code_obj, self.user_global_ns, self.user_ns)


  network_code receiver_code receiver_type  receiver_latitude  \
0           TA          109C            HH            32.8889   
1           TA          109C            HH            32.8889   
2           TA          109C            HH            32.8889   
3           TA          109C            HH            32.8889   
4           TA          109C            HH            32.8889   

   receiver_longitude  receiver_elevation_m  p_arrival_sample p_status  \
0           -117.1051                 150.0               NaN      NaN   
1           -117.1051                 150.0               NaN      NaN   
2           -117.1051                 150.0               NaN      NaN   
3           -117.1051                 150.0               NaN      NaN   
4           -117.1051                 150.0               NaN      NaN   

   p_weight  p_travel_sec  ...  source_magnitude_author  \
0       NaN           NaN  ...                      NaN   
1       NaN           NaN  ...                

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNNModel(nn.Module):
    """Two stacked GCNConv layers that emit log-probabilities.

    Args:
        input_dim: Input size passed to the first GCNConv layer.
        hidden_dim: Output size of the first layer / input size of the second.
        output_dim: Output size of the second GCNConv layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Run both convolutions and return ``log_softmax`` over dim 1.

        ``data`` must expose ``x`` and ``edge_index`` attributes
        (presumably a torch_geometric ``Data`` object -- confirm with callers).
        """
        node_features, edges = data.x, data.edge_index

        # First layer: convolution -> ReLU -> dropout (dropout is only active
        # while the module is in training mode).
        hidden = F.relu(self.conv1(node_features, edges))
        hidden = F.dropout(hidden, p=0.5, training=self.training)

        # Second layer produces the raw scores that are then normalised.
        scores = self.conv2(hidden, edges)
        return F.log_softmax(scores, dim=1)

# Instantiate the GNN model
# NOTE(review): `X` is not defined by any earlier cell shown in this notebook;
# running this cell as-is ended in "NameError: name 'X' is not defined". The
# feature frame has to exist before this cell is executed.
input_dim = len(X.columns)  # Adjust input dimension based on the number of features
hidden_dim = 64
output_dim = 2  # Assuming binary classification for earthquake prediction
model = GNNModel(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
# GNNModel.forward already returns log-probabilities (F.log_softmax), so the
# matching criterion is the negative log-likelihood loss. CrossEntropyLoss
# expects raw scores and would apply log_softmax a second time.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


NameError: name 'X' is not defined

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize data distributions, feature relationships, model predictions, etc.

# For example, visualize the correlation matrix of features.
# Correlation is only defined for numeric data, and `data` still carries string
# columns (e.g. network_code / receiver_code). Restrict the frame to numeric and
# boolean columns first, so corr() does not fail on the text columns.
plt.figure(figsize=(10, 8))
numeric_data = data.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

# Visualize model performance metrics (e.g., accuracy, loss) during training
def plot_metrics(train_losses, val_losses, train_accuracies, val_accuracies):
    """Show the loss curves and the accuracy curves in two separate figures.

    Each argument is a sequence with one value per epoch. The x axis runs from
    1 to ``len(train_losses)`` for both figures.
    """
    curves = (
        ('Loss', train_losses, val_losses),
        ('Accuracy', train_accuracies, val_accuracies),
    )

    for metric, train_values, val_values in curves:
        plt.figure(figsize=(10, 5))
        # Epoch numbering is always derived from the training-loss history.
        epochs = range(1, len(train_losses) + 1)
        plt.plot(epochs, train_values, label=f'Train {metric}')
        plt.plot(epochs, val_values, label=f'Validation {metric}')
        plt.xlabel('Epochs')
        plt.ylabel(metric)
        plt.title(f'Training and Validation {metric}')
        plt.legend()
        plt.show()

# Usage: Call plot_metrics with training and validation metrics lists


In [None]:
# Train the model
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=10):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Every batch yielded by ``train_loader`` is passed whole to ``model`` and
    must expose its labels as ``.y``. After each epoch the model is scored on
    ``val_loader`` via ``evaluate_model`` and a one-line summary is printed.

    Returns:
        A 4-tuple of lists, one entry per epoch:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)``.
        The training loss is the sum of per-batch losses divided by
        ``len(train_loader)``.
    """
    loss_hist_train, loss_hist_val = [], []
    acc_hist_train, acc_hist_val = [], []

    for epoch_idx in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        n_correct = 0
        n_seen = 0

        for batch in train_loader:
            # Standard optimisation step: reset grads, forward, backward, update.
            optimizer.zero_grad()
            scores = model(batch)
            batch_loss = criterion(scores, batch.y)
            batch_loss.backward()
            optimizer.step()

            epoch_loss += batch_loss.item()
            # Index of the largest score along dim 1 is the predicted class.
            guesses = torch.max(scores, 1)[1]
            n_seen += batch.y.size(0)
            n_correct += (guesses == batch.y).sum().item()

        loss_hist_train.append(epoch_loss / len(train_loader))
        acc_hist_train.append(n_correct / n_seen)

        epoch_val_loss, epoch_val_acc = evaluate_model(model, criterion, val_loader)
        loss_hist_val.append(epoch_val_loss)
        acc_hist_val.append(epoch_val_acc)

        print(f'Epoch [{epoch_idx + 1}/{num_epochs}], '
              f'Train Loss: {loss_hist_train[-1]:.4f}, Train Acc: {acc_hist_train[-1]:.4f}, '
              f'Val Loss: {loss_hist_val[-1]:.4f}, Val Acc: {acc_hist_val[-1]:.4f}')

    return loss_hist_train, loss_hist_val, acc_hist_train, acc_hist_val

# Evaluate the model
def evaluate_model(model, criterion, data_loader):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Each batch is passed whole to ``model`` and must expose its labels as
    ``.y``.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the summed per-batch
        loss divided by ``len(data_loader)`` and ``accuracy`` is the fraction
        of samples whose highest-scoring class (along dim 1) equals the label.
    """
    model.eval()
    loss_sum = 0.0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in data_loader:
            scores = model(batch)
            loss_sum += criterion(scores, batch.y).item()
            # torch.max over dim 1 returns (values, indices); keep the indices.
            picks = torch.max(scores, 1)[1]
            seen += batch.y.size(0)
            hits += (picks == batch.y).sum().item()

    mean_loss = loss_sum / len(data_loader)
    accuracy = hits / seen
    return mean_loss, accuracy

# Make predictions
def predict(model, data):
    """Return the index of the highest score along dim 1 for ``model(data)``.

    The model is switched to eval mode and run with gradients disabled.
    """
    model.eval()
    with torch.no_grad():
        # torch.max over dim 1 yields (values, indices); the indices are the
        # predicted classes.
        class_ids = torch.max(model(data), 1)[1]
    return class_ids

# Usage: train the model, evaluate it on validation data, and make predictions on test data


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.feature_extraction import FeatureHasher


# Build train/validation/test graph inputs from the preprocessed `data` frame.
#
# Local imports: this cell uses NumPy, PyTorch and the torch_geometric Data
# container, none of which are imported at the top of the cell.
import numpy as np
import torch
from torch_geometric.data import Data

# Check the dimensions of the frame before the identifier columns are removed
print("Dimensions of concatenated data:", data.shape)

# Drop columns 'trace_start_time' and 'trace_name'. errors='ignore' keeps the
# cell re-runnable after the columns have already been removed.
data.drop(columns=['trace_start_time', 'trace_name'], inplace=True, errors='ignore')

# Split data into features and target. This must happen before the encoding
# steps below, which read X.
X = data.drop(columns=['trace_category'])
# The target is cast to an integer dtype further down, so map every distinct
# category label to an integer code first (label_classes[i] is the label that
# was assigned code i).
label_codes, label_classes = pd.factorize(data['trace_category'])
y = pd.Series(label_codes, index=data.index, name='trace_category')

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Perform feature hashing on categorical columns.
# With input_type='string' the hasher expects one iterable of strings per
# sample, so pass the rows as lists. Iterating a DataFrame directly yields its
# column labels rather than its rows.
hasher = FeatureHasher(n_features=10, input_type='string')
X_hashed = hasher.transform(X[categorical_cols].astype(str).values.tolist())

# Convert hashed features to a DataFrame that shares X's index, so the
# column-wise concat below lines the rows up instead of producing NaN rows.
X_hashed_df = pd.DataFrame(
    X_hashed.toarray(),
    index=X.index,
    columns=[f'hash_{i}' for i in range(X_hashed.shape[1])],
)

# Concatenate hashed features with numerical features
X_processed = pd.concat([X.drop(columns=categorical_cols), X_hashed_df], axis=1)

# Check the dimensions of X_processed and y
print("Dimensions of X_processed:", X_processed.shape)
print("Dimensions of y:", y.shape)

# Rows that exist on only one side would mean the steps above went wrong.
# Report them and stop, rather than dropping samples to force a match.
missing_indices = set(X_processed.index) - set(y.index)
print("Missing indices:", missing_indices)
extra_indices = set(y.index) - set(X_processed.index)
print("Extra indices:", extra_indices)
if missing_indices or extra_indices or len(X_processed) != len(y):
    raise ValueError("Number of samples in features and target are not consistent!")

# Give features and target the same fresh positional index
X_processed.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

# Split the data into train, validation, and test sets (80% / 10% / 10%)
X_train, X_temp, y_train, y_temp = train_test_split(X_processed, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert data to appropriate numeric types.
# Labels are requested as int64 explicitly: torch.long is a 64-bit integer,
# while `np.long` is missing from NumPy 1.24+ 1.x releases and is not a
# fixed-width type where it does exist.
X_train_values = X_train.values.astype(np.float32)
y_train_values = y_train.values.astype(np.int64)
X_val_values = X_val.values.astype(np.float32)
y_val_values = y_val.values.astype(np.int64)
X_test_values = X_test.values.astype(np.float32)
y_test_values = y_test.values.astype(np.int64)

# Define the data objects.
# NOTE(review): train_edge_index, val_edge_index and test_edge_index are not
# defined anywhere in the notebook as shown; they must be built before this
# point or the lines below raise NameError.
train_data = Data(x=torch.tensor(X_train_values),
                  y=torch.tensor(y_train_values),
                  edge_index=torch.tensor(train_edge_index, dtype=torch.long))
val_data = Data(x=torch.tensor(X_val_values),
                y=torch.tensor(y_val_values),
                edge_index=torch.tensor(val_edge_index, dtype=torch.long))
test_data = Data(x=torch.tensor(X_test_values),
                 y=torch.tensor(y_test_values),
                 edge_index=torch.tensor(test_edge_index, dtype=torch.long))
