4th try

In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F


# Step 1: Data Understanding and Preprocessing

# Load the dataset.
# The path is a raw string: as a normal literal it contains backslash
# sequences such as "\A", "\K" and "\S" that are not valid escapes, which
# Python reports as a warning and which are easy to break by accident.
data = pd.read_csv(r"D:\APURAV\K. K. Wagh\Study\BE\Semester VII\Final Year Project Sem VII\dataset\STEAD\merge.csv")

# Review the dataset
print(data.head())

# Handle missing values: drop every row that contains at least one NaN.
data.dropna(inplace=True)

# Handle outliers
# You may use techniques like Z-score, IQR, or domain-specific knowledge

# Feature engineering
# Extract relevant features
# Example: keep the trace identification columns in a separate frame
seismic_waveforms = data[['trace_start_time', 'trace_category', 'trace_name']]

# Data encoding and scaling
# Example: standardise the two numeric source columns in place
scaler = StandardScaler()
data[['source_magnitude', 'source_depth_km']] = scaler.fit_transform(data[['source_magnitude', 'source_depth_km']])

# Dimensionality reduction
# You may use techniques like PCA if needed

# Features are every column except the target; the target is trace_category.
X = data.drop(columns=['trace_category'])
y = data['trace_category']

In [None]:
class GNNModel(nn.Module):
    """Two stacked ``GCNConv`` layers followed by a log-softmax.

    Args:
        input_dim: Input width given to the first ``GCNConv``.
        hidden_dim: Output width of the first layer and input width of the
            second.
        output_dim: Output width of the second ``GCNConv``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Apply conv1 -> ReLU -> dropout -> conv2 -> log-softmax.

        Args:
            data: Object exposing ``x`` and ``edge_index`` attributes.

        Returns:
            The log-softmax (over dimension 1) of the second layer's output.
        """
        features, edges = data.x, data.edge_index

        hidden = F.relu(self.conv1(features, edges))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)

        scores = self.conv2(hidden, edges)
        return F.log_softmax(scores, dim=1)

# Build the network: one input unit per column of X, a 64-unit hidden layer
# and two outputs (binary classification is assumed here).
input_dim = X.shape[1]
hidden_dim, output_dim = 64, 2
model = GNNModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# Loss function and optimiser used by the training cells
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize data distributions, feature relationships, model predictions, etc.

# For example, visualize the correlation matrix of features.
# numeric_only=True restricts the correlation to numeric columns; the frame
# also holds text columns (e.g. trace_name), on which corr() raises in
# pandas >= 2.0 instead of silently skipping them.
plt.figure(figsize=(10, 8))
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

# Visualize model performance metrics (e.g., accuracy, loss) during training
def plot_metrics(train_losses, val_losses, train_accuracies, val_accuracies):
    """Show two line charts: loss per epoch, then accuracy per epoch.

    Args:
        train_losses: Per-epoch training losses; its length sets the x axis.
        val_losses: Per-epoch validation losses.
        train_accuracies: Per-epoch training accuracies.
        val_accuracies: Per-epoch validation accuracies.
    """
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(train_losses) + 1)

    # (train label, validation label, y label, title, train series, val series)
    charts = (
        ('Train Loss', 'Validation Loss', 'Loss',
         'Training and Validation Loss', train_losses, val_losses),
        ('Train Accuracy', 'Validation Accuracy', 'Accuracy',
         'Training and Validation Accuracy', train_accuracies, val_accuracies),
    )

    for train_label, val_label, y_label, title, train_series, val_series in charts:
        plt.figure(figsize=(10, 5))
        plt.plot(epochs, train_series, label=train_label)
        plt.plot(epochs, val_series, label=val_label)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
        plt.show()

# Usage: Call plot_metrics with training and validation metrics lists


In [None]:
# Train the model
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=10):
    """Train ``model`` and record loss/accuracy for every epoch.

    Each epoch runs one optimisation step per batch of ``train_loader`` and
    then scores the model on ``val_loader`` through ``evaluate_model``.

    Args:
        model: Callable module; ``model(batch)`` must return class scores.
        criterion: Loss called as ``criterion(scores, batch.y)``.
        optimizer: Optimiser stepping the model's parameters.
        train_loader: Iterable of training batches exposing ``y``.
        val_loader: Iterable of validation batches.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(train_losses, val_losses, train_accuracies,
        val_accuracies)`` with one entry per epoch in each list.
    """
    train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []

    for epoch in range(num_epochs):
        model.train()
        loss_sum, hits, seen = 0.0, 0, 0

        for batch in train_loader:
            optimizer.zero_grad()
            scores = model(batch)
            batch_loss = criterion(scores, batch.y)
            batch_loss.backward()
            optimizer.step()

            loss_sum += batch_loss.item()
            # Index of the highest score in each row is the predicted class.
            guesses = torch.max(scores, 1)[1]
            seen += batch.y.size(0)
            hits += (guesses == batch.y).sum().item()

        # Loss is averaged per batch, accuracy per sample.
        train_losses.append(loss_sum / len(train_loader))
        train_accuracies.append(hits / seen)

        epoch_val_loss, epoch_val_acc = evaluate_model(model, criterion, val_loader)
        val_losses.append(epoch_val_loss)
        val_accuracies.append(epoch_val_acc)

        print(f'Epoch [{epoch + 1}/{num_epochs}], '
              f'Train Loss: {train_losses[-1]:.4f}, Train Acc: {train_accuracies[-1]:.4f}, '
              f'Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}')

    return train_losses, val_losses, train_accuracies, val_accuracies

# Evaluate the model
def evaluate_model(model, criterion, data_loader):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Callable module; ``model(batch)`` must return class scores.
        criterion: Loss called as ``criterion(scores, batch.y)``.
        data_loader: Sized iterable of batches exposing ``y``.

    Returns:
        Tuple ``(loss, accuracy)``: the loss averaged over batches and the
        fraction of samples whose arg-max score equals ``batch.y``.
    """
    model.eval()
    loss_sum, hits, seen = 0.0, 0, 0

    with torch.no_grad():
        for batch in data_loader:
            scores = model(batch)
            loss_sum += criterion(scores, batch.y).item()
            # Index of the highest score in each row is the predicted class.
            guesses = torch.max(scores, 1)[1]
            seen += batch.y.size(0)
            hits += (guesses == batch.y).sum().item()

    mean_loss = loss_sum / len(data_loader)
    return mean_loss, hits / seen

# Make predictions
def predict(model, data):
    """Return the arg-max class index for each row of ``model(data)``.

    The model is switched to evaluation mode and called without gradient
    tracking.
    """
    model.eval()
    with torch.no_grad():
        scores = model(data)
        return torch.max(scores, 1)[1]

# Usage: train the model, evaluate it on validation data, and make predictions on test data


In [None]:
# Assuming X is your DataFrame with features
# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Perform feature hashing on categorical columns.
# FeatureHasher(input_type='string') expects one iterable of strings per
# sample. Iterating a DataFrame yields its column labels, so the frame is
# turned into a list of rows first; otherwise the hasher sees one "sample"
# per column instead of one per row.
hasher = FeatureHasher(n_features=10, input_type='string')
X_hashed = hasher.transform(X[categorical_cols].astype(str).values.tolist())

# Convert hashed features to a DataFrame that shares X's index, so the
# column-wise concat below pairs rows correctly (X's index can have gaps
# after the earlier dropna()).
X_hashed_df = pd.DataFrame(X_hashed.toarray(), index=X.index)

# Concatenate hashed features with numerical features
X_processed = pd.concat([X.drop(columns=categorical_cols), X_hashed_df], axis=1)

# Check the dimensions of X_processed and y
print("Dimensions of X_processed:", X_processed.shape)
print("Dimensions of y:", y.shape)

# Find the indices of rows that are present in X_processed but not in y
missing_indices = set(X_processed.index) - set(y.index)
print("Missing indices:", missing_indices)

# Find the indices of rows that are present in y but not in X_processed
extra_indices = set(y.index) - set(X_processed.index)
print("Extra indices:", extra_indices)

# Safety net: with the aligned index above the two should already agree.
if len(X_processed) != len(y):
    # If there are extra indices in y, drop those rows
    if extra_indices:
        print("Dropping extra indices in y:", extra_indices)
        y.drop(index=list(extra_indices), inplace=True)

    # If there are missing indices in y, drop those rows from X_processed
    if missing_indices:
        print("Dropping missing indices in X_processed:", missing_indices)
        X_processed.drop(index=list(missing_indices), inplace=True)

    # Check again for consistency
    if len(X_processed) != len(y):
        raise ValueError("Number of samples in features and target are not consistent!")

# Ensure that the indices of X_processed and y are aligned
X_processed.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

# Concatenate X_processed and y along the columns axis
data = pd.concat([X_processed, y], axis=1)


# Encode categorical labels in y as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into train (80%), validation (10%) and test (10%) sets
X_train, X_temp, y_train_encoded, y_temp_encoded = train_test_split(X_processed, y_encoded, test_size=0.2, random_state=42)
X_val, X_test, y_val_encoded, y_test_encoded = train_test_split(X_temp, y_temp_encoded, test_size=0.5, random_state=42)

# Convert data to appropriate numeric types
X_train_values = X_train.values.astype(np.float32)
y_train_values = y_train_encoded.astype(np.int64)
X_val_values = X_val.values.astype(np.float32)
y_val_values = y_val_encoded.astype(np.int64)
X_test_values = X_test.values.astype(np.float32)
y_test_values = y_test_encoded.astype(np.int64)

# NOTE(review): train_edge_index, val_edge_index and test_edge_index are not
# defined anywhere in the visible notebook; they must be created before this
# point.
# Define the data objects. `Data` is already the class imported from
# torch_geometric.data, so it is called directly (Data.Data does not exist).
train_data = Data(x=torch.tensor(X_train_values),
                  y=torch.tensor(y_train_values),
                  edge_index=torch.tensor(train_edge_index, dtype=torch.long))
val_data = Data(x=torch.tensor(X_val_values),
                y=torch.tensor(y_val_values),
                edge_index=torch.tensor(val_edge_index, dtype=torch.long))
test_data = Data(x=torch.tensor(X_test_values),
                 y=torch.tensor(y_test_values),
                 edge_index=torch.tensor(test_edge_index, dtype=torch.long))


5th try version 1

In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F

# Load the dataset
data = pd.read_csv("D:/APURAV/K. K. Wagh/Study/BE/Semester VII/Final Year Project Sem VII/dataset/STEAD/merge.csv")

# Explore the dataset
print(data.head())
data.info()  # info() prints its report itself and returns None

# Handle missing values: drop rows containing any NaN, then renumber the rows
# so frames built below from plain arrays line up with `data` in pd.concat.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

# Feature engineering
categorical_cols = data.select_dtypes(include=['object']).columns
# The target must not be hashed into the features (that would leak the label).
hashed_cols = [col for col in categorical_cols if col != 'trace_category']
hasher = FeatureHasher(n_features=10, input_type='string')
# input_type='string' expects one iterable of strings per sample; iterating a
# DataFrame yields column labels, so the frame is converted to a list of rows.
hashed_features = hasher.transform(data[hashed_cols].astype(str).values.tolist()).toarray()
# String column names keep every column name of processed_data a str; recent
# scikit-learn versions reject frames whose names mix str and int.
hashed_df = pd.DataFrame(
    hashed_features,
    index=data.index,
    columns=[f"hashed_{i}" for i in range(hashed_features.shape[1])],
)
numeric_part = data.drop(columns=categorical_cols).drop(columns=['trace_category'], errors='ignore')
processed_data = pd.concat([numeric_part, hashed_df], axis=1)

# Encoding categorical labels in y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['trace_category'])

# Ensure consistency in the number of samples
if len(processed_data) != len(y_encoded):
    raise ValueError("Number of samples in features and target are not consistent!")

# Split data into train (80%), validation (10%) and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(processed_data, y_encoded, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Data preprocessing: fit the scaler on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert data to appropriate tensor types
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define the GCN model
class GNNModel(nn.Module):
    """Graph network made of two ``GCNConv`` layers.

    Args:
        input_dim: Input width of the first convolution.
        hidden_dim: Width shared between the two convolutions.
        output_dim: Output width of the second convolution.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return log-softmax scores computed from ``data.x`` and ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        activated = F.relu(self.conv1(x, edge_index))
        # Dropout only has an effect while self.training is True.
        regularised = F.dropout(activated, p=0.5, training=self.training)
        return F.log_softmax(self.conv2(regularised, edge_index), dim=1)

# Instantiate the model: one input unit per feature column, one output per
# encoded class.
input_dim = X_train_tensor.shape[1]
hidden_dim = 64
output_dim = len(label_encoder.classes_)
model = GNNModel(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Define data objects.
# GNNModel.forward hands data.edge_index to GCNConv, so every split needs one.
# No graph structure is built in this notebook, so an explicit empty edge list
# (shape (2, 0), integer dtype) is used: each row is an isolated node.
no_edges = torch.empty((2, 0), dtype=torch.long)
train_data = Data(x=X_train_tensor, edge_index=no_edges.clone(), y=y_train_tensor)
val_data = Data(x=X_val_tensor, edge_index=no_edges.clone(), y=y_val_tensor)
test_data = Data(x=X_test_tensor, edge_index=no_edges.clone(), y=y_test_tensor)

# Print dimensions: number of nodes (rows), node feature width, edge feature
# width. len() of a Data object is not a dimension, so num_nodes is reported
# instead.
print("Dimensions of train_data:", train_data.num_nodes, train_data.num_node_features, train_data.num_edge_features)
print("Dimensions of val_data:", val_data.num_nodes, val_data.num_node_features, val_data.num_edge_features)
print("Dimensions of test_data:", test_data.num_nodes, test_data.num_node_features, test_data.num_edge_features)


In [None]:
pip install torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cu118.html

5th try version 2

In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader  # Import DataLoader from torch.utils.data

# Load the dataset
data = pd.read_csv("D:/APURAV/K. K. Wagh/Study/BE/Semester VII/Final Year Project Sem VII/dataset/STEAD/merge.csv")

# Explore the dataset
print(data.head())
data.info()  # info() prints its report itself and returns None

# Handle missing values: drop rows containing any NaN, then renumber the rows
# so frames built below from plain arrays line up with `data` in pd.concat.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

# Feature engineering
categorical_cols = data.select_dtypes(include=['object']).columns
# The target must not be hashed into the features (that would leak the label).
hashed_cols = [col for col in categorical_cols if col != 'trace_category']
hasher = FeatureHasher(n_features=10, input_type='string')
# input_type='string' expects one iterable of strings per sample; iterating a
# DataFrame yields column labels, so the frame is converted to a list of rows.
hashed_features = hasher.transform(data[hashed_cols].astype(str).values.tolist()).toarray()
# String column names keep every column name of processed_data a str; recent
# scikit-learn versions reject frames whose names mix str and int.
hashed_df = pd.DataFrame(
    hashed_features,
    index=data.index,
    columns=[f"hashed_{i}" for i in range(hashed_features.shape[1])],
)
numeric_part = data.drop(columns=categorical_cols).drop(columns=['trace_category'], errors='ignore')
processed_data = pd.concat([numeric_part, hashed_df], axis=1)

# Encoding categorical labels in y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['trace_category'])

# Ensure consistency in the number of samples
if len(processed_data) != len(y_encoded):
    raise ValueError("Number of samples in features and target are not consistent!")

# Split data into train (80%), validation (10%) and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(processed_data, y_encoded, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Data preprocessing: fit the scaler on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert data to appropriate tensor types
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define the GCN model
class GNNModel(nn.Module):
    """Two-layer GCN returning log-softmax scores.

    Args:
        input_dim: Input width of the first ``GCNConv``.
        hidden_dim: Output width of the first layer / input of the second.
        output_dim: Output width of the second ``GCNConv``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index=None):
        """Compute log-softmax class scores.

        Args:
            x: Either a feature tensor, or a ``Data``-like object exposing
                ``x`` (and optionally ``edge_index``). The training and
                evaluation loops in this notebook call ``model(data)``, so
                both forms are accepted.
            edge_index: Optional edge list. When it is ``None`` (and the
                ``Data``-like object carries none either) an empty edge list
                is used for both layers.

        Returns:
            Log-softmax over dimension 1 of the second layer's output.
        """
        if not torch.is_tensor(x):
            graph = x
            x = graph.x
            if edge_index is None:
                edge_index = getattr(graph, 'edge_index', None)

        if edge_index is None:
            # GCNConv requires an integer edge_index; an empty (2, 0) tensor
            # on the same device as x stands for "no edges".
            edge_index = torch.empty((2, 0), dtype=torch.long, device=x.device)

        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Instantiate the model: one input unit per feature column, one output per
# encoded class.
input_dim, hidden_dim = X_train_tensor.shape[1], 64
output_dim = len(label_encoder.classes_)
model = GNNModel(input_dim, hidden_dim, output_dim)

# Loss function and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

# Each split becomes a one-element list holding a single Data object without
# an edge list (edge_index=None).
train_data, val_data, test_data = (
    [Data(x=features, edge_index=None, y=labels)]
    for features, labels in (
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

# Report list length, node feature width and edge feature width per split
for label, split in (
    ("Dimensions of train_data:", train_data),
    ("Dimensions of val_data:", val_data),
    ("Dimensions of test_data:", test_data),
):
    print(label, len(split), split[0].num_node_features, split[0].num_edge_features)

# Training parameters
num_epochs, batch_size = 20, 64

# Create data loaders using torch.utils.data.DataLoader
def collate(data_list):
    """Merge a list of ``Data`` objects into one batch by concatenating nodes.

    The node features and labels of all items are concatenated along
    dimension 0, so ``x`` stays 2-D (nodes x features) and ``y`` stays 1-D.
    Stacking instead would add a leading batch axis, which does not match the
    ``log_softmax(dim=1)`` / ``CrossEntropyLoss`` usage of the model and
    training loop in this notebook.

    The returned batch carries an empty integer edge list: the inputs are
    built with ``edge_index=None`` and any edges they might hold are not
    carried over.

    Args:
        data_list: Items exposing tensor attributes ``x`` and ``y``.

    Returns:
        A single ``Data`` object with ``x``, ``y`` and an empty ``edge_index``.
    """
    features = torch.cat([item.x for item in data_list], dim=0)
    labels = torch.cat([item.y.view(-1) for item in data_list], dim=0)
    # (2, 0) long tensor == "no edges"; GCNConv cannot consume None.
    no_edges = torch.empty((2, 0), dtype=torch.long, device=features.device)
    return Data(x=features, edge_index=no_edges, y=labels)

# Options shared by all three loaders; only the training loader reshuffles.
loader_options = {'batch_size': batch_size, 'collate_fn': collate}
train_loader = DataLoader(train_data, shuffle=True, **loader_options)
val_loader = DataLoader(val_data, **loader_options)
test_loader = DataLoader(test_data, **loader_options)

# Rest of the code remains unchanged
def train_model(model, criterion, optimizer, train_loader, val_loader, device, num_epochs=10):
    """Train ``model`` on ``device`` and record per-epoch loss and accuracy.

    Args:
        model: Module called as ``model(batch)``; must return class scores
            with one row per label.
        criterion: Loss called as ``criterion(scores, labels)`` with the
            labels flattened to 1-D.
        optimizer: Optimiser stepping the model's parameters.
        train_loader: Sized iterable of batches exposing ``to(device)`` and
            ``y``.
        val_loader: Same, for validation.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(train_losses, val_losses, train_accuracies,
        val_accuracies)`` with one entry per epoch in each list.
    """
    # The batches are moved to `device` below, so the parameters must live
    # there too; otherwise the forward pass mixes devices.
    model.to(device)

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        correct_train = 0
        total_train = 0

        for batch in train_loader:
            optimizer.zero_grad()
            data = batch.to(device)  # Move data to the appropriate device
            targets = data.y.view(-1)
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_train += (predicted == targets).sum().item()
            # Count labels, not the size of y's first axis: y may carry a
            # leading batch axis, which would inflate the accuracy.
            total_train += targets.numel()

        train_losses.append(train_loss / len(train_loader))
        train_accuracies.append(correct_train / total_train)

        model.eval()
        val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for batch in val_loader:
                data = batch.to(device)  # Move data to the appropriate device
                targets = data.y.view(-1)
                outputs = model(data)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                correct_val += (predicted == targets).sum().item()
                total_val += targets.numel()

        val_losses.append(val_loss / len(val_loader))
        val_accuracies.append(correct_val / total_val)

        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_losses[-1]:.4f}, Train Acc: {train_accuracies[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}')

    return train_losses, val_losses, train_accuracies, val_accuracies

def evaluate_model(model, criterion, dataloader):
    """Return ``(mean batch loss, accuracy)`` of ``model`` over ``dataloader``.

    Labels are flattened to 1-D before the loss and the accuracy are
    computed, matching ``train_model``. When the model exposes parameters,
    every batch is first moved to the device of its first parameter so that
    evaluation also works after the model was trained on another device.

    Args:
        model: Module called as ``model(batch)``; must return class scores
            with one row per label.
        criterion: Loss called as ``criterion(scores, labels)``.
        dataloader: Sized iterable of batches exposing ``y`` (and ``to`` when
            the model has parameters).

    Returns:
        Tuple ``(loss, accuracy)``: loss averaged over batches, accuracy
        averaged over labels.
    """
    model.eval()

    # Follow the model's own device; None means "leave batches where they are".
    first_param = next(iter(model.parameters()), None) if hasattr(model, 'parameters') else None
    device = None if first_param is None else first_param.device

    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for data in dataloader:
            if device is not None:
                data = data.to(device)
            targets = data.y.view(-1)
            outputs = model(data)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == targets).sum().item()
            # Count labels rather than the size of y's first axis.
            total += targets.numel()

    loss = total_loss / len(dataloader)
    accuracy = correct / total

    return loss, accuracy

def predict(model, data_loader):
    """Return the predicted class index for every row in ``data_loader``.

    When the model exposes parameters, each batch is first moved to the
    device of its first parameter, so prediction also works after the model
    was trained on another device.

    Args:
        model: Module called as ``model(batch)``; must return class scores.
        data_loader: Iterable of batches (exposing ``to`` when the model has
            parameters).

    Returns:
        A flat list of integer class indices, in loader order.
    """
    model.eval()

    # Follow the model's own device; None means "leave batches where they are".
    first_param = next(iter(model.parameters()), None) if hasattr(model, 'parameters') else None
    device = None if first_param is None else first_param.device

    predictions = []

    with torch.no_grad():
        for data in data_loader:
            if device is not None:
                data = data.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.tolist())

    return predictions

# Assuming you have the train_model, evaluate_model, and predict functions defined as shown above,
# you can then proceed to use these functions with your model.

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train the model
train_losses, val_losses, train_accuracies, val_accuracies = train_model(model, criterion, optimizer, train_loader, val_loader, device=device, num_epochs=num_epochs)

# Evaluate the model on the test set
test_loss, test_accuracy = evaluate_model(model, criterion, test_loader)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

# Make predictions on the test set.
# predict() takes a whole loader and returns a flat list of class indices, so
# it is called once on test_loader rather than once per batch. The loader is
# not shuffled, so a second pass yields the labels in the same order.
predictions = predict(model, test_loader)
ground_truth = [label for batch in test_loader for label in batch.y.view(-1).tolist()]

# Convert predictions and ground truth to numpy arrays
predictions = np.array(predictions)
ground_truth = np.array(ground_truth)


5th try version 3

In [None]:
import torch
import torch_sparse
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Load the dataset
data = pd.read_csv("D:/APURAV/K. K. Wagh/Study/BE/Semester VII/Final Year Project Sem VII/dataset/STEAD/merge.csv")

# Explore the dataset
print(data.head())
data.info()  # info() prints its report itself and returns None

# Handle missing values: drop rows containing any NaN, then renumber the rows
# so frames built below from plain arrays line up with `data` in pd.concat.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

# Feature engineering
categorical_cols = data.select_dtypes(include=['object']).columns
# The target must not be hashed into the features (that would leak the label).
hashed_cols = [col for col in categorical_cols if col != 'trace_category']
hasher = FeatureHasher(n_features=10, input_type='string')
# input_type='string' expects one iterable of strings per sample; iterating a
# DataFrame yields column labels, so the frame is converted to a list of rows.
hashed_features = hasher.transform(data[hashed_cols].astype(str).values.tolist()).toarray()
# String column names keep every column name of processed_data a str; recent
# scikit-learn versions reject frames whose names mix str and int.
hashed_df = pd.DataFrame(
    hashed_features,
    index=data.index,
    columns=[f"hashed_{i}" for i in range(hashed_features.shape[1])],
)
numeric_part = data.drop(columns=categorical_cols).drop(columns=['trace_category'], errors='ignore')
processed_data = pd.concat([numeric_part, hashed_df], axis=1)

# Encoding categorical labels in y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['trace_category'])

# Ensure consistency in the number of samples
if len(processed_data) != len(y_encoded):
    raise ValueError("Number of samples in features and target are not consistent!")

# Split data into train (80%), validation (10%) and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(processed_data, y_encoded, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Data preprocessing: fit the scaler on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert data to appropriate tensor types
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Update the forward method of GNNModel
class GNNModel(nn.Module):
    """Pair of ``GCNConv`` layers with ReLU and dropout in between.

    Args:
        input_dim: Input width of the first convolution.
        hidden_dim: Output width of the first / input width of the second.
        output_dim: Output width of the second convolution.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return the log-softmax (dimension 1) of the two-layer output.

        ``data`` must expose ``x`` and ``edge_index``; both layers receive the
        same ``edge_index``.
        """
        node_features, edges = data.x, data.edge_index
        first = self.conv1(node_features, edges)
        first = F.relu(first)
        # No-op unless the module is in training mode.
        first = F.dropout(first, p=0.5, training=self.training)
        second = self.conv2(first, edges)
        return F.log_softmax(second, dim=1)

# Update the training loop
def _run_epoch(model, criterion, loader, device, optimizer=None):
    """Run one pass over ``loader``; optimise when ``optimizer`` is given.

    Returns ``(mean batch loss, accuracy over labels)``.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, correct, total = 0.0, 0, 0

    # Gradients are only tracked during the training pass.
    with torch.set_grad_enabled(training):
        for data in loader:
            data = data.to(device)  # Move data to the appropriate device
            targets = data.y.view(-1)
            if training:
                optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            if training:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == targets).sum().item()
            # Count labels, not the size of y's first axis: y may carry a
            # leading batch axis, which would inflate the accuracy.
            total += targets.numel()

    return loss_sum / len(loader), correct / total


def train_model(model, criterion, optimizer, train_loader, val_loader, device, num_epochs=10):
    """Train ``model`` on ``device`` and record per-epoch loss and accuracy.

    Args:
        model: Module called as ``model(batch)``; must return class scores
            with one row per label.
        criterion: Loss called as ``criterion(scores, labels)`` with the
            labels flattened to 1-D.
        optimizer: Optimiser stepping the model's parameters.
        train_loader: Sized iterable of batches exposing ``to(device)`` and
            ``y``.
        val_loader: Same, for validation.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(train_losses, val_losses, train_accuracies,
        val_accuracies)`` with one entry per epoch in each list.
    """
    # The batches are moved to `device`, so the parameters must live there
    # too; otherwise the forward pass mixes devices.
    model.to(device)

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        train_loss, train_acc = _run_epoch(model, criterion, train_loader, device, optimizer)
        train_losses.append(train_loss)
        train_accuracies.append(train_acc)

        val_loss, val_acc = _run_epoch(model, criterion, val_loader, device)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_losses[-1]:.4f}, Train Acc: {train_accuracies[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}')

    return train_losses, val_losses, train_accuracies, val_accuracies


# Model sizes: inputs follow the feature tensor's width, outputs follow the
# number of encoded classes.
hidden_dim = 64
input_dim = X_train_tensor.shape[1]
output_dim = len(label_encoder.classes_)
model = GNNModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# Loss and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# One Data object per split, wrapped in a list; no edge list is attached.
train_graph = Data(x=X_train_tensor, edge_index=None, y=y_train_tensor)
val_graph = Data(x=X_val_tensor, edge_index=None, y=y_val_tensor)
test_graph = Data(x=X_test_tensor, edge_index=None, y=y_test_tensor)
train_data, val_data, test_data = [train_graph], [val_graph], [test_graph]

# List length, node feature width and edge feature width of each split
print("Dimensions of train_data:", len(train_data), train_graph.num_node_features, train_graph.num_edge_features)
print("Dimensions of val_data:", len(val_data), val_graph.num_node_features, val_graph.num_edge_features)
print("Dimensions of test_data:", len(test_data), test_graph.num_node_features, test_graph.num_edge_features)

# Training parameters
batch_size = 64
num_epochs = 20

# Create data loaders using torch.utils.data.DataLoader
def collate(data_list):
    """Merge a list of ``Data`` objects into one batch by concatenating nodes.

    Features and labels of all items are concatenated along dimension 0, so
    ``x`` stays 2-D (nodes x features) and ``y`` stays 1-D. Stacking instead
    would add a leading batch axis that the model's ``log_softmax(dim=1)``
    and the loss on ``y.view(-1)`` in this notebook cannot consume.

    The batch gets an empty integer edge list because ``GNNModel.forward``
    passes ``edge_index`` straight to ``GCNConv``; the inputs are created
    with ``edge_index=None`` and any edges they might hold are not carried
    over.

    Args:
        data_list: Items exposing tensor attributes ``x`` and ``y``.

    Returns:
        A single ``Data`` object with ``x``, ``y`` and an empty ``edge_index``.
    """
    features = torch.cat([item.x for item in data_list], dim=0)
    labels = torch.cat([item.y.view(-1) for item in data_list], dim=0)
    # (2, 0) long tensor == "no edges".
    no_edges = torch.empty((2, 0), dtype=torch.long, device=features.device)
    return Data(x=features, edge_index=no_edges, y=labels)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
val_loader = DataLoader(val_data, batch_size=batch_size, collate_fn=collate)
test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate)

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train the model
train_losses, val_losses, train_accuracies, val_accuracies = train_model(model, criterion, optimizer, train_loader, val_loader, device=device, num_epochs=num_epochs)

# NOTE(review): evaluate_model and predict are not defined in this cell; they
# are expected to come from an earlier cell. The loader-based
# predict(model, data_loader) returning a flat list is assumed - confirm.

# Evaluate the model on the test set
test_loss, test_accuracy = evaluate_model(model, criterion, test_loader)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

# Make predictions on the test set.
# predict() is called once on the whole loader rather than once per batch.
# The loader is not shuffled, so a second pass yields the labels in the same
# order as the predictions.
predictions = predict(model, test_loader)
ground_truth = [label for batch in test_loader for label in batch.y.view(-1).tolist()]

# Convert predictions and ground truth to numpy arrays
predictions = np.array(predictions)
ground_truth = np.array(ground_truth)


5th try version 4

In [None]:
import torch
import torch_scatter
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Load the dataset
data = pd.read_csv("D:/APURAV/K. K. Wagh/Study/BE/Semester VII/Final Year Project Sem VII/dataset/STEAD/merge.csv")

# Explore the dataset
print(data.head())
data.info()  # info() prints its report itself and returns None

# Handle missing values: drop rows containing any NaN, then renumber the rows
# so frames built below from plain arrays line up with `data` in pd.concat.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

# Feature engineering
categorical_cols = data.select_dtypes(include=['object']).columns
# The target must not be hashed into the features (that would leak the label).
hashed_cols = [col for col in categorical_cols if col != 'trace_category']
hasher = FeatureHasher(n_features=10, input_type='string')
# input_type='string' expects one iterable of strings per sample; iterating a
# DataFrame yields column labels, so the frame is converted to a list of rows.
hashed_features = hasher.transform(data[hashed_cols].astype(str).values.tolist()).toarray()
# String column names keep every column name of processed_data a str; recent
# scikit-learn versions reject frames whose names mix str and int.
hashed_df = pd.DataFrame(
    hashed_features,
    index=data.index,
    columns=[f"hashed_{i}" for i in range(hashed_features.shape[1])],
)
numeric_part = data.drop(columns=categorical_cols).drop(columns=['trace_category'], errors='ignore')
processed_data = pd.concat([numeric_part, hashed_df], axis=1)

# Encoding categorical labels in y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['trace_category'])

# Ensure consistency in the number of samples
if len(processed_data) != len(y_encoded):
    raise ValueError("Number of samples in features and target are not consistent!")

# Split data into train (80%), validation (10%) and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(processed_data, y_encoded, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Data preprocessing: fit the scaler on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert data to appropriate tensor types
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define the GCN model
class GNNModel(nn.Module):
    """Two-layer GCN returning log-softmax scores.

    Args:
        input_dim: Input width of the first ``GCNConv``.
        hidden_dim: Output width of the first layer / input of the second.
        output_dim: Output width of the second ``GCNConv``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Compute log-softmax class scores from ``data.x``.

        Args:
            data: Object exposing ``x`` and ``edge_index``. When
                ``edge_index`` is ``None`` an empty edge list is substituted
                for both layers.

        Returns:
            Log-softmax over dimension 1 of the second layer's output.
        """
        x, edge_index = data.x, data.edge_index
        if edge_index is None:
            # The placeholder must be an integer tensor on the same device as
            # x, and it must reach conv2 as well as conv1.
            edge_index = torch.empty((2, 0), dtype=torch.long, device=x.device)
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Network dimensions come from the training tensor and the label encoder.
input_dim = X_train_tensor.shape[1]
hidden_dim = 64
output_dim = len(label_encoder.classes_)
model = GNNModel(input_dim, hidden_dim, output_dim)

# Loss function and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Wrap every split in a one-element list containing a Data object that has
# features and labels but no edge list.
split_tensors = (
    (X_train_tensor, y_train_tensor),
    (X_val_tensor, y_val_tensor),
    (X_test_tensor, y_test_tensor),
)
train_data, val_data, test_data = (
    [Data(x=split_x, edge_index=None, y=split_y)] for split_x, split_y in split_tensors
)

# Print list length, node feature width and edge feature width per split
for caption, dataset in zip(
    ("Dimensions of train_data:", "Dimensions of val_data:", "Dimensions of test_data:"),
    (train_data, val_data, test_data),
):
    print(caption, len(dataset), dataset[0].num_node_features, dataset[0].num_edge_features)

# Training parameters
num_epochs = 20
batch_size = 64

# Create data loaders using torch.utils.data.DataLoader
def collate(data_list):
    """Merge a list of ``Data`` objects into one batch by concatenating nodes.

    Features and labels of all items are concatenated along dimension 0, so
    ``x`` stays 2-D (nodes x features) and ``y`` stays 1-D. Stacking instead
    would add a leading batch axis that the model's ``log_softmax(dim=1)``
    and the loss on ``y.view(-1)`` in this notebook cannot consume.

    The batch carries an empty integer edge list; the inputs are created with
    ``edge_index=None`` and any edges they might hold are not carried over.

    Args:
        data_list: Items exposing tensor attributes ``x`` and ``y``.

    Returns:
        A single ``Data`` object with ``x``, ``y`` and an empty ``edge_index``.
    """
    features = torch.cat([item.x for item in data_list], dim=0)
    labels = torch.cat([item.y.view(-1) for item in data_list], dim=0)
    # (2, 0) long tensor == "no edges".
    no_edges = torch.empty((2, 0), dtype=torch.long, device=features.device)
    return Data(x=features, edge_index=no_edges, y=labels)

# One loader per split; only the training split is reshuffled.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=reshuffle, collate_fn=collate)
    for split, reshuffle in ((train_data, True), (val_data, False), (test_data, False))
)

# Define the training function
def train_model(model, criterion, optimizer, train_loader, val_loader, device, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        model: Module called as ``model(batch)``; returns per-row class scores.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        train_loader: Iterable of batches exposing ``.to(device)`` and ``.y``.
        val_loader: Iterable of validation batches with the same interface.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)``,
        four lists with one entry per epoch.

    Raises:
        ZeroDivisionError: If a loader yields no batches.
    """
    model.to(device)
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_train = 0
        correct_train = 0
        running_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            data = batch.to(device)
            # Flatten the labels once and use the same tensor for the loss and
            # the accuracy count; comparing against an un-flattened label
            # tensor would broadcast and miscount.
            labels = data.y.view(-1)
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_train += labels.numel()
            correct_train += (predicted == labels).sum().item()
        train_loss = running_loss / len(train_loader)
        train_accuracy = correct_train / total_train
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        # Validation phase
        model.eval()
        total_val = 0
        correct_val = 0
        val_running_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                data = batch.to(device)
                labels = data.y.view(-1)
                outputs = model(data)
                batch_loss = criterion(outputs, labels)
                val_running_loss += batch_loss.item()
                _, predicted = torch.max(outputs, 1)
                total_val += labels.numel()
                correct_val += (predicted == labels).sum().item()
        val_loss = val_running_loss / len(val_loader)
        val_accuracy = correct_val / total_val
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f"Epoch [{epoch+1}/{num_epochs}], "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    return train_losses, val_losses, train_accuracies, val_accuracies

def evaluate_model(model, criterion, dataloader):
    """Compute the mean batch loss and overall accuracy of ``model``.

    Args:
        model: Module called as ``model(batch)``; returns per-row class scores.
        criterion: Loss called as ``criterion(outputs, labels)``.
        dataloader: Sized iterable of batches exposing ``.y`` (and ``.to``
            when the model has parameters).

    Returns:
        ``(loss, accuracy)``: the loss averaged over batches and the fraction
        of correctly predicted labels.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.eval()
    # Evaluate on whatever device the model already lives on, so a model
    # trained on the GPU is not fed CPU batches.
    first_param = next(model.parameters(), None)
    device = None if first_param is None else first_param.device

    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for data in dataloader:
            if device is not None:
                data = data.to(device)
            # Flatten labels, matching how train_model feeds the loss.
            labels = data.y.view(-1)
            outputs = model(data)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.numel()

    loss = total_loss / len(dataloader)
    accuracy = correct / total

    return loss, accuracy

def predict(model, data_loader):
    """Return the predicted class index for every row in ``data_loader``.

    Args:
        model: Module called as ``model(batch)``; returns per-row class scores.
        data_loader: Iterable of batches (each must expose ``.to`` when the
            model has parameters).

    Returns:
        A flat Python list of predicted class indices, in iteration order.
    """
    model.eval()
    # Run on the model's own device so GPU-resident models do not receive
    # CPU batches.
    first_param = next(model.parameters(), None)
    device = None if first_param is None else first_param.device

    predictions = []
    with torch.no_grad():
        for data in data_loader:
            if device is not None:
                data = data.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.tolist())

    return predictions

# Assuming you have the train_model, evaluate_model, and predict functions defined as shown above,
# you can then proceed to use these functions with your model.

# Train the model on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_losses, val_losses, train_accuracies, val_accuracies = train_model(model, criterion, optimizer, train_loader, val_loader, device=device, num_epochs=num_epochs)

# Evaluate the model on the test set
test_loss, test_accuracy = evaluate_model(model, criterion, test_loader)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

# Make predictions on the test set, collecting the matching ground truth.
# The model is called directly on each batch: predict() expects a whole
# loader and returns a plain list, so it cannot be applied per batch.
predictions = []
ground_truth = []
model.eval()
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)  # the model was moved to `device` by train_model
        outputs = model(data)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.tolist())
        ground_truth.extend(data.y.view(-1).tolist())

# Convert predictions and ground truth to numpy arrays
predictions = np.array(predictions)
ground_truth = np.array(ground_truth)
