In [None]:
# This notebook will guide you through the process of loading, analyzing, and utilizing
# a glioma dataset from the UCI Machine Learning Repository.


In [None]:
!pip install ucimlrepo

In [None]:
# Now that we have the necessary library installed, let's fetch the glioma dataset from the UCI repository.
# This dataset contains clinical and molecular features used for glioma grading.

from ucimlrepo import fetch_ucirepo

# Download UCI dataset #759 (glioma grading: clinical and mutation features)
glioma_grading_clinical_and_mutation_features = fetch_ucirepo(id=759)

# Feature matrix and target column(s), both exposed as pandas dataframes
X, y = (
    glioma_grading_clinical_and_mutation_features.data.features,
    glioma_grading_clinical_and_mutation_features.data.targets,
)

# Dataset-level metadata, followed by the per-variable description table
print(glioma_grading_clinical_and_mutation_features.metadata)
print(glioma_grading_clinical_and_mutation_features.variables)


In [None]:
# Now that we have the dataset, let's explore it to understand its structure and content.
# We will start by examining the first few rows of the features and target dataframes.

import pandas as pd

# Display the first few rows of the features dataframe
print("Features:")
print(X.head())

# Display the first few rows of the target dataframe
print("\nTargets:")
print(y.head())

# Check for missing values in the dataset
print("\nMissing values in features:")
print(X.isnull().sum())

print("\nMissing values in targets:")
print(y.isnull().sum())

# Rows without a target cannot be used for supervised training. They are dropped
# rather than mean-filled: the mean of a class label is not a valid class.
# The mask is positional, so it does not rely on X and y sharing an index.
# .copy() gives this notebook its own frames, so the imputation below neither
# mutates the frames held by the dataset object nor writes through a view.
target_present = ~y.isnull().any(axis=1).to_numpy()
X = X.loc[target_present].copy()
y = y.loc[target_present].copy()

# Replace missing values in numeric columns with the mean of the column.
# Assigning the result back (instead of a chained `inplace=True` call on
# X[column]) is what actually updates X under pandas Copy-on-Write.
for column in X.select_dtypes(include=['number']).columns:
    X[column] = X[column].fillna(X[column].mean())

# Replace missing values in non-numeric columns with the mode of the column
for column in X.select_dtypes(exclude=['number']).columns:
    modes = X[column].mode()
    if not modes.empty:  # an all-missing column has no mode to fill with
        X[column] = X[column].fillna(modes.iloc[0])


In [None]:
# Let's split the dataset into training and testing sets for model training and evaluation.
# We will use 80% of the data for training and 20% for testing.

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition
print("Training set shape (features):", X_train.shape)
print("Training set shape (targets):", y_train.shape)
print("Testing set shape (features):", X_test.shape)
print("Testing set shape (targets):", y_test.shape)


In [None]:
# Let's define a simple neural network architecture using PyTorch.
# We will also ensure that the model uses GPU resources if available.

import torch
import torch.nn as nn
import torch.optim as optim

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Define a simple neural network model
class SimpleNN(nn.Module):
    """Feed-forward network with one hidden layer and a sigmoid output.

    Data flows through ``fc1`` -> ReLU -> ``fc2`` -> sigmoid, so every output
    element lies between 0 and 1.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers and their activation modules."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated output for input ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Seed PyTorch's global RNG so that weight initialisation (and the DataLoader
# shuffling later on, which draws from the same generator by default) is
# repeatable across Restart & Run All. 42 matches the train_test_split seed.
torch.manual_seed(42)

# Initialize the model, loss function, and optimizer
input_size = X_train.shape[1]  # one input unit per feature column
hidden_size = 50
output_size = 1                # single sigmoid unit for the binary target

model = SimpleNN(input_size, hidden_size, output_size).to(device)
criterion = nn.BCELoss()       # expects probabilities, which the model's sigmoid provides
optimizer = optim.Adam(model.parameters(), lr=0.001)

print(model)


In [None]:
# Let's encode the categorical variables before converting the pandas dataframes into PyTorch tensors.
from sklearn.preprocessing import LabelEncoder

# Work on private copies: X_train / X_test are selections of X produced by
# train_test_split, and writing columns into them directly can trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Encode categorical columns as integer codes learned from the training split only
categorical_cols = X_train.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # Map test values through the training categories (same codes as
    # le.transform for known values). A category that never occurs in the
    # training split becomes -1 instead of raising ValueError.
    X_test[col] = pd.Categorical(X_test[col], categories=le.classes_).codes

# Convert the pandas dataframes into PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)

# Create DataLoader objects for training and testing datasets
from torch.utils.data import TensorDataset, DataLoader

# Pair each feature row with its label so they are batched together
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Mini-batches of 32; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Sanity check: tensor shapes
print("Training set tensor shape:", X_train_tensor.shape, y_train_tensor.shape)
print("Testing set tensor shape:", X_test_tensor.shape, y_test_tensor.shape)

# Sanity check: every tensor should report the selected device
print(f"X_train_tensor is on device: {X_train_tensor.device}")
print(f"y_train_tensor is on device: {y_train_tensor.device}")
print(f"X_test_tensor is on device: {X_test_tensor.device}")
print(f"y_test_tensor is on device: {y_test_tensor.device}")

# Sanity check: the model's first parameter shows where its weights live
print(f"Model is on device: {next(model.parameters()).device}")


In [None]:
# Define the training loop to train the model using the training dataset

# How many full passes over the training data the basic training loop makes
num_epochs = 50

# Function to train the model
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    """Train ``model`` in place for ``num_epochs`` passes over ``train_loader``.

    Prints the loss of every 10th batch and the mean batch loss of each epoch.

    Args:
        model: ``torch.nn.Module`` to optimise.
        train_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
    """
    # Take the target device from the model itself rather than from a
    # notebook-level global, so batches always land where the weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            if target_device is not None:
                inputs, labels = inputs.to(target_device), labels.to(target_device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Print statistics
            running_loss += loss.item()
            if (i + 1) % 10 == 0:    # Print every 10 batches
                print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

        print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {running_loss / len(train_loader):.4f}")

# Run the basic training loop over the training batches
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
)


In [None]:
# Function to evaluate the model
def evaluate_model(model, test_loader):
    """Print the classification accuracy of ``model`` over ``test_loader``.

    Outputs strictly greater than 0.5 count as class 1, everything else as
    class 0. The model is switched to eval mode and gradients are disabled.

    Args:
        model: ``torch.nn.Module`` whose outputs are thresholded at 0.5.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    # Take the target device from the model itself rather than from a
    # notebook-level global, so batches always land where the weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            if target_device is not None:
                inputs, labels = inputs.to(target_device), labels.to(target_device)

            # Forward pass
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Accuracy is undefined without samples; fail with a clear message
        # instead of a bare ZeroDivisionError.
        raise ValueError("test_loader yielded no samples; accuracy is undefined")

    accuracy = 100 * correct / total
    print(f'Accuracy of the model on the test set: {accuracy:.2f}%')

# Report accuracy on the held-out test batches
evaluate_model(model=model, test_loader=test_loader)


In [None]:
# Enhanced training loop with early stopping functionality

class EarlyStopping:
    """Signal when a validation loss stops improving and checkpoint the best model.

    Call the instance once per epoch with that epoch's validation loss. A loss
    of at most ``best_loss - delta`` counts as an improvement: the model's
    ``state_dict`` is saved to ``path`` and the patience counter is reset. Any
    other value (including NaN) increments the counter, and ``early_stop``
    becomes True once the counter reaches ``patience``.

    Args:
        patience: Consecutive non-improving calls tolerated before
            ``early_stop`` is set.
        delta: Minimum decrease relative to the best loss that counts as an
            improvement.
        path: File the best ``state_dict`` is written to.
    """

    def __init__(self, patience=5, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.delta = delta
        self.path = path
        self.counter = 0
        self.best_score = None          # negated best loss; None until a finite loss is seen
        self.early_stop = False
        self.best_loss = float('inf')

    def __call__(self, val_loss, model):
        """Record ``val_loss`` for this epoch, checkpointing ``model`` on improvement."""
        import math

        score = -val_loss
        # NaN compares False against everything; without this explicit check a
        # diverged epoch would be taken for an improvement and overwrite the
        # best checkpoint.
        improved = not math.isnan(score) and (
            self.best_score is None or score >= self.best_score + self.delta
        )
        if improved:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decreases.'''
        torch.save(model.state_dict(), self.path)
        self.best_loss = val_loss

# Function to train the model with early stopping
def train_model_with_early_stopping(model, train_loader, test_loader, criterion, optimizer, num_epochs, patience):
    """Train ``model`` and stop once the validation loss stops improving.

    After every epoch the mean batch loss over ``test_loader`` is passed to an
    ``EarlyStopping`` tracker, which checkpoints the best weights to
    ``checkpoint.pt``. When training ends, whether through early stopping or
    by running out of epochs, the best checkpoint is loaded back into ``model``.

    NOTE(review): the loader used for validation here is the same one the
    notebook reports final accuracy on, so that accuracy is optimistically
    biased; a separate validation split would avoid this.

    Args:
        model: ``torch.nn.Module`` to optimise in place.
        train_loader: Sized iterable yielding ``(inputs, labels)`` training batches.
        test_loader: Sized iterable yielding ``(inputs, labels)`` validation batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
    """
    # Take the target device from the model itself rather than from a
    # notebook-level global, so batches always land where the weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    early_stopping = EarlyStopping(patience=patience)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            if target_device is not None:
                inputs, labels = inputs.to(target_device), labels.to(target_device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in test_loader:
                if target_device is not None:
                    inputs, labels = inputs.to(target_device), labels.to(target_device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(test_loader)

        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

        early_stopping(avg_val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping triggered. Stopping training.")
            break

    # Restore the best weights on every exit path, including the one where all
    # epochs run without early stopping firing. Skipped when nothing was
    # checkpointed (e.g. num_epochs == 0).
    if early_stopping.best_score is not None:
        model.load_state_dict(torch.load('checkpoint.pt', map_location=target_device))

# Continue training, this time halting once validation loss stalls for 5 epochs
train_model_with_early_stopping(
    model,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    num_epochs=50,
    patience=5,
)


In [None]:
# Read the checkpointed weights from disk and load them into the model
best_state_dict = torch.load('checkpoint.pt')
model.load_state_dict(best_state_dict)

# Score the restored model on the test batches
evaluate_model(model, test_loader)


In [None]:
import matplotlib.pyplot as plt

# Function to plot the training and validation loss
def plot_loss(train_losses, val_losses):
    """Draw the training and validation loss curves on one figure.

    Each argument is a sequence of per-epoch loss values; both are plotted
    against their position in the sequence.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    curves = (
        (train_losses, 'Training Loss'),
        (val_losses, 'Validation Loss'),
    )
    for series, legend_label in curves:
        ax.plot(series, label=legend_label)
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.set_title('Training and Validation Loss over Epochs')
    plt.show()

# Function to print model insights and performance details
def print_model_insights(model, train_loader, test_loader, criterion):
    """Print the model's final training loss, validation loss and test accuracy.

    Each loss is the mean of the per-batch ``criterion`` values over its
    loader. Accuracy thresholds the outputs at 0.5 and is computed over
    ``test_loader`` in the same pass as the validation loss.

    Args:
        model: ``torch.nn.Module`` to assess; it is switched to eval mode.
        train_loader: Sized iterable yielding ``(inputs, labels)`` training batches.
        test_loader: Sized iterable yielding ``(inputs, labels)`` held-out batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
    """
    # Take the target device from the model itself rather than from a
    # notebook-level global, so batches always land where the weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.eval()
    with torch.no_grad():
        # Calculate final training loss
        train_loss = 0.0
        for inputs, labels in train_loader:
            if target_device is not None:
                inputs, labels = inputs.to(target_device), labels.to(target_device)
            outputs = model(inputs)
            train_loss += criterion(outputs, labels).item()
        avg_train_loss = train_loss / len(train_loader)

        # Calculate final validation loss and accuracy in one pass over the
        # test loader; the forward pass is shared by both metrics.
        val_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in test_loader:
            if target_device is not None:
                inputs, labels = inputs.to(target_device), labels.to(target_device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        avg_val_loss = val_loss / len(test_loader)
        accuracy = 100 * correct / total

    print(f"Final Training Loss: {avg_train_loss:.4f}")
    print(f"Final Validation Loss: {avg_val_loss:.4f}")
    print(f"Accuracy on Test Set: {accuracy:.2f}%")

# Per-epoch loss histories; the tracking training function appends to these
train_losses, val_losses = [], []

# Modified training function to track losses
def train_model_with_early_stopping_and_tracking(model, train_loader, test_loader, criterion, optimizer, num_epochs, patience):
    """Train with early stopping while recording per-epoch losses.

    Each epoch's mean training loss and mean validation loss are appended to
    the notebook-level lists ``train_losses`` and ``val_losses``. The
    validation loss also drives an ``EarlyStopping`` tracker that checkpoints
    the best weights to ``checkpoint.pt``. When training ends, whether through
    early stopping or by running out of epochs, the best checkpoint is loaded
    back into ``model``.

    NOTE(review): the loader used for validation here is the same one the
    notebook reports final accuracy on, so that accuracy is optimistically
    biased; a separate validation split would avoid this.

    Args:
        model: ``torch.nn.Module`` to optimise in place.
        train_loader: Sized iterable yielding ``(inputs, labels)`` training batches.
        test_loader: Sized iterable yielding ``(inputs, labels)`` validation batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
    """
    # Take the target device from the model itself rather than from a
    # notebook-level global, so batches always land where the weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    early_stopping = EarlyStopping(patience=patience)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            if target_device is not None:
                inputs, labels = inputs.to(target_device), labels.to(target_device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in test_loader:
                if target_device is not None:
                    inputs, labels = inputs.to(target_device), labels.to(target_device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(test_loader)
        val_losses.append(avg_val_loss)

        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

        early_stopping(avg_val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping triggered. Stopping training.")
            break

    # Restore the best weights on every exit path, including the one where all
    # epochs run without early stopping firing. Skipped when nothing was
    # checkpointed (e.g. num_epochs == 0).
    if early_stopping.best_score is not None:
        model.load_state_dict(torch.load('checkpoint.pt', map_location=target_device))

# Train once more, recording the loss of every epoch along the way
train_model_with_early_stopping_and_tracking(
    model,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    num_epochs=50,
    patience=5,
)

# Visualise the recorded loss curves
plot_loss(train_losses=train_losses, val_losses=val_losses)

# Summarise final losses and test accuracy
print_model_insights(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
)


In [None]:
import numpy as np

# Function to calculate and plot feature importance
def plot_feature_importance(model, feature_names):
    """Bar-plot a weight-based importance score for every input feature.

    A feature's score is the sum of the absolute first-layer (``fc1``) weights
    attached to it, divided by the total over all features so the scores add
    up to 1. Bars are drawn in descending order of score.
    """
    # First-layer weight matrix as a NumPy array (one column per input feature)
    first_layer = model.fc1.weight.detach().cpu().numpy()

    # Column-wise magnitude, rescaled so the scores sum to 1
    magnitude = np.abs(first_layer).sum(axis=0)
    share = magnitude / magnitude.sum()

    # Tabulate and rank from most to least important
    ranking = pd.DataFrame({'Feature': feature_names, 'Importance': share})
    ranking = ranking.sort_values(by='Importance', ascending=False)

    # Draw the ranked bars
    plt.figure(figsize=(12, 6))
    plt.bar(ranking['Feature'], ranking['Importance'])
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title('Feature Importance')
    plt.xticks(rotation=90)
    plt.show()

# Label the bars with the feature dataframe's column names
feature_names = X.columns
plot_feature_importance(model=model, feature_names=feature_names)
