In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
# Read the colorectal cancer records from the CSV file into a DataFrame
dataset_path = 'colorectal_cancer_dataset.csv'
Data = pd.read_csv(dataset_path)

In [None]:

# Count the missing entries in every column of the dataset
null_values = Data.isnull().sum()

# Report only the columns that contain at least one missing entry
has_missing = null_values > 0
print("Columns with NULL values:")
print(null_values[has_missing])

In [None]:
# Replace NULL values in numeric columns with the mean of the respective column.
# numeric_only=True restricts the mean to numeric columns: without it, pandas >= 2.0
# raises a TypeError when the frame also holds string columns (the later
# pd.get_dummies call suggests this dataset does).
Data = Data.fillna(Data.mean(numeric_only=True))

# List the columns that still contain NULL values. Non-numeric columns are not
# touched by the mean imputation above, so they can still be reported here.
null_values_after = Data.isnull().sum()
print("Columns with NULL values after replacement:")
print(null_values_after[null_values_after > 0])

In [None]:
# Data preprocessing: drop the identifier column, standardise the float64/int64
# columns, then one-hot encode what is left (first level of each category dropped)
Data = Data.drop(columns='Patient_ID')
numeric_columns = Data.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
scaled_values = scaler.fit_transform(Data[numeric_columns])
Data[numeric_columns] = scaled_values
Data = pd.get_dummies(Data, drop_first=True)

# Coerce every column to a numeric dtype; values that cannot be parsed become NaN
# and are then replaced by 0
Data = Data.apply(pd.to_numeric, errors='coerce').fillna(0)

# Cast boolean columns to integers
bool_columns = Data.select_dtypes(include=['bool']).columns
Data[bool_columns] = Data[bool_columns].astype(int)

# The label column; every other column is treated as a feature
target = 'Survival_Prediction_Yes'
features = Data.drop(columns=[target]).columns

# Store the label as float (this is a dtype cast only, the values are not rescaled)
label_as_float = Data[target].astype(float)
Data[target] = label_as_float

In [None]:
# Pairwise correlations between all columns of the preprocessed frame
correlation_matrix = Data.corr()

# Correlations weaker than this, in absolute value, are masked out
threshold = 0.5

# Keep only the strong correlations; every other cell becomes NaN
is_strong_positive = correlation_matrix >= threshold
is_strong_negative = correlation_matrix <= -threshold
filtered_corr_matrix = correlation_matrix[is_strong_positive | is_strong_negative]

# Draw the heatmap of the surviving correlations
heatmap_style = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, linecolor='black')
plt.figure(figsize=(20, 16))
sns.heatmap(filtered_corr_matrix, **heatmap_style)
plt.title('Filtered Correlation Matrix')
plt.show()

In [None]:
# Identify highly correlated column pairs.
# Only the upper triangle of the matrix is scanned, so each unordered pair is listed
# once. Listing both (a, b) and (b, a) would make the removal step below mark BOTH
# members of every pair instead of just one of them.
corr_columns = list(filtered_corr_matrix.columns)
high_corr_pairs = [
    (col1, col2)
    for position, col1 in enumerate(corr_columns)
    for col2 in corr_columns[position + 1:]
    # Cells below the threshold are NaN in the filtered matrix; NaN > threshold is False
    if abs(filtered_corr_matrix.loc[col1, col2]) > threshold
]

# Print highly correlated pairs together with their correlation
print("Highly correlated pairs:")
for col1, col2 in high_corr_pairs:
    print((col1, col2), filtered_corr_matrix.loc[col1, col2])

# Remove one feature from each highly correlated pair
features_to_remove = set()
for col1, col2 in high_corr_pairs:
    # The label column must survive, so pairs that involve it are left alone
    if target in (col1, col2):
        continue
    # If one member is already scheduled for removal, this pair is already broken up
    if col1 in features_to_remove or col2 in features_to_remove:
        continue
    features_to_remove.add(col2)

# Drop the selected features from the dataset (sorted only to make the order deterministic)
Data_reduced = Data.drop(columns=sorted(features_to_remove))

In [None]:
# Feature matrix and label for the importance ranking.
# NOTE(review): this reads from Data, so the Data_reduced frame built in the
# correlation step is not used here — confirm that is intended.
X = Data[features]
y = Data[target]

# Fit a random forest on all rows
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Permutation importance: 10 shuffles per feature.
# NOTE(review): the importance is measured on the same rows the forest was fitted on;
# scoring on a held-out split would better reflect generalisation — consider changing
# it, and re-tune the importance cut-off used in the next cell if you do.
perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42)

# One row per feature, most important first
perm_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': perm_importance.importances_mean
}).sort_values(by='Importance', ascending=False)

# The frame is already sorted, so it is printed as-is instead of being sorted again
print(perm_importance_df)

In [None]:
# Drop columns whose permutation importance is below the cut-off
importance_cutoff = 0.02
is_low_importance = perm_importance_df['Importance'] < importance_cutoff
low_importance_features = perm_importance_df[is_low_importance]['Feature']

# Skip names that are no longer in the frame so this cell can be re-run without a KeyError
columns_to_drop = [name for name in low_importance_features if name in Data.columns]
Data = Data.drop(columns=columns_to_drop)

# Update features after dropping low-importance columns
features = Data.drop(columns=[target]).columns
# Fail loudly here rather than training a network on zero input columns later
assert len(features) > 0, (
    f"No feature has permutation importance >= {importance_cutoff}; lower the cut-off"
)

# Show the remaining columns (this listing includes the target column)
print("Updated list of features:")
print(Data.columns)

In [None]:
# Split the frame into the feature matrix and the label column
X = Data[features]
y = Data[target]

# Build float32 tensors from the underlying arrays; the labels are reshaped into a
# single-column tensor of shape (n_samples, 1)
feature_array = X.values
label_array = y.values
X_tensor = torch.tensor(feature_array, dtype=torch.float32)
y_tensor = torch.tensor(label_array, dtype=torch.float32).view(-1, 1)

# Define the neural network model
class SimpleNNClassifier(nn.Module):
    """Feed-forward binary classifier.

    Three hidden linear layers (128, 64 and 32 units), each followed by ReLU and
    dropout with p=0.5, then a single-unit linear layer passed through a sigmoid.
    """

    def __init__(self, input_size):
        """Create the layers for inputs that have ``input_size`` features."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(64, 32)
        self.dropout3 = nn.Dropout(p=0.5)
        self.fc4 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of the network for the batch ``x``."""
        hidden_stages = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
            (self.fc3, self.dropout3),
        )
        # Each hidden stage is linear -> ReLU -> dropout
        for linear, dropout in hidden_stages:
            x = dropout(torch.relu(linear(x)))
        return self.sigmoid(self.fc4(x))

# Initialize K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Seed PyTorch so weight initialisation and dropout masks repeat from run to run
torch.manual_seed(42)

# Lists to store evaluation metrics (one entry per fold)
accuracy_scores = []
f1_scores = []

# Early stopping: give up after this many consecutive epochs without improvement
patience = 20

# K-Fold Cross Validation
for train_index, test_index in kf.split(X_tensor):
    X_train, X_test = X_tensor[train_index], X_tensor[test_index]
    y_train, y_test = y_tensor[train_index], y_tensor[test_index]

    # Initialize the model, loss function, and optimizer
    input_size = X_train.shape[1]
    model = SimpleNNClassifier(input_size)
    criterion = nn.BCELoss()
    # NOTE(review): lr=0.1 is 100x Adam's default of 1e-3 — confirm this is intended
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Early-stopping state belongs to the fold. If it is kept across folds, a later
    # fold is compared against an earlier fold's best loss and can stop after
    # `patience` epochs without ever having improved on its own.
    best_loss = float('inf')
    patience_counter = 0

    # Train the model (full-batch gradient steps)
    for epoch in range(200):  # Number of epochs
        # Switch back to training mode every epoch: the validation pass below puts
        # the model in eval mode, which would otherwise leave dropout disabled
        # for all remaining epochs.
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        # Early stopping check on the held-out fold.
        # NOTE(review): the same fold decides when to stop AND produces the reported
        # metrics, so the metrics are optimistic; a validation split carved out of
        # the training fold would avoid this.
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_test)
            val_loss = criterion(val_outputs, y_test)

        if val_loss.item() < best_loss:
            best_loss = val_loss.item()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

    # Evaluate the model on the held-out fold, thresholding the sigmoid output at 0.5
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test)
        y_pred_class = (y_pred > 0.5).float()
        accuracy = accuracy_score(y_test, y_pred_class)
        f1 = f1_score(y_test, y_pred_class)

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

# Print evaluation metrics averaged over the folds
print(f'Mean Accuracy: {np.mean(accuracy_scores):.4f}')
print(f'Mean F1 Score: {np.mean(f1_scores):.4f}')

In [None]:
# Write the weights of the current `model` to disk. At this point `model` is the
# network from the last cross-validation fold.
trained_weights = model.state_dict()
torch.save(trained_weights, 'ColorectalCancerPreditionModel.pth')

In [None]:
# Rebuild the network with the same input width as the training tensors, restore the
# saved weights, and switch to evaluation mode so dropout is inactive at inference
input_size = X_tensor.shape[1]
loaded_model = SimpleNNClassifier(input_size)
saved_weights = torch.load('ColorectalCancerPreditionModel.pth')
loaded_model.load_state_dict(saved_weights)
loaded_model.eval()

SimpleNNClassifier(
  (fc1): Linear(in_features=5, out_features=128, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (dropout3): Dropout(p=0.5, inplace=False)
  (fc4): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [22]:
# Placeholder feature values for a single case — substitute real values.
# NOTE(review): presumably these must be on the same scale as the training features
# (numeric columns were standardised during preprocessing) — confirm before use.
example_values = [[0.5, -1.2, 0.3, 1.5, -0.7]]
new_case = np.array(example_values)

# The network takes a float32 tensor as input
new_case_tensor = torch.tensor(new_case, dtype=torch.float32)

# Forward pass without gradient tracking; outputs above 0.5 are labelled class 1
with torch.no_grad():
    prediction = loaded_model(new_case_tensor)
    is_positive = prediction > 0.5
    prediction_class = is_positive.float()

print(f'Prediction: {prediction.item()}')
print(f'Predicted Class: {prediction_class.item()}')

Prediction: 1.0
Predicted Class: 1.0
