<a href="https://colab.research.google.com/github/PPancham/PhD/blob/main/Random_state_Iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib
import datetime
import google.colab.files as files

# Empty results table: one row per evaluated (split, forest seed) pair and
# data partition, flagged by Is_Train / Is_Test. This name is assigned again
# further down in the script before any results are read from it.
results_df = pd.DataFrame(
    columns=['Iteration', 'Random_State', 'Accuracy', 'F1_Score', 'Is_Train', 'Is_Test']
)

# The functions below are called with variables that are defined further down
# in this cell, after the spreadsheet upload: data, columns_to_process, Group

def OutlierRemoval(data, columns, filter_data=True):
    """Drop rows whose values fall outside the 1.5 * IQR fences of `columns`.

    Columns are processed in the order given, and each column's quartiles are
    computed on the rows that survived the previous columns, so the result can
    depend on column order. Rows holding NaN in a processed column are dropped
    as well, because NaN never satisfies the fence comparison.

    Args:
        data: pandas DataFrame to filter. It is never modified in place.
        columns: iterable of column labels to apply the IQR rule to.
        filter_data: when falsy, a notice is printed and `data` itself is
            returned untouched (same object, not a copy).

    Returns:
        A filtered copy of `data` (original index labels preserved), or
        `data` itself when `filter_data` is falsy.

    Raises:
        KeyError: if a label in `columns` is not a column of `data`.
    """
    if not filter_data:
        print("\n --- Data has outliers and was not filtered ---")
        return data

    data_filtered = data.copy()
    for column in columns:
        q1 = data_filtered[column].quantile(0.25)
        q3 = data_filtered[column].quantile(0.75)
        iqr = q3 - q1
        # Inclusive on both ends, matching (value >= lower) & (value <= upper).
        within_fences = data_filtered[column].between(q1 - 1.5 * iqr,
                                                      q3 + 1.5 * iqr)
        data_filtered = data_filtered[within_fences]
    return data_filtered

def SaveData(Pumpkin, prefix=""):
    """Serialize `Pumpkin` with joblib into a date-stamped .pkl file.

    The file name is `<prefix>Trained_Model_Pumpkin<today>.pkl`, where
    `<today>` is today's date as rendered by `datetime.date`; the path is
    relative, so the file lands in the current working directory.

    Args:
        Pumpkin: the object to dump (the script passes a fitted model).
        prefix: text placed at the very start of the file name.

    Returns:
        A human-readable confirmation string containing the file name.
    """
    today = datetime.date.today()
    target = f"{prefix}Trained_Model_Pumpkin{today}.pkl"
    joblib.dump(Pumpkin, target)
    message = f"Data saved to '{target}'"
    return message

def _mean_scores_by(test_data, key):
    """Return the mean Accuracy and F1_Score of `test_data` per value of `key`."""
    return test_data.groupby(key).agg({
        'Accuracy': 'mean',
        'F1_Score': 'mean'
    }).reset_index()


def _draw_mean_score_lines(ax, scores, key, title, xlabel):
    """Plot mean accuracy and F1 against `key` as two marked lines on `ax`."""
    ax.plot(scores[key], scores['Accuracy'], 'o-', color='blue', label='Accuracy')
    ax.plot(scores[key], scores['F1_Score'], 'o-', color='green', label='F1-Score')
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Score', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.legend()


def _draw_accuracy_heatmap(ax, test_data):
    """Draw the Iteration x Random_State heatmap of test accuracy on `ax`."""
    # Only random states up to 20 are shown so the cells stay readable when
    # many states were evaluated.
    heatmap_data = test_data[test_data['Random_State'] <= 20].pivot_table(
        index='Iteration',
        columns='Random_State',
        values='Accuracy'
    )
    sns.heatmap(heatmap_data, cmap='viridis', annot=False, fmt='.3f',
                cbar_kws={'label': 'Accuracy'}, ax=ax)
    ax.set_title('Test Accuracy Heatmap (Iteration vs Random State)', fontsize=14)
    ax.set_xlabel('Random State', fontsize=12)
    ax.set_ylabel('Iteration', fontsize=12)


def _draw_train_vs_test(ax, train_data, test_data):
    """Plot mean train and test accuracy per iteration and shade the gap."""
    train_mean = train_data.groupby('Iteration')['Accuracy'].mean().reset_index()
    test_mean = test_data.groupby('Iteration')['Accuracy'].mean().reset_index()

    # Inner merge: only iterations present in both partitions are drawn.
    merged = pd.merge(train_mean, test_mean, on='Iteration', suffixes=('_train', '_test'))
    merged['gap'] = merged['Accuracy_train'] - merged['Accuracy_test']

    ax.plot(merged['Iteration'], merged['Accuracy_train'], 'o-', color='blue', label='Train Accuracy')
    ax.plot(merged['Iteration'], merged['Accuracy_test'], 'o-', color='red', label='Test Accuracy')
    ax.fill_between(merged['Iteration'],
                    merged['Accuracy_test'],
                    merged['Accuracy_train'],
                    alpha=0.3, color='gray', label='Train-Test Gap')

    ax.set_title('Train vs Test Accuracy by Iteration', fontsize=14)
    ax.set_xlabel('Iteration', fontsize=12)
    ax.set_ylabel('Accuracy', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.legend()


def _draw_top_models(fig, ax, test_data):
    """Scatter the 10 most accurate test rows and annotate the top 3."""
    top_models = test_data.sort_values('Accuracy', ascending=False).head(10)

    scatter = ax.scatter(
        top_models['Iteration'],
        top_models['Random_State'],
        c=top_models['Accuracy'],
        s=top_models['F1_Score'] * 100,  # marker area scales with F1 score
        cmap='viridis',
        alpha=0.7
    )
    cbar = fig.colorbar(scatter, ax=ax)
    cbar.set_label('Accuracy')

    for _, row in top_models.head(3).iterrows():
        ax.annotate(
            f"Acc: {row['Accuracy']:.4f}\nF1: {row['F1_Score']:.4f}",
            (row['Iteration'], row['Random_State']),
            xytext=(10, 10),
            textcoords='offset points',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5)
        )

    ax.set_title('Top 10 Models (Bubble Size = F1 Score)', fontsize=14)
    ax.set_xlabel('Iteration', fontsize=12)
    ax.set_ylabel('Random State', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.7)


def _summary_text(test_data):
    """Build the mean/std/min/max caption for test Accuracy and F1_Score."""
    stats = test_data.agg({
        'Accuracy': ['mean', 'std', 'min', 'max'],
        'F1_Score': ['mean', 'std', 'min', 'max']
    }).round(4)
    acc = stats['Accuracy']
    f1 = stats['F1_Score']
    return (
        f"Test Accuracy: mean={acc['mean']}, std={acc['std']}\n"
        f"              min={acc['min']}, max={acc['max']}\n\n"
        f"Test F1-Score: mean={f1['mean']}, std={f1['std']}\n"
        f"              min={f1['min']}, max={f1['max']}"
    )


def plot_rf_performance(results_df, save_path='rf_performance_analysis.png'):
    """
    Create visualizations for Random Forest performance across iterations and random states.

    The figure has five panels: mean test scores per iteration, mean test
    scores per random state, a heatmap of test accuracy (random states <= 20),
    mean train vs test accuracy per iteration, and the ten most accurate test
    rows. A caption with overall test statistics is placed at the bottom.

    Args:
        results_df: DataFrame with the columns 'Iteration', 'Random_State',
            'Accuracy', 'F1_Score', 'Is_Train' and 'Is_Test'. It is not
            modified; a numeric-coerced copy is used internally.
        save_path: where the figure is written at 300 dpi. Pass None to skip
            saving. Defaults to the file name the script has always used.

    Returns:
        The matplotlib Figure that was drawn (it is also shown via plt.show()).

    Raises:
        ValueError: if `results_df` has no rows with Is_Test == True, or a
            metric column holds values that cannot be converted to numbers.
    """
    # Work on a copy and force numeric dtypes: a table grown row by row from
    # an empty frame can carry object-dtype columns, which pivot_table and the
    # plotting calls cannot aggregate or draw.
    results_df = results_df.copy()
    for column in ('Iteration', 'Random_State', 'Accuracy', 'F1_Score'):
        results_df[column] = pd.to_numeric(results_df[column])

    # .eq(True) keeps the original `== True` semantics, including for
    # object-dtype flag columns.
    test_data = results_df[results_df['Is_Test'].eq(True)]
    train_data = results_df[results_df['Is_Train'].eq(True)]
    if test_data.empty:
        raise ValueError(
            "results_df contains no rows with Is_Test == True; nothing to plot"
        )

    fig = plt.figure(figsize=(20, 15))
    grid = plt.GridSpec(3, 2, hspace=0.35, wspace=0.3)

    # 1. Performance by Iteration (top left)
    _draw_mean_score_lines(
        fig.add_subplot(grid[0, 0]),
        _mean_scores_by(test_data, 'Iteration'),
        'Iteration',
        'Average Test Performance by Iteration',
        'Iteration (Train/Test Split)',
    )

    # 2. Performance by Random State (top right)
    _draw_mean_score_lines(
        fig.add_subplot(grid[0, 1]),
        _mean_scores_by(test_data, 'Random_State'),
        'Random_State',
        'Average Test Performance by Random State',
        'Random State',
    )

    # 3. Heatmap of Accuracy by Iteration and Random State (middle row)
    _draw_accuracy_heatmap(fig.add_subplot(grid[1, :]), test_data)

    # 4. Train vs Test Performance (bottom left)
    _draw_train_vs_test(fig.add_subplot(grid[2, 0]), train_data, test_data)

    # 5. Best Models (bottom right)
    _draw_top_models(fig, fig.add_subplot(grid[2, 1]), test_data)

    # Overall summary statistics as a caption under the panels
    fig.text(0.5, 0.01, _summary_text(test_data), ha='center', fontsize=12,
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # rect leaves room for the caption (bottom) and the main title (top)
    fig.tight_layout(rect=[0, 0.05, 1, 0.95])
    fig.suptitle('Random Forest Performance Analysis by Iteration and Random State', fontsize=16)

    if save_path is not None:
        fig.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

    return fig


# --- Load the spreadsheet chosen through the Colab upload widget ---
uploadedSpreadsheet = files.upload()
if not uploadedSpreadsheet:
    raise ValueError("No file was uploaded; cannot continue without input data.")
fileName = next(iter(uploadedSpreadsheet))  # first uploaded file
data = pd.read_excel(fileName)
columns_to_process = ['IgG1 Average', 'IgG2 Average', 'IgG3 Average', 'IgG4 Average', 'IgA Average','IgE Average','IgM Average']

# Fail early with a readable message if the sheet lacks an expected column
missing_columns = [name for name in columns_to_process + ['Group']
                   if name not in data.columns]
if missing_columns:
    raise KeyError(f"Missing expected column(s) in '{fileName}': {missing_columns}")

# Remove outliers
filter_data = False  # Set to True if you want to remove outliers
data = OutlierRemoval(data, columns_to_process, filter_data)
print("Data shape after outlier removal:", data.shape)

# Save to Excel file
output_file = "after_outlier.xlsx"
data.to_excel(output_file, index=False)
print(f"Data saved to '{output_file}'")

# Data Input for Model
feature_names = data[columns_to_process]  # feature matrix (not just the names)
Group = data['Group']                     # class labels

# Perform multiple train/test splits and evaluate model on each
train_indexes = []
test_indexes = []
accuracies = []
f1_scores = []
# NOTE(review): every fitted forest is kept here only so that later cells can
# still index into `models`; with num_iterations * num_random_states models
# this may use a lot of memory. The best-model selection below does not need
# it, so the append can be dropped if nothing else reads `models`.
models = []
best_models_per_iteration = []

print("\n--- Multiple Train-Test Split Evaluation ---")
num_iterations = 100     # number of different train/test splits (split seeds 1..N)
num_random_states = 100  # number of forest seeds tried per split (seeds 1..N)

# Result rows are collected in a plain list and turned into a DataFrame once
# after the loop. Concatenating onto a growing DataFrame inside the loop copies
# the whole table every time and triggers pandas' FutureWarning about
# concatenating empty entries.
results_columns = [
    'Iteration', 'Random_State', 'Accuracy', 'F1_Score',
    'Is_Train', 'Is_Test'
]
result_rows = []

for i in range(1, num_iterations + 1):
    print(f"\nIteration {i}/{num_iterations}")

    # Split the data into training and testing sets (seeded by the iteration)
    X_train, X_test, y_train, y_test = train_test_split(
        feature_names, Group, test_size=0.3, shuffle=True, random_state=i, stratify=Group
    )

    # Store indexes for potential later use
    train_indexes.append(X_train.index)
    test_indexes.append(X_test.index)

    # Track best model for this iteration. Starting below any possible
    # accuracy guarantees a model is always recorded, even if every forest
    # scores 0.
    best_acc_for_iteration = float('-inf')
    best_model_for_iteration = None
    best_random_state_for_iteration = None

    for p in range(1, num_random_states + 1):
        # Train Random Forest model
        Pumpkin = RandomForestClassifier(n_estimators=100, random_state=p, bootstrap=True)
        Pumpkin.fit(X_train, y_train)
        models.append(Pumpkin)

        # Test set predictions and metrics
        y_test_pred = Pumpkin.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_report = classification_report(y_test, y_test_pred, output_dict=True)
        test_f1 = test_report['weighted avg']['f1-score']

        # Training set predictions and metrics
        y_train_pred = Pumpkin.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_report = classification_report(y_train, y_train_pred, output_dict=True)
        train_f1 = train_report['weighted avg']['f1-score']

        # Strict '>' keeps the lowest random_state among equally good models
        if test_accuracy > best_acc_for_iteration:
            best_acc_for_iteration = test_accuracy
            best_model_for_iteration = Pumpkin
            best_random_state_for_iteration = p

        # Test row first, then train row, for this (split, seed) pair
        result_rows.append({
            'Iteration': i,
            'Random_State': p,
            'Accuracy': test_accuracy,
            'F1_Score': test_f1,
            'Is_Train': False,
            'Is_Test': True
        })
        result_rows.append({
            'Iteration': i,
            'Random_State': p,
            'Accuracy': train_accuracy,
            'F1_Score': train_f1,
            'Is_Train': True,
            'Is_Test': False
        })

        # Print every 5th model to reduce output
        if p % 5 == 0 or p == 1:
            print(f'Iteration {i} random_state: {p} Test Accuracy: {test_accuracy:.4f}')
            print(f"Test Weighted F1-score: {test_f1:.4f}")

    # Save the best model for this iteration
    best_models_per_iteration.append((i, best_random_state_for_iteration, best_acc_for_iteration, best_model_for_iteration))
    print(f"Best model for iteration {i}: random_state={best_random_state_for_iteration}, accuracy={best_acc_for_iteration:.4f}")

# Global results table, built in one go
results_df = pd.DataFrame(result_rows, columns=results_columns)

# Print overall performance statistics
print("\n--- Overall Performance Statistics ---")
test_only = results_df[results_df['Is_Test'] == True]
print(f"Mean Test Accuracy: {test_only['Accuracy'].mean():.4f}")
print(f"Standard Deviation: {test_only['Accuracy'].std():.4f}")
print(f"Min Accuracy: {test_only['Accuracy'].min():.4f}")
print(f"Max Accuracy: {test_only['Accuracy'].max():.4f}")

# Find overall best model among the per-iteration winners. max() returns the
# first maximum, so ties resolve deterministically to the earliest iteration
# (and, within it, the lowest random_state).
# NOTE(review): the model is selected by its test-set accuracy, so the scores
# reported for it below are optimistic estimates.
best_iteration, best_random_state, _, best_model = max(
    best_models_per_iteration, key=lambda entry: entry[2]
)
best_result = test_only[
    (test_only['Iteration'] == best_iteration)
    & (test_only['Random_State'] == best_random_state)
].iloc[0]
print(f"\nBest model overall: Iteration {best_result['Iteration']}, Random State {best_result['Random_State']}")
print(f"Accuracy: {best_result['Accuracy']:.4f}, F1-Score: {best_result['F1_Score']:.4f}")

# Plot the results
print("\nCreating performance visualization...")
plot_rf_performance(results_df)

# Select best model details
best_iteration = int(best_iteration)
best_random_state = int(best_random_state)

# Positions of the best model in the per-iteration lists and in `models`
best_iter_idx = best_iteration - 1  # Convert to 0-based index
best_model_idx = (best_iter_idx * num_random_states) + (best_random_state - 1)

# Evaluate the best model more thoroughly on the split it was tested on
best_X_train_idx = train_indexes[best_iter_idx]
best_X_test_idx = test_indexes[best_iter_idx]
X_test_best = feature_names.loc[best_X_test_idx]
y_test_best = Group.loc[best_X_test_idx]
y_pred_best = best_model.predict(X_test_best)

# Print detailed classification report for the best model
print("\nClassification Report (Best Model):")
print(classification_report(y_test_best, y_pred_best))

# Print confusion matrix for the best model
print("\nConfusion Matrix (Best Model):")
print(confusion_matrix(y_test_best, y_pred_best))

# Feature importance for the best model
feature_importance = pd.DataFrame({
    'Feature': columns_to_process,
    'Importance': best_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nFeature Importance (Best Model):")
print(feature_importance)

# Feature importance plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (Best Model)')
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300)
plt.show()

# k-fold cross-validation using the best model's configuration on all data
cv_scores = cross_val_score(best_model, feature_names, Group, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.4f}")

# Save the best model
save_result = SaveData(best_model, prefix="Best_")
print(save_result)

print("\nAnalysis complete!")

Saving Biomarker_16032025_without_SWD_with_GNV.xlsx to Biomarker_16032025_without_SWD_with_GNV (1).xlsx

 --- Data has outliers and was not filtered ---
Data shape after outlier removal: (389, 10)
Data saved to 'after_outlier.xlsx'

--- Multiple Train-Test Split Evaluation ---

Iteration 1/100


  results_df = pd.concat([results_df, pd.DataFrame([{


Iteration 1 random_state: 1 Test Accuracy: 0.7607
Test Weighted F1-score: 0.7468
Iteration 1 random_state: 5 Test Accuracy: 0.7778
Test Weighted F1-score: 0.7704
Iteration 1 random_state: 10 Test Accuracy: 0.7607
Test Weighted F1-score: 0.7478
Iteration 1 random_state: 15 Test Accuracy: 0.7863
Test Weighted F1-score: 0.7789
Iteration 1 random_state: 20 Test Accuracy: 0.7692
Test Weighted F1-score: 0.7554
Iteration 1 random_state: 25 Test Accuracy: 0.7778
Test Weighted F1-score: 0.7689
Iteration 1 random_state: 30 Test Accuracy: 0.7607
Test Weighted F1-score: 0.7507
Iteration 1 random_state: 35 Test Accuracy: 0.7692
Test Weighted F1-score: 0.7596
Iteration 1 random_state: 40 Test Accuracy: 0.8034
Test Weighted F1-score: 0.7921
Iteration 1 random_state: 45 Test Accuracy: 0.7863
Test Weighted F1-score: 0.7791
Iteration 1 random_state: 50 Test Accuracy: 0.7607
Test Weighted F1-score: 0.7562
Iteration 1 random_state: 55 Test Accuracy: 0.7692
Test Weighted F1-score: 0.7581
Iteration 1 random