In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import io
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.preprocessing import LabelEncoder
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (e.g. convergence
# or deprecation notices) — consider narrowing it.
warnings.filterwarnings('ignore')

# Global plotting defaults used by every figure below.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Read the raw attribute matrix into a list of lines (line endings are kept).
# NOTE(review): hard-coded absolute Drive path — presumably requires Google Drive
# to be mounted first; confirm and consider a configurable data directory.
with open('/content/drive/MyDrive/gene_attribute_matrix.txt', 'r', encoding='latin-1') as f:
  data = f.readlines()

In [4]:
# Find the first line that opens the matrix table; the data begins on the line after it.
# If no line carries the marker, data_start_idx stays None and the slice below
# starts at the first line of the file.
data_start_idx = None
for i, line in enumerate(data):
    if line.startswith('!series_matrix_table_begin'):
        data_start_idx = i + 1
        break

# Find the first line that closes the matrix table (exclusive end of the slice).
# If no line carries the marker, data_end_idx stays None and the slice below
# runs to the last line of the file.
data_end_idx = None
for i, line in enumerate(data):
    if line.startswith('!series_matrix_table_end'):
        data_end_idx = i
        break

# Re-join the selected lines and parse them as a tab-separated table whose
# first column becomes the row index.
matrix_lines = data[data_start_idx:data_end_idx]
matrix_data = ''.join(matrix_lines)
df = pd.read_csv(io.StringIO(matrix_data), sep='\t', index_col=0)

In [5]:
# Substrings used to discover mastocytosis-related columns (matched case-insensitively).
mastocytosis_terms = ['mastocytosis', 'mast cell', 'systemic mastocytosis']

# For each term, list every column whose name contains it, or report that none does.
for term in mastocytosis_terms:
    matches = [col for col in df.columns if term.lower() in col.lower()]
    if matches:
        print(f"'{term}': {matches}")
    else:
        print(f"'{term}': No matches found")

'mastocytosis': ['diffuse cutaneous mastocytosis', 'aggressive systemic mastocytosis', 'mastocytosis', 'cutaneous mastocytosis', 'systemic mastocytosis', 'indolent systemic mastocytosis']
'mast cell': ['mast cell neoplasm']
'systemic mastocytosis': ['aggressive systemic mastocytosis', 'systemic mastocytosis', 'indolent systemic mastocytosis']


In [6]:
# Show how often each distinct value occurs in the 'Disease' column.
print("\nUnique values in 'Disease' column:")
print(df['Disease'].value_counts())


Unique values in 'Disease' column:
Disease
7008         1
DOID         1
GeneID/NA    1
80059        1
22847        1
            ..
9721         1
29035        1
145858       1
460          1
129684       1
Name: count, Length: 15311, dtype: int64


In [7]:
# Columns treated as the mastocytosis targets by the functions and cells below.
# These are the names reported by the column search above.
mastocytosis_columns = [
    'systemic mastocytosis',
    'indolent systemic mastocytosis',
    'aggressive systemic mastocytosis',
    'cutaneous mastocytosis',
    'diffuse cutaneous mastocytosis',
    'mastocytosis',
    'mast cell neoplasm'
]

In [8]:
# Restrict the matrix to the target columns and print its shape and summary statistics.
mastocytosis_data = df[mastocytosis_columns]
print("Mastocytosis data shape:", mastocytosis_data.shape)
print("\nMastocytosis data summary:")
print(mastocytosis_data.describe())

Mastocytosis data shape: (15311, 7)

Mastocytosis data summary:
        systemic mastocytosis  indolent systemic mastocytosis  \
count                 15311.0                         15311.0   
unique                    5.0                             5.0   
top                       0.0                             0.0   
freq                  15015.0                         15126.0   

        aggressive systemic mastocytosis  cutaneous mastocytosis  \
count                            15311.0                 15311.0   
unique                               5.0                     5.0   
top                                  0.0                     0.0   
freq                             15128.0                 15003.0   

        diffuse cutaneous mastocytosis  mastocytosis  mast cell neoplasm  
count                          15311.0       15311.0             15311.0  
unique                             5.0           5.0                 5.0  
top                                0.0      

In [9]:
# Count, per target column, the rows whose value differs from 0.
# NOTE(review): this reads the raw `df`; the describe() output above lists
# unique/top/freq, which suggests the columns are not numeric at this point — confirm
# that comparing the raw values with 0 gives the intended counts.
print("\nNon-zero values per mastocytosis type:")
for col in mastocytosis_columns:
    non_zero_count = (df[col] != 0).sum()
    print(f"{col}: {non_zero_count} non-zero values")


Non-zero values per mastocytosis type:
systemic mastocytosis: 296 non-zero values
indolent systemic mastocytosis: 185 non-zero values
aggressive systemic mastocytosis: 183 non-zero values
cutaneous mastocytosis: 308 non-zero values
diffuse cutaneous mastocytosis: 134 non-zero values
mastocytosis: 485 non-zero values
mast cell neoplasm: 298 non-zero values


In [None]:
def clean_data(df):
    """Return a copy of ``df`` whose non-metadata columns are numeric.

    Every column except '#.1' and 'Disease' is converted with
    ``pd.to_numeric``; values that cannot be parsed, and missing values,
    become 0. The input frame is not modified.

    Args:
        df: Frame to convert.

    Returns:
        The converted copy. Its shape is printed as a side effect.
    """
    skipped = ('#.1', 'Disease')
    df_clean = df.copy()

    for name in df.columns:
        if name in skipped:
            continue
        # Always convert from the untouched input column.
        as_number = pd.to_numeric(df[name], errors='coerce')
        df_clean[name] = as_number.fillna(0)

    print(f"Data cleaning complete. Shape: {df_clean.shape}")
    return df_clean

# Numeric copy of the matrix; clean_data works on a copy, so the raw `df` keeps its original values.
df_clean = clean_data(df)

In [None]:
def extract_mastocytosis_genes(df, threshold=0.0, columns=None):
    """Select the rows whose strongest target-column value exceeds ``threshold``.

    A row is kept when the largest absolute value across the target columns
    is strictly greater than ``threshold``.

    Args:
        df: Frame containing the target columns as numeric data.
        threshold: Exclusive lower bound on the row-wise maximum absolute value.
        columns: Target column names. When omitted, the module-level
            ``mastocytosis_columns`` list is used, as before.

    Returns:
        Tuple ``(selected_rows, columns)`` — the filtered frame and the
        column names that were used. The number of selected rows is printed.
    """
    if columns is None:
        # Backward-compatible default: fall back to the notebook-wide list.
        columns = mastocytosis_columns

    strongest = df[list(columns)].abs().max(axis=1)
    mastocytosis_genes = df[strongest > threshold]

    print(f"Found {len(mastocytosis_genes)} genes associated with mastocytosis")
    return mastocytosis_genes, columns

In [None]:
def prepare_classification_data(df, columns=None):
    """Build features and targets from the rows that have any target signal.

    Rows are kept when at least one target column has a non-zero absolute
    value. Features are all remaining columns except the target columns and
    the metadata columns '#.1' and 'Disease'.

    Args:
        df: Frame containing the target columns as numeric data.
        columns: Target column names. When omitted, the module-level
            ``mastocytosis_columns`` list is used, as before.

    Returns:
        Tuple ``(X, y_binary, y_multiclass, mastocytosis_genes)``:
        X is the feature frame for the kept rows; y_binary is 1 for every
        kept row (it is derived from the same condition used to keep rows,
        so it contains a single class); y_multiclass is the name of the
        target column with the largest absolute value in each row;
        mastocytosis_genes is a copy of the kept rows with all columns.
    """
    if columns is None:
        # Backward-compatible default: fall back to the notebook-wide list.
        columns = mastocytosis_columns
    target_cols = list(columns)

    # Compute the absolute target values once and reuse them for the mask and both targets.
    strength = df[target_cols].abs()
    mastocytosis_mask = strength.max(axis=1) > 0
    mastocytosis_genes = df[mastocytosis_mask].copy()

    print(f"Found {len(mastocytosis_genes)} genes associated with mastocytosis")

    kept_strength = strength[mastocytosis_mask]
    y_binary = (kept_strength.max(axis=1) > 0).astype(int)
    y_multiclass = kept_strength.idxmax(axis=1)

    excluded = set(target_cols) | {'#.1', 'Disease'}
    feature_columns = [col for col in df.columns if col not in excluded]
    X = mastocytosis_genes[feature_columns]

    print(f"Features shape: {X.shape}")
    print(f"Binary target distribution:\n{y_binary.value_counts()}")
    print(f"Multi-class target distribution:\n{y_multiclass.value_counts()}")

    return X, y_binary, y_multiclass, mastocytosis_genes

In [None]:
# Print the dtype, up to ten distinct values and the most frequent values of each target column.
# NOTE(review): this inspects the raw `df`, not `df_clean` — confirm which frame is intended.
for col in mastocytosis_columns:
    print(f"{col}: {df[col].dtype}")
    print(f"  Sample values: {df[col].unique()[:10]}")
    print(f"  Value counts: {df[col].value_counts().head()}")
    print()

In [None]:
# Hold out 20% of the rows for testing, stratified on the binary target.
# NOTE(review): `X_nonzero` and `y_binary` are not assigned in any cell visible here —
# presumably they come from prepare_classification_data(); confirm so the notebook
# survives Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X_nonzero, y_binary, test_size=0.2, random_state=42, stratify=y_binary)
# Fit the scaler on the training split only, then apply the same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Multi-class label per row: the name of the target column with the largest absolute value.
# NOTE(review): `mastocytosis_genes` and `X_nonzero` are not assigned at top level in the
# cells visible here — confirm where they are defined.
y_multiclass = mastocytosis_genes[mastocytosis_columns].abs().idxmax(axis=1)

print(f"Multi-class target distribution:\n{y_multiclass.value_counts()}")

# 80/20 split, stratified on the multi-class label.
X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(
    X_nonzero, y_multiclass, test_size=0.2, random_state=42, stratify=y_multiclass
)

# Rebinds `scaler`: from here on it is the scaler fitted on the multi-class training split.
scaler = StandardScaler()
X_train_mc_scaled = scaler.fit_transform(X_train_mc)
X_test_mc_scaled = scaler.transform(X_test_mc)

print(f"\nMulti-class training set: {X_train_mc.shape}")
print(f"Multi-class test set: {X_test_mc.shape}")
print(f"Training target distribution:\n{y_train_mc.value_counts()}")
print("\nTraining multi-class classification models...")

# Candidate classifiers, keyed by the display name used in every report below.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'SVM': SVC(kernel='rbf', random_state=42, class_weight='balanced', probability=True),
    'Logistic Regression': LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000),
    'KNN': KNeighborsClassifier(n_neighbors=3)
}

# name -> {'model', 'accuracy', 'cv_score', 'cv_std', 'predictions'}; only models that trained without error are stored.
model_results = {}

for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")

    try:
        # Fit on the scaled training split and score on the scaled test split.
        model.fit(X_train_mc_scaled, y_train_mc)
        y_pred = model.predict(X_test_mc_scaled)
        accuracy = accuracy_score(y_test_mc, y_pred)
        # 3-fold CV on the training split.
        # NOTE(review): the features were scaled with statistics from the whole training
        # split before the folds are formed, so each validation fold influenced the scaling.
        cv_scores = cross_val_score(model, X_train_mc_scaled, y_train_mc, cv=3, scoring='accuracy')

        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Cross-validation Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        model_results[name] = {
            'model': model,
            'accuracy': accuracy,
            'cv_score': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'predictions': y_pred
        }

        print(f"\nClassification Report for {name}:")
        print(classification_report(y_test_mc, y_pred))

    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"Error training {name}: {e}")
        continue

print("\n" + "="*70)
print("MODEL COMPARISON SUMMARY")
print("="*70)

# One row per successfully trained model, best test accuracy first.
if model_results:
    comparison_df = pd.DataFrame({
        'Model': list(model_results.keys()),
        'Test_Accuracy': [results['accuracy'] for results in model_results.values()],
        'CV_Mean': [results['cv_score'] for results in model_results.values()],
        'CV_Std': [results['cv_std'] for results in model_results.values()]
    })
    comparison_df = comparison_df.sort_values('Test_Accuracy', ascending=False)
    print(comparison_df)
else:
    print("No models were successfully trained.")

print("\n" + "="*70)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*70)

# Rank the input columns by the fitted Random Forest's importances, when that model is available.
if 'Random Forest' in model_results:
    rf_model = model_results['Random Forest']['model']
    if hasattr(rf_model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'feature': X_train_mc.columns,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=False)

        print("Top 20 Most Important Features (Random Forest):")
        print(feature_importance.head(20))
    else:
        print("Feature importance not available for this model type.")
else:
    print("Random Forest model not available for feature importance analysis.")

print(f"\n" + "="*70)
print("DETAILED ANALYSIS - BEST MODEL")
print("="*70)

# "Best" means highest accuracy on the test split.
if model_results:
    best_model_name = max(model_results.keys(), key=lambda x: model_results[x]['accuracy'])
    best_model = model_results[best_model_name]['model']
    best_predictions = model_results[best_model_name]['predictions']

    print(f"Best Model: {best_model_name}")
    print(f"Test Accuracy: {model_results[best_model_name]['accuracy']:.4f}")

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test_mc, best_predictions)
    print(cm)

    print(f"\nDetailed Classification Report:")
    print(classification_report(y_test_mc, best_predictions))

print("\n" + "="*50)
print("MISCLASSIFICATION ANALYSIS")
print("="*50)

if model_results:
    # Re-select the model with the highest test accuracy.
    best_model_name = max(model_results.keys(), key=lambda x: model_results[x]['accuracy'])
    best_predictions = model_results[best_model_name]['predictions']

    # Boolean mask over the test rows whose prediction differs from the true label.
    misclassified_mask = y_test_mc != best_predictions
    misclassified_indices = y_test_mc[misclassified_mask].index

    if len(misclassified_indices) > 0:
        print(f"Number of misclassified genes: {len(misclassified_indices)}")

        # Count how often each (actual, predicted) pair occurs among the errors.
        print("\nMisclassification patterns:")
        misclass_df = pd.DataFrame({
            'Actual': y_test_mc[misclassified_mask],
            'Predicted': best_predictions[misclassified_mask]
        })
        print(misclass_df.value_counts())
    else:
        print("Perfect classification! No misclassified genes.")

print(f"\n" + "="*50)
print("SAVING CLASSIFICATION RESULTS")
print("="*50)

if model_results:
    # Persist the model with the highest test accuracy; files go to the current working directory.
    best_model_name = max(model_results.keys(), key=lambda x: model_results[x]['accuracy'])
    best_model = model_results[best_model_name]['model']

    # File name is derived from the model's display name (lower-cased, spaces -> underscores).
    model_filename = f'best_mastocytosis_classifier_{best_model_name.lower().replace(" ", "_")}.pkl'
    joblib.dump(best_model, model_filename)
    print(f"Best model saved as: {model_filename}")

    # `scaler` is whichever StandardScaler was bound to that name most recently.
    scaler_filename = 'mastocytosis_classification_scaler.pkl'
    joblib.dump(scaler, scaler_filename)
    print(f"Feature scaler saved as: {scaler_filename}")

    # Column order of the training features, stored as a plain list.
    feature_names_filename = 'mastocytosis_classification_features.pkl'
    joblib.dump(list(X_train_mc.columns), feature_names_filename)
    print(f"Feature names saved as: {feature_names_filename}")

    print("\n" + "="*70)
    print("CLASSIFICATION ANALYSIS COMPLETE!")
    print("="*70)
    print(f"Best performing model: {best_model_name}")
    print(f"Test accuracy: {model_results[best_model_name]['accuracy']:.4f}")
    print(f"Cross-validation score: {model_results[best_model_name]['cv_score']:.4f}")

In [None]:
def plot_confusion_matrices_enhanced(model_results, y_test_mc, title="Multi-class Classification"):
    """Draw one annotated confusion-matrix heatmap per model in a two-column grid.

    The grid keeps the 2x2 layout for up to four models and gains rows for
    additional models, so no model is left out. Axis ticks show the class
    labels rather than matrix indices.

    Args:
        model_results: Mapping of model name to a dict with at least
            'predictions' (predicted labels for the test rows) and
            'accuracy' (float shown in the subplot title).
        y_test_mc: True labels of the test rows.
        title: Text appended to the figure's super-title.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        ``model_results`` is empty.
    """
    n_models = len(model_results)
    if n_models == 0:
        print("No model results available for confusion matrices.")
        return

    n_cols = 2
    # At least two rows preserves the original 2x2 figure for four or fewer models.
    n_rows = max(2, (n_models + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows))
    axes = axes.ravel()

    y_true = np.asarray(y_test_mc)

    for ax, (name, results) in zip(axes, model_results.items()):
        y_pred = np.asarray(results['predictions'])
        # Fix the label order explicitly so matrix rows/columns and tick labels agree.
        labels = np.unique(np.concatenate([y_true, y_pred]))
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    ax=ax, cbar=True,
                    xticklabels=labels, yticklabels=labels)

        ax.set_title(f'{name}\nAccuracy: {results["accuracy"]:.4f}', fontsize=12, fontweight='bold')
        ax.set_xlabel('Predicted', fontsize=10)
        ax.set_ylabel('Actual', fontsize=10)
        ax.tick_params(axis='x', rotation=45)
        ax.tick_params(axis='y', rotation=0)

    # Hide grid cells that have no model to show.
    for ax in axes[n_models:]:
        ax.set_visible(False)

    plt.suptitle(f'Confusion Matrices - {title}', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
def plot_model_comparison(model_results):
    """Show a 2x2 dashboard comparing the models' test accuracy and CV scores.

    Panels: test-accuracy bars, CV-mean bars with ±1 std error bars, a
    scatter of test accuracy against CV mean (with the y = x diagonal), and
    a summary table.

    Args:
        model_results: Mapping of model name to a dict with 'accuracy',
            'cv_score' and 'cv_std'. Any number of models is supported.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        ``model_results`` is empty.
    """
    if not model_results:
        print("No model results available for comparison.")
        return

    model_names = list(model_results.keys())
    accuracies = [results['accuracy'] for results in model_results.values()]
    cv_means = [results['cv_score'] for results in model_results.values()]
    cv_stds = [results['cv_std'] for results in model_results.values()]

    # One colour per model from the tab10 palette, whose first four entries are the
    # colours that were previously hard-coded. A fixed four-colour list is too short
    # for a fifth model and makes the scatter call fail; indices wrap after ten.
    colors = plt.cm.tab10(np.arange(len(model_names)) % 10)

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

    bars1 = ax1.bar(model_names, accuracies, color=colors)
    ax1.set_title('Test Accuracy Comparison', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Accuracy')
    ax1.set_ylim(0, 1)
    ax1.tick_params(axis='x', rotation=45)

    # Value label just above each bar.
    for bar, acc in zip(bars1, accuracies):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')

    bars2 = ax2.bar(model_names, cv_means, yerr=cv_stds,
                   color=colors,
                   capsize=5)
    ax2.set_title('Cross-Validation Scores (±1 std)', fontsize=14, fontweight='bold')
    ax2.set_ylabel('CV Score')
    ax2.set_ylim(0, 1)
    ax2.tick_params(axis='x', rotation=45)

    for bar, cv_mean in zip(bars2, cv_means):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{cv_mean:.3f}', ha='center', va='bottom', fontweight='bold')

    ax3.scatter(cv_means, accuracies, s=100, alpha=0.7, c=colors)

    for i, model_name in enumerate(model_names):
        ax3.annotate(model_name, (cv_means[i], accuracies[i]),
                    xytext=(5, 5), textcoords='offset points', fontsize=10)

    ax3.set_xlabel('Cross-Validation Score')
    ax3.set_ylabel('Test Accuracy')
    ax3.set_title('Test Accuracy vs CV Score', fontsize=14, fontweight='bold')
    # Diagonal: points on it have equal CV and test scores.
    ax3.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    ax3.grid(True, alpha=0.3)

    ax4.axis('tight')
    ax4.axis('off')

    table_data = [
        [
            model_name,
            f"{results['accuracy']:.4f}",
            f"{results['cv_score']:.4f}",
            f"±{results['cv_std']:.3f}"
        ]
        for model_name, results in model_results.items()
    ]

    table = ax4.table(cellText=table_data,
                     colLabels=['Model', 'Test Acc', 'CV Mean', 'CV Std'],
                     cellLoc='center',
                     loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1.2, 1.5)
    ax4.set_title('Performance Summary', fontsize=14, fontweight='bold', pad=20)

    plt.tight_layout()
    plt.show()

In [None]:
def plot_feature_importance_enhanced(model_results, X_train_mc, top_n=20):
    """Plot and print the Random Forest feature importances.

    Left panel: horizontal bars for the ``top_n`` highest-ranked features.
    Right panel: histogram of all importances with the mean marked. The
    top features are also printed as a numbered list.

    Args:
        model_results: Mapping that must contain a 'Random Forest' entry whose
            'model' exposes ``feature_importances_``.
        X_train_mc: Object whose ``columns`` give the feature names, in the
            same order as the importances.
        top_n: Number of features to show in the bar chart and the listing.

    Returns:
        None. Prints a notice and returns early when no Random Forest entry exists.
    """
    if 'Random Forest' in model_results:
        forest = model_results['Random Forest']['model']
    else:
        print("Random Forest not available for feature importance analysis")
        return

    ranked = pd.DataFrame({
        'feature': X_train_mc.columns,
        'importance': forest.feature_importances_
    })
    ranked = ranked.sort_values('importance', ascending=False)
    top_features = ranked.head(top_n)
    n_shown = len(top_features)

    fig, (ax_bars, ax_hist) = plt.subplots(1, 2, figsize=(20, 8))

    # Left panel: ranked bars, highest importance at the top.
    palette = plt.cm.viridis(np.linspace(0, 1, n_shown))
    bars = ax_bars.barh(range(n_shown), top_features['importance'], color=palette)

    ax_bars.set_yticks(range(n_shown))
    ax_bars.set_yticklabels(top_features['feature'], fontsize=10)
    ax_bars.set_xlabel('Feature Importance')
    ax_bars.set_title(f'Top {top_n} Most Important Features\n(Random Forest)', fontsize=14, fontweight='bold')
    ax_bars.invert_yaxis()

    # Numeric label to the right of each bar.
    for bar, value in zip(bars, top_features['importance']):
        ax_bars.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
                     f'{value:.4f}', va='center', fontsize=9)

    # Right panel: distribution of all importances with the mean as a reference line.
    mean_importance = ranked['importance'].mean()
    ax_hist.hist(ranked['importance'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    ax_hist.axvline(mean_importance, color='red', linestyle='--',
                    label=f'Mean: {mean_importance:.4f}')
    ax_hist.set_xlabel('Feature Importance')
    ax_hist.set_ylabel('Number of Features')
    ax_hist.set_title('Feature Importance Distribution', fontsize=14, fontweight='bold')
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    separator = "="*60
    print("\n" + separator)
    print(f"TOP {top_n} MOST IMPORTANT FEATURES (RANDOM FOREST)")
    print(separator)
    rank = 0
    for _, row in top_features.iterrows():
        rank += 1
        print(f"{rank:2d}. {row['feature']:<40} {row['importance']:.6f}")

In [None]:
def plot_class_distribution_analysis(y_test_mc, y_multiclass):
    """Show and print the class shares of two label sets side by side.

    The left pie and the first printed section are labelled "Training";
    they are computed from ``y_multiclass``. The right pie and the second
    section are labelled "Test" and are computed from ``y_test_mc``.

    Args:
        y_test_mc: Labels reported as the test set; must support ``value_counts()``.
        y_multiclass: Labels reported as the training set; must support ``value_counts()``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    n_train = len(y_multiclass)
    n_test = len(y_test_mc)
    train_counts = y_multiclass.value_counts()
    test_counts = y_test_mc.value_counts()

    fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(16, 6))

    ax_train.pie(train_counts.values, labels=train_counts.index, autopct='%1.1f%%', startangle=90)
    ax_train.set_title(f'Training Set Class Distribution\n(Total: {n_train} samples)',
                       fontsize=14, fontweight='bold')

    ax_test.pie(test_counts.values, labels=test_counts.index, autopct='%1.1f%%', startangle=90)
    ax_test.set_title(f'Test Set Class Distribution\n(Total: {n_test} samples)',
                      fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

    rule = "="*50
    print("\n" + rule)
    print("CLASS DISTRIBUTION ANALYSIS")
    print(rule)

    # Per-class count and share for each label set.
    print("Training set:")
    for class_name, count in train_counts.items():
        print(f"  {class_name}: {count} ({count/n_train*100:.1f}%)")

    print("\nTest set:")
    for class_name, count in test_counts.items():
        print(f"  {class_name}: {count} ({count/n_test*100:.1f}%)")

In [None]:
def plot_detailed_misclassification_analysis(model_results, y_test_mc):
    """Plot count and row-percentage confusion matrices for the most accurate model.

    The model with the highest 'accuracy' entry is selected. Its confusion
    matrix is drawn twice — absolute counts and percentages of each actual
    class — with class labels on both axes, and every (actual → predicted)
    error pair is printed with its count.

    Args:
        model_results: Mapping of model name to a dict with 'accuracy' and
            'predictions' (predicted labels for the test rows).
        y_test_mc: True labels of the test rows.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        ``model_results`` is empty.
    """
    if not model_results:
        print("No model results available for misclassification analysis.")
        return

    best_model_name = max(model_results.keys(), key=lambda x: model_results[x]['accuracy'])
    best_predictions = model_results[best_model_name]['predictions']

    misclass_df = pd.DataFrame({
        'Actual': y_test_mc,
        'Predicted': best_predictions
    })

    y_true = np.asarray(y_test_mc)
    y_pred = np.asarray(best_predictions)
    # Fix the label order explicitly so matrix rows/columns and tick labels agree.
    labels = np.unique(np.concatenate([y_true, y_pred]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # A class that is predicted but never occurs in y_true has an all-zero row;
    # divide by 1 there so the row stays 0% instead of becoming NaN.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    safe_totals = np.where(row_totals == 0, 1, row_totals)
    cm_percent = cm.astype('float') / safe_totals * 100

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
                xticklabels=labels, yticklabels=labels)
    ax1.set_title(f'Confusion Matrix - Absolute Counts\n({best_model_name})',
                 fontsize=14, fontweight='bold')
    ax1.set_xlabel('Predicted')
    ax1.set_ylabel('Actual')

    sns.heatmap(cm_percent, annot=True, fmt='.1f', cmap='Reds', ax=ax2,
                xticklabels=labels, yticklabels=labels)
    ax2.set_title(f'Confusion Matrix - Percentages\n({best_model_name})',
                 fontsize=14, fontweight='bold')
    ax2.set_xlabel('Predicted')
    ax2.set_ylabel('Actual')

    plt.tight_layout()
    plt.show()

    # Positional mask of the test rows that were predicted incorrectly.
    misclassified_mask = y_true != y_pred
    if misclassified_mask.sum() > 0:
        print("\n" + "="*60)
        print("MISCLASSIFICATION PATTERNS")
        print("="*60)

        misclass_patterns = misclass_df[misclassified_mask].groupby(['Actual', 'Predicted']).size()
        for (actual, predicted), count in misclass_patterns.items():
            print(f"{actual} → {predicted}: {count} cases")
    else:
        print("\nPerfect classification! No misclassifications found.")

In [None]:
# Render every diagnostic figure for the fitted models, in a fixed order.
print("\nGenerating model comparison plots...")
plot_model_comparison(model_results)

print("\nGenerating enhanced confusion matrices...")
plot_confusion_matrices_enhanced(model_results, y_test_mc, "Mastocytosis Subtype Classification")

print("\nGenerating feature importance analysis...")
plot_feature_importance_enhanced(model_results, X_train_mc, top_n=25)

print("\nGenerating class distribution analysis...")
# Pass the training labels: the second argument is reported as the "Training Set",
# whereas y_multiclass holds the labels of all rows (training and test together).
plot_class_distribution_analysis(y_test_mc, y_train_mc)

print("\nGenerating misclassification analysis...")
plot_detailed_misclassification_analysis(model_results, y_test_mc)

print("\nALL VISUALIZATIONS COMPLETE!")
print("Generated plots:")
print("• Model performance comparison")
print("• Enhanced confusion matrices")
print("• Feature importance analysis")
print("• Class distribution analysis")
print("• Detailed misclassification analysis")

In [None]:
class AttentionClassifier(nn.Module):
    """Feed-forward classifier with a per-feature attention gate on its input.

    A small scoring network produces one weight per input feature
    (softmax-normalised across features). The input is re-weighted by these
    weights, encoded by a two-layer MLP and classified.

    The previous layout scored each sample with a single scalar and applied
    softmax over that size-1 dimension, which always yields 1.0: the
    attention had no effect and the returned weights had shape (batch, 1),
    so they could not be matched to feature names.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes (logits per sample).
        hidden_dim: Width of the encoder layers.
        attention_dim: Width of the attention scoring layer.
    """

    def __init__(self, input_dim, num_classes, hidden_dim=256, attention_dim=128):
        super(AttentionClassifier, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.attention_dim = attention_dim
        self.num_classes = num_classes

        # Attention scorer: input -> attention_dim -> one score per input feature.
        self.attention_weights = nn.Linear(input_dim, attention_dim)
        self.attention_context = nn.Linear(attention_dim, input_dim, bias=False)

        self.feature_encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim // 2, num_classes)
        )

    def forward(self, x):
        """Classify a batch and return the attention placed on each input feature.

        Args:
            x: Float tensor of shape (batch, input_dim).

        Returns:
            Tuple ``(output, attention_weights)``: logits of shape
            (batch, num_classes) and attention weights of shape
            (batch, input_dim) whose rows sum to 1.
        """
        attention_scores = torch.tanh(self.attention_weights(x))
        attention_weights = F.softmax(self.attention_context(attention_scores), dim=1)

        # Multiply by input_dim so that uniform attention (1 / input_dim each)
        # leaves the input unchanged instead of shrinking it towards zero.
        attended_features = x * attention_weights * self.input_dim

        encoded_features = self.feature_encoder(attended_features)
        output = self.classifier(encoded_features)

        return output, attention_weights

In [None]:
def train_attention_model(X_train, y_train, X_test, y_test, num_epochs=100, batch_size=32, learning_rate=0.001):
    """Train an AttentionClassifier and evaluate it once on the test data.

    Args:
        X_train: Training features; ``X_train.shape[1]`` is used as the input width.
        y_train: Training labels; encoded to integer ids with a LabelEncoder fitted here.
        X_test: Test features.
        y_test: Test labels; encoded with the encoder fitted on ``y_train``.
        num_epochs: Number of passes over the training data.
        batch_size: Mini-batch size of the training DataLoader.
        learning_rate: Initial Adam learning rate.

    Returns:
        Tuple ``(model, test_accuracy, test_pred_labels, test_attention,
        label_encoder, train_losses, train_accuracies)``. ``test_pred_labels``
        are the predictions mapped back to the original labels;
        ``test_attention`` is the attention tensor the model returned for the
        test batch; the two lists hold one value per epoch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Map labels to integer class ids; the same encoder later maps predictions back.
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # Both splits are moved to the device in full.
    X_train_tensor = torch.FloatTensor(X_train).to(device)
    y_train_tensor = torch.LongTensor(y_train_encoded).to(device)
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    y_test_tensor = torch.LongTensor(y_test_encoded).to(device)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    input_dim = X_train.shape[1]
    num_classes = len(label_encoder.classes_)

    model = AttentionClassifier(input_dim, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    # Halve the learning rate when the monitored value has not improved for 10 epochs.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=10, factor=0.5)

    train_losses = []
    train_accuracies = []

    print(f"\nTraining Attention-Based Deep Learning Classifier...")
    print(f"Model architecture: {input_dim} -> {model.hidden_dim} -> {num_classes} classes")
    print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        correct_predictions = 0
        total_samples = 0

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()

            outputs, attention_weights = model(batch_X)
            loss = criterion(outputs, batch_y)

            loss.backward()
            optimizer.step()

            # Running totals for the epoch-level loss and training accuracy.
            epoch_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            correct_predictions += (predicted == batch_y).sum().item()
            total_samples += batch_y.size(0)

        # Mean of the per-batch losses (not weighted by batch size).
        avg_loss = epoch_loss / len(train_loader)
        accuracy = correct_predictions / total_samples

        train_losses.append(avg_loss)
        train_accuracies.append(accuracy)

        # The scheduler monitors the training loss; no validation split is used here.
        scheduler.step(avg_loss)

        if (epoch + 1) % 20 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Single evaluation pass over the whole test set.
    model.eval()
    with torch.no_grad():
        test_outputs, test_attention = model(X_test_tensor)
        _, test_predicted = torch.max(test_outputs.data, 1)
        test_accuracy = (test_predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)

        # Convert predicted class ids back to the original labels.
        test_pred_labels = label_encoder.inverse_transform(test_predicted.cpu().numpy())

    return model, test_accuracy, test_pred_labels, test_attention, label_encoder, train_losses, train_accuracies

In [None]:
def get_feature_attention_importance(model, X_test, feature_names, top_k=20):
    """Rank features by the attention the model places on them, averaged over samples.

    Args:
        model: Module whose forward pass returns ``(outputs, attention_weights)``;
            the attention must hold one weight per feature for every sample.
        X_test: Feature rows to run through the model.
        feature_names: One name per feature, in input-column order.
        top_k: Number of highest-ranked features to return.

    Returns:
        DataFrame with columns 'feature' and 'attention_weight', sorted by
        descending weight and limited to ``top_k`` rows.

    Raises:
        ValueError: If the number of averaged attention weights differs from
            the number of feature names.
    """
    device = next(model.parameters()).device
    model.eval()

    with torch.no_grad():
        X_test_tensor = torch.FloatTensor(X_test).to(device)
        _, attention_weights = model(X_test_tensor)

    # Average over the sample dimension to get one weight per attended position.
    avg_attention = attention_weights.mean(dim=0).cpu().numpy().flatten()

    feature_names = list(feature_names)
    if len(avg_attention) != len(feature_names):
        raise ValueError(
            f"Attention has {len(avg_attention)} weight(s) per sample but "
            f"{len(feature_names)} feature names were given; the model must "
            "return one attention weight per input feature."
        )

    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'attention_weight': avg_attention
    }).sort_values('attention_weight', ascending=False)

    return feature_importance.head(top_k)

print("\nTraining Deep Learning Models with Attention Mechanism...")

# Train on the scaled multi-class split; the defaults for epochs and batch size are overridden here.
attention_model, attention_accuracy, attention_predictions, attention_weights, label_encoder, train_losses, train_accs = train_attention_model(
    X_train_mc_scaled, y_train_mc, X_test_mc_scaled, y_test_mc,
    num_epochs=150, batch_size=16, learning_rate=0.001
)

In [None]:
# Classical baselines to compare against the attention model.
# NOTE(review): same four estimators and settings as the earlier `models` dict, and
# `model_results` is rebuilt from scratch below — the earlier training is repeated.
traditional_models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'SVM': SVC(kernel='rbf', random_state=42, class_weight='balanced', probability=True),
    'Logistic Regression': LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000),
    'KNN': KNeighborsClassifier(n_neighbors=3)
}

# Rebinds `model_results`; entries stored by earlier cells are discarded.
model_results = {}

for name, model in traditional_models.items():
    print(f"\nTraining {name}...")
    try:
        # Fit on the scaled training split, score on the scaled test split, then run 3-fold CV.
        model.fit(X_train_mc_scaled, y_train_mc)
        y_pred = model.predict(X_test_mc_scaled)
        accuracy = accuracy_score(y_test_mc, y_pred)
        cv_scores = cross_val_score(model, X_train_mc_scaled, y_train_mc, cv=3, scoring='accuracy')

        model_results[name] = {
            'model': model,
            'accuracy': accuracy,
            'cv_score': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'predictions': y_pred
        }
        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Cross-validation Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"Error training {name}: {e}")
        continue

# Register the neural network next to the classical models so it appears in the same table.
# It was scored on the held-out test split only — no cross-validation was run for it — so
# the CV fields are NaN instead of repeating the test accuracy with a zero spread.
model_results['Attention Deep Learning'] = {
    'model': attention_model,
    'accuracy': attention_accuracy,
    'cv_score': np.nan,
    'cv_std': np.nan,
    'predictions': attention_predictions
}

print(f"\n{'='*70}")
print("ENHANCED MODEL COMPARISON WITH DEEP LEARNING")
print(f"{'='*70}")

# One row per model, best test accuracy first.
comparison_data = [
    {
        'Model': name,
        'Test_Accuracy': results['accuracy'],
        'CV_Mean': results['cv_score'],
        'CV_Std': results['cv_std']
    }
    for name, results in model_results.items()
]

comparison_df = pd.DataFrame(comparison_data).sort_values('Test_Accuracy', ascending=False)
print(comparison_df)

print(f"\n{'='*70}")
print("ATTENTION-BASED FEATURE IMPORTANCE ANALYSIS")
print(f"{'='*70}")

# Average the model's attention over the test rows and pair it with the training column names.
# NOTE(review): the helper builds a frame from the averaged weights and these names, so the
# model has to return one attention weight per input column — confirm the shapes agree.
attention_importance = get_feature_attention_importance(
    attention_model, X_test_mc_scaled, X_train_mc.columns, top_k=25
)

print("Top 25 Most Important Features (Attention Mechanism):")
for i, (_, row) in enumerate(attention_importance.iterrows(), 1):
    print(f"{i:2d}. {row['feature']:<40} {row['attention_weight']:.6f}")

print(f"\n{'='*70}")
print("DEEP LEARNING MODEL ANALYSIS")
print(f"{'='*70}")
print(f"Attention-Based Deep Learning Accuracy: {attention_accuracy:.4f}")
# Total parameter count, then the subset that receives gradients.
print(f"Model Parameters: {sum(p.numel() for p in attention_model.parameters()):,}")
print(f"Trainable Parameters: {sum(p.numel() for p in attention_model.parameters() if p.requires_grad):,}")

print("\nClassification Report (Attention Model):")
print(classification_report(y_test_mc, attention_predictions))

# Three side-by-side panels: training loss, training accuracy, and test accuracy per model.
plt.figure(figsize=(15, 5))

# Panel 1: mean training loss per epoch.
plt.subplot(1, 3, 1)
plt.plot(train_losses)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)

# Panel 2: training accuracy per epoch.
plt.subplot(1, 3, 2)
plt.plot(train_accs)
plt.title('Training Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid(True, alpha=0.3)

# Panel 3: test accuracy of every model; entries with 'Attention' in their name are drawn in red.
plt.subplot(1, 3, 3)
models_comp = [name for name in model_results.keys()]
accuracies_comp = [results['accuracy'] for results in model_results.values()]
colors = ['red' if 'Attention' in name else 'skyblue' for name in models_comp]

plt.bar(models_comp, accuracies_comp, color=colors)
plt.title('Model Comparison: Traditional vs Deep Learning')
plt.ylabel('Test Accuracy')
plt.xticks(rotation=45)
# Value label just above each bar.
for i, acc in enumerate(accuracies_comp):
    plt.text(i, acc + 0.01, f'{acc:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\n{'='*50}")
print("🧠 DEEP LEARNING ANALYSIS COMPLETE!")
print(f"{'='*50}")
# Winner is chosen by test accuracy alone.
best_model_name = max(model_results.keys(), key=lambda x: model_results[x]['accuracy'])
print(f"🏆 Best Model: {best_model_name}")
print(f"📊 Best Accuracy: {model_results[best_model_name]['accuracy']:.4f}")

# Closing message depends on whether the winning entry's name contains 'Attention'.
if 'Attention' in best_model_name:
    print("🎯 Novel deep learning approach outperformed traditional methods!")
    print("✨ Attention mechanism provides interpretable feature importance")
else:
    print("📈 Traditional ML competitive, but attention provides interpretability")