In [13]:
#data handling
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as plt
# import seaborn as sns

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler

#classification
from sklearn.ensemble import RandomForestClassifier

# performance metrics
from sklearn.metrics import f1_score,precision_score, recall_score,make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

#Pipeline
from sklearn.pipeline import Pipeline

#Model Selection
from sklearn.model_selection import RandomizedSearchCV

# Decomposition
from sklearn.decomposition import PCA

In [3]:
data = pd.read_csv('data.csv')
labels = pd.read_csv('labels.csv')

# Rename columns to ensure the keys match
data.rename(columns={data.columns[0]: "sample_id"}, inplace=True)
labels.rename(columns={labels.columns[0]: "sample_id"}, inplace=True)

# Merge on sample_id
cancer_data = pd.merge(data, labels, on='sample_id')

# View result
print(cancer_data.head())

  sample_id  gene_0    gene_1    gene_2    gene_3     gene_4  gene_5  \
0  sample_0     0.0  2.017209  3.265527  5.478487  10.431999     0.0   
1  sample_1     0.0  0.592732  1.588421  7.586157   9.623011     0.0   
2  sample_2     0.0  3.511759  4.327199  6.881787   9.870730     0.0   
3  sample_3     0.0  3.663618  4.507649  6.659068  10.196184     0.0   
4  sample_4     0.0  2.655741  2.821547  6.539454   9.738265     0.0   

     gene_6    gene_7  gene_8  ...  gene_20522  gene_20523  gene_20524  \
0  7.175175  0.591871     0.0  ...    8.210257    9.723516    7.220030   
1  6.816049  0.000000     0.0  ...    7.323865    9.740931    6.256586   
2  6.972130  0.452595     0.0  ...    8.127123   10.908640    5.401607   
3  7.843375  0.434882     0.0  ...    8.792959   10.141520    8.942805   
4  6.566967  0.360982     0.0  ...    8.891425   10.373790    7.181162   

   gene_20525  gene_20526  gene_20527  gene_20528  gene_20529  gene_20530  \
0    9.119813   12.003135    9.650743    8.92

In [4]:
# Step 3: Feature and Target Separation
X = cancer_data.drop(['sample_id', 'Class'], axis=1)
y = cancer_data['Class']

# Step 4: Label Encoding
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 5: Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
pca = PCA(n_components=0.95)  # Keep components explaining 95% of variance
X_pca = pca.fit_transform(X_scaled)
print(f"Number of PCA components: {X_pca.shape[1]}")

# Step 6: Train-Test Split (create a holdout set)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_pca, y_encoded, test_size=0.1, random_state=42, stratify=y_encoded
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

Number of PCA components: 530


In [7]:
def accuracy_score(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    correct = np.sum(y_true == y_pred)
    total = len(y_true)
    return correct / total

In [14]:
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.3]
}

rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, n_iter=20, cv=5, scoring='accuracy',
    random_state=42, n_jobs=-1
)
random_search.fit(X_train, y_train)

# Best model
best_rf = random_search.best_estimator_
print("Best hyperparameters:", random_search.best_params_)
print("Best cross-validation accuracy:", random_search.best_score_)

# Step 9: Cross-Validation on Training Set
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Step 10: Evaluate on Validation Set
y_val_pred = best_rf.fit(X_train, y_train).predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print(f"Validation accuracy: {val_acc:.4f}")
print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred, target_names=le.classes_))

# Step 11: Evaluate on Holdout Set
y_holdout_pred = best_rf.predict(X_holdout)
holdout_acc = accuracy_score(y_holdout, y_holdout_pred)
print(f"Holdout accuracy: {holdout_acc:.4f}")
print("\nHoldout Classification Report:")
print(classification_report(y_holdout, y_holdout_pred, target_names=le.classes_))

Best hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 0.3, 'max_depth': None}
Best cross-validation accuracy: 0.9670164917541229
Cross-validation accuracy: 0.9670 ± 0.0186
Validation accuracy: 0.9722

Validation Classification Report:
              precision    recall  f1-score   support

        BRCA       0.95      1.00      0.97        54
        COAD       1.00      1.00      1.00        14
        KIRC       1.00      1.00      1.00        26
        LUAD       0.96      0.96      0.96        26
        PRAD       1.00      0.88      0.93        24

    accuracy                           0.97       144
   macro avg       0.98      0.97      0.97       144
weighted avg       0.97      0.97      0.97       144

Holdout accuracy: 0.9506

Holdout Classification Report:
              precision    recall  f1-score   support

        BRCA       0.91      1.00      0.95        30
        COAD       1.00      0.88      0.93         8
   

In [15]:
# Step 12: Save Model, Scaler, Label Encoder, and PCA
import pickle
with open("rf_model.pkl", "wb") as file:
    pickle.dump(best_rf, file)
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)
with open("label_encoder.pkl", "wb") as file:
    pickle.dump(le, file)
with open("pca.pkl", "wb") as file:
    pickle.dump(pca, file)

# Step 13: Feature Importance (if PCA not used, optional)
if X_pca.shape[1] == X_scaled.shape[1]:  # If PCA not applied
    importances = best_rf.feature_importances_
    indices = np.argsort(importances)[::-1][:10]  # Top 10 features
    plt.figure(figsize=(10, 6))
    plt.bar(range(10), importances[indices], align='center')
    plt.xticks(range(10), X.columns[indices], rotation=45)
    plt.title('Top 10 Feature Importances')
    plt.show()