# Core (Always run)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score, classification_report, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, label_binarize
from itertools import cycle
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold

Global Variables

In [None]:
# Directory holding the input data. It is joined to the file name by plain
# string concatenation when the dataset is loaded, hence the trailing slash.
path_to_data = "data/"
# File name of the parquet dataset inside `path_to_data`.
dataset_file_name = "dataset.pq"

# Preprocess

## Load datasets

In [None]:
# Read the parquet dataset from the configured directory and file name.
dataset_path = path_to_data + dataset_file_name
df = pd.read_parquet(dataset_path)

## Cleaning

Transpose

In [None]:
# Swap rows and columns; the following cell looks up df.index values as sample IDs.
df = df.T

print(f'Dataframe shape after transpose: {df.shape}')

# Preview the first rows of the transposed frame.
df.head()

Apply subtypes

In [None]:
# Attach each sample's PAM50 subtype from the annotation spreadsheet.
excell_sheet_df = pd.read_excel('./assets/subtype_sheet.xlsx', sheet_name='RNA-Seq 1148')

# Build the Sample ID -> PAM50 lookup once instead of rescanning the sheet for
# every sample. Keeping the first row per Sample ID matches taking the first
# match, and gives the lookup a unique index (required by `map`).
subtype_by_sample = (
    excell_sheet_df
    .drop_duplicates(subset='Sample ID', keep='first')
    .set_index('Sample ID')['PAM50']
)

# Vectorized assignment. Samples that are absent from the sheet get NaN, and
# the 'Subtype' column is always created, even when nothing matches.
df['Subtype'] = df.index.to_series().map(subtype_by_sample).to_numpy()

# One summary line instead of two printed lines per sample.
n_labelled = int(df['Subtype'].notna().sum())
print(f'Subtype found for {n_labelled} of {len(df)} samples')

df.head()

Look for NaN

In [None]:
# Total number of missing cells across the whole frame.
missing_total = df.isna().sum().sum()

if missing_total == 0:
    print("Dataframe does not contain missing values.")
else:
    print("Dataframe contains missing values. Dropping missing values.")
    print(f'Number of missing values: {missing_total}')

    # Remove every row that has at least one missing value.
    df = df.dropna()

    print("Missing values dropped.")
    print(f'Number of remaining missing values: {df.isna().sum().sum()}')

# Exploratory Data Analysis (EDA)

Plot 1: Subtype distribution plot

More info about the subtypes in this paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC6985186/

In [None]:
# Work on a copy so plotting helpers never touch the training frame.
plot_df = df.copy()

# Bars are ordered from the most to the least frequent subtype.
subtype_order = plot_df['Subtype'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=plot_df, x='Subtype', order=subtype_order, ax=ax)
ax.set_title('Distribution of Subtypes')
ax.set_xlabel('Subtype')
ax.set_ylabel('Count')
plt.show()

Plot 2: Scatter plot

Observation: The PCA projection contains a few outliers. TODO: decide how to handle them (see the discussion linked below).

https://stats.stackexchange.com/questions/533503/when-should-you-remove-outliers-entire-dataset-or-train-dataset

In [None]:
# 2-component PCA of the log-transformed, standardized numeric columns,
# plotted as a scatter coloured by subtype.

# Numeric feature columns only. The PCA1/PCA2 columns written further down are
# excluded, so re-running this cell does not feed an earlier projection back
# in as input features.
pca_feature_df = (
    plot_df
    .drop(columns=['PCA1', 'PCA2'], errors='ignore')
    .select_dtypes(include=np.number)
)
x_log_transformed = np.log1p(pca_feature_df)

# Standardize each column before PCA.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(x_log_transformed)

PCA_model = PCA(n_components=2)
pca_result = PCA_model.fit_transform(df_scaled)

# Store the two components on the plotting frame for this and later plots.
plot_df['PCA1'] = pca_result[:, 0]
plot_df['PCA2'] = pca_result[:, 1]

plt.figure(figsize=(10, 6))
sns.scatterplot(data=plot_df, x='PCA1', y='PCA2', hue='Subtype', palette='Set2')
plt.title('PCA Scatter Plot Colored by Subtype')
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.legend(title='Subtype')
plt.show()

Plot 2.1: Scatter plot with outliers removed

In [None]:
# Keep only the points whose first component is below 2000 (PCA2 is not filtered).
pca1_below_cutoff = plot_df['PCA1'] < 2000
filtered_plot_df = plot_df.loc[pca1_below_cutoff]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=filtered_plot_df, x='PCA1', y='PCA2', hue='Subtype', palette='Set2', ax=ax)
ax.set_title('PCA Scatter Plot with PCA1 < 2000 Colored by Subtype')
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.legend(title='Subtype')
plt.show()

# Training

Labeling

In [None]:
# Feature matrix: every column except the target.
X = df.drop('Subtype', axis=1)

# Target: subtype names encoded as integers; `encoder` is kept so predictions
# can be mapped back to subtype names later.
encoder = LabelEncoder()
y = encoder.fit_transform(df['Subtype'])

Normalization - log2(x + 1)

In [None]:
# log2(x + 1) transform of the feature matrix. It is derived from `df` rather
# than from `X` itself, so re-running this cell cannot apply the transform twice.
X = np.log2(df.drop(columns=['Subtype']) + 1)

Train test split

In [None]:
# Hold out 20% of the samples for testing, with a fixed seed for reproducibility.
# `stratify=y` keeps the subtype proportions the same in both partitions, so a
# rare subtype cannot end up missing from the test set by chance.
# Note: scikit-learn raises ValueError if any subtype has fewer than 2 samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Variance Threshold

In [None]:
# Learn which features have variance above 0.1 from the training split only,
# then apply the same column mask to both splits.
variance_filter = VarianceThreshold(threshold=0.1)
variance_filter.fit(X_train)

X_train_filtered = variance_filter.transform(X_train)
X_test_filtered = variance_filter.transform(X_test)

Feature selection

In [None]:
# Rank the remaining features by ANOVA F-score against the training labels and
# keep the 50 best; the test split is reduced to the same columns.
SelectKBest_model = SelectKBest(score_func=f_classif, k=50)
SelectKBest_model.fit(X_train_filtered, y_train)

X_train_selected = SelectKBest_model.transform(X_train_filtered)
X_test_selected = SelectKBest_model.transform(X_test_filtered)

Random Forest

In [None]:
# Fit a 100-tree random forest on the selected features (fit returns the estimator).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_selected, y_train
)

# Score the model on the held-out test split.
y_pred_rf = rf_classifier.predict(X_test_selected)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f'Random Forest Classifier Accuracy: {accuracy_rf:.6f}')

Logistic Regression

In [None]:
# Fit logistic regression on the selected features, with a raised iteration cap
# (fit returns the estimator).
logreg_classifier = LogisticRegression(max_iter=1500, random_state=42).fit(
    X_train_selected, y_train
)

# Score the model on the held-out test split.
y_pred_lr = logreg_classifier.predict(X_test_selected)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

print(f'Logistic Regression Classifier Accuracy: {accuracy_lr:.6f}')

XGBoost

Parameters for XGBClassifier: https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py

In [None]:
# Hyper-parameters collected in one place so they are easy to scan and tune.
xgb_params = {
    'tree_method': 'auto',
    'n_estimators': 100,
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'max_depth': 6,
}

xgb_classifier = XGBClassifier(**xgb_params)
xgb_classifier.fit(X_train_selected, y_train)

# Score the model on the held-out test split.
y_pred_xgb = xgb_classifier.predict(X_test_selected)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f'XGBoost Classifier Accuracy: {accuracy_xgb:.6f}')

# Training Results Analysis

## Classification Report and Confusion Matrix

In [None]:
# Map encoded labels back to subtype names so the report rows are readable.
true_subtypes_rf = encoder.inverse_transform(y_test)
predicted_subtypes_rf = encoder.inverse_transform(y_pred_rf)

print("Classification Report for Random Forest:")
print(classification_report(true_subtypes_rf, predicted_subtypes_rf))

In [None]:
# Confusion matrix of the Random Forest predictions on the test split.
# `labels=encoder.classes_` pins the matrix rows/columns to the same class list
# (and order) used for the tick labels. Without it the matrix only covers labels
# that occur in y_test or the predictions, which can misalign with the ticks.
cm = confusion_matrix(
    encoder.inverse_transform(y_test),
    encoder.inverse_transform(y_pred_rf),
    labels=encoder.classes_,
)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.title('Confusion Matrix for Random Forest Classifier')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Map encoded labels back to subtype names so the report rows are readable.
true_subtypes_lr = encoder.inverse_transform(y_test)
predicted_subtypes_lr = encoder.inverse_transform(y_pred_lr)

print("Classification Report for Logistic Regression:")
print(classification_report(true_subtypes_lr, predicted_subtypes_lr))

In [None]:
# Confusion matrix of the Logistic Regression predictions on the test split.
# `labels=encoder.classes_` pins the matrix rows/columns to the same class list
# (and order) used for the tick labels. Without it the matrix only covers labels
# that occur in y_test or the predictions, which can misalign with the ticks.
cm = confusion_matrix(
    encoder.inverse_transform(y_test),
    encoder.inverse_transform(y_pred_lr),
    labels=encoder.classes_,
)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.title('Confusion Matrix for Logistic Regression Classifier')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Map encoded labels back to subtype names so the report rows are readable.
true_subtypes_xgb = encoder.inverse_transform(y_test)
predicted_subtypes_xgb = encoder.inverse_transform(y_pred_xgb)

print("Classification Report for XGBoost Classifier:")
print(classification_report(true_subtypes_xgb, predicted_subtypes_xgb))

In [None]:
# Confusion matrix of the XGBoost predictions on the test split.
# `labels=encoder.classes_` pins the matrix rows/columns to the same class list
# (and order) used for the tick labels. Without it the matrix only covers labels
# that occur in y_test or the predictions, which can misalign with the ticks.
cm = confusion_matrix(
    encoder.inverse_transform(y_test),
    encoder.inverse_transform(y_pred_xgb),
    labels=encoder.classes_,
)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Oranges', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.title('Confusion Matrix for XGBoost Classifier')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

## ROC Curve

In [None]:
# Shared bookkeeping for the ROC cells: subtype names, how many there are, and
# the matching range of encoded integer labels.
class_labels = encoder.classes_
number_of_classes = len(class_labels)
classes = range(number_of_classes)

Random Forest

In [None]:
# One-vs-rest ROC curves for the Random Forest, one curve per class.
y_pred_proba_rf = rf_classifier.predict_proba(X_test_selected)
y_test_binarized = label_binarize(y_test, classes=classes)

fig, ax = plt.subplots(figsize=(10, 8))
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red'])

for i, color in zip(range(number_of_classes), colors):
    # Pair column i of the binarized labels with column i of the probabilities.
    fpr, tpr, _ = roc_curve(y_test_binarized[:, i], y_pred_proba_rf[:, i])
    roc_auc = auc(fpr, tpr)

    curve_label = f'ROC curve of class {class_labels[i]} (area = {roc_auc:.2f})'
    ax.plot(fpr, tpr, color=color, lw=2, label=curve_label)

# Diagonal reference line.
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance (AUC = 0.50)')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves for Random Forest Classifier')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

Logistic Regression

In [None]:
# One-vs-rest ROC curves for Logistic Regression, one curve per class.
y_pred_proba_lr = logreg_classifier.predict_proba(X_test_selected)
y_test_binarized = label_binarize(y_test, classes=classes)

fig, ax = plt.subplots(figsize=(10, 8))
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red'])

for i, color in zip(range(number_of_classes), colors):
    # Pair column i of the binarized labels with column i of the probabilities.
    fpr, tpr, _ = roc_curve(y_test_binarized[:, i], y_pred_proba_lr[:, i])
    roc_auc = auc(fpr, tpr)

    curve_label = f'ROC curve of class {class_labels[i]} (area = {roc_auc:.2f})'
    ax.plot(fpr, tpr, color=color, lw=2, label=curve_label)

# Diagonal reference line.
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance (AUC = 0.50)')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves for Logistic Regression Classifier')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

XGBoost

In [None]:
# One-vs-rest ROC curves for XGBoost, one curve per class.
y_pred_proba_xgb = xgb_classifier.predict_proba(X_test_selected)
y_test_binarized = label_binarize(y_test, classes=classes)

fig, ax = plt.subplots(figsize=(10, 8))
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red'])

for i, color in zip(range(number_of_classes), colors):
    # Pair column i of the binarized labels with column i of the probabilities.
    fpr, tpr, _ = roc_curve(y_test_binarized[:, i], y_pred_proba_xgb[:, i])
    roc_auc = auc(fpr, tpr)

    curve_label = f'ROC curve of class {class_labels[i]} (area = {roc_auc:.2f})'
    ax.plot(fpr, tpr, color=color, lw=2, label=curve_label)

# Diagonal reference line.
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance (AUC = 0.50)')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves for XGBoost Classifier')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

## Train F1 vs Test F1

Random Forest

In [None]:
# Compare weighted F1 on the training and test splits; a large gap suggests overfitting.
train_pred_rf = rf_classifier.predict(X_train_selected)

train_f1_rf = f1_score(y_train, train_pred_rf, average='weighted')
test_f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
f1_gap_rf = train_f1_rf - test_f1_rf

print(f'Training F1 Score for Random Forest: {train_f1_rf:.6f}')
print(f'Test F1 Score for Random Forest: {test_f1_rf:.6f}')
print(f'F1 Score Difference for Random Forest: {f1_gap_rf:.6f}')

Logistic Regression

In [None]:
# Compare weighted F1 on the training and test splits; a large gap suggests overfitting.
train_pred_lr = logreg_classifier.predict(X_train_selected)

train_f1_lr = f1_score(y_train, train_pred_lr, average='weighted')
test_f1_lr = f1_score(y_test, y_pred_lr, average='weighted')
f1_gap_lr = train_f1_lr - test_f1_lr

print(f'Training F1 Score for Logistic Regression: {train_f1_lr:.6f}')
print(f'Test F1 Score for Logistic Regression: {test_f1_lr:.6f}')
print(f'F1 Score Difference for Logistic Regression: {f1_gap_lr:.6f}')

XGBoost

In [None]:
# Compare weighted F1 on the training and test splits; a large gap suggests overfitting.
train_pred_xgb = xgb_classifier.predict(X_train_selected)

train_f1_xgb = f1_score(y_train, train_pred_xgb, average='weighted')
test_f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
f1_gap_xgb = train_f1_xgb - test_f1_xgb

print(f'Training F1 Score for XGBoost: {train_f1_xgb:.6f}')
print(f'Test F1 Score for XGBoost: {test_f1_xgb:.6f}')
print(f'F1 Score Difference for XGBoost: {f1_gap_xgb:.6f}')