# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  # might use this later
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter


# Load the expression table. Judging by the file name it holds VST-transformed
# values for DESeq2-significant genes plus per-sample metadata columns.
df = pd.read_csv("/content/LIHC_DESeq_significant_vst_WITH_META.csv")
print("shape of", df.shape)

print(df["Stage_main"].value_counts())

# Columns that describe the sample rather than a gene; every other column is
# treated as a gene feature.
metadata_cols = [
    "Sample_ID_full",
    "Sample_short",
    "Stage_raw",
    "Stage_main",
    "Patient_ID",
    "Diagnosis_Age",
]

gene_columns = [col for col in df.columns if col not in metadata_cols]
genes_df = df[gene_columns]

# Feature matrix and label vector as plain arrays for scikit-learn.
y = df["Stage_main"].values
X = genes_df.values

_, n_gene_features = genes_df.shape
print("\nTotal DESeq2-significant gene features:", n_gene_features)
print("Stage distribution before filtering:")
print(pd.Series(y).value_counts())


# Restrict the arrays to samples whose raw Stage_main label is one of the
# three main stages.
valid_stages = ['Stage I', 'Stage II', 'Stage III']
sample_mask = np.isin(y, valid_stages)
X, y = X[sample_mask], y[sample_mask]

print("after filtering:", len(X))
print("remaining counts:", np.unique(y, return_counts=True))

# NOTE(review): the relabelled column below folds "Other" into "Stage III",
# but X / y above were filtered on the raw Stage_main labels, so "Other"
# samples are absent from X / y. The counts printed from here on describe df,
# not the arrays handed to train_test_split -- confirm which labelling is the
# intended one.
df["Stage_for_ML"] = df["Stage_main"].replace("Other", "Stage III")
print("\nStage counts after adjusting labels:")
print(df["Stage_for_ML"].value_counts())

in_valid_stage = df["Stage_for_ML"].isin(valid_stages)
df = df.loc[in_valid_stage]
print("\nFinal stage counts used for training:")
print(df["Stage_for_ML"].value_counts())

# Hold out 20% of the samples for testing, stratified on the stage label and
# seeded so the split is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Class counts of the training split, then of the test split.
for heading, labels in (("BEFORE SMOTE:", y_train), ("distribution:", y_test)):
    print(heading)
    print(pd.Series(labels).value_counts())



# Per-class sample counts in the training split.
orig_counts = Counter(y_train)
print("\nOriginal class breakdown:", orig_counts)

# Oversample "Stage II" and "Stage III" up to the size of the largest class;
# every other class keeps its original count. The largest size is stored under
# its own name so the builtin max() is not shadowed (re-running the cell with
# `max` rebound to an int would fail).
largest_class_size = max(orig_counts.values())
oversampled_stages = {"Stage II", "Stage III"}
smote_strategy = {
    label: largest_class_size if label in oversampled_stages else count
    for label, count in orig_counts.items()
}

print("SMOTE:", smote_strategy)

sm = SMOTE(sampling_strategy=smote_strategy, random_state=42, k_neighbors=3)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

print("AFTER SMOTE:")
print(pd.Series(y_train_bal).value_counts())

# A second SMOTE with exactly the same parameters and seed used to be fitted
# here; it reproduces the result above, so the existing objects are reused and
# the names the rest of the script relies on are kept as aliases.
smote = sm
X_train_balanced, y_train_balanced = X_train_bal, y_train_bal

print("\nAfter SMOTE resampling:")
print(pd.Series(y_train_balanced).value_counts())


display(df.head())

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Using some classic ML models -- nothing fancy, just testing things out.
# LogisticRegression is created without `multi_class`: the argument is
# deprecated in recent scikit-learn releases, and the default solver already
# fits a multinomial model when there are three or more classes.
ml_models = {
    "LogisticRegression": LogisticRegression(max_iter=5000),
    "LinearSVM": LinearSVC(),
    "RBF_SVM": SVC(kernel="rbf", probability=False),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(),
}


# Fit each candidate model on the SMOTE-balanced training data and report its
# predictions on the test split.
for label, model in ml_models.items():
    print(f"Training model: {label}")
    y_predicted = model.fit(X_train_balanced, y_train_balanced).predict(X_test)

    for heading, scorer in (
        ("Confusion Matrix:", confusion_matrix),
        ("\nClassification Report:", classification_report),
    ):
        print(heading)
        print(scorer(y_test, y_predicted))



from imblearn.over_sampling import BorderlineSMOTE

print("Borderline-SMOTE")

# Split the stage-filtered feature matrix / labels built earlier in the script
# (X, y). The names previously used here, X_vals / y_labels, are never defined
# and raised a NameError.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample only the training split; the test split stays untouched.
bsm = BorderlineSMOTE(random_state=42)
X_tr_resampled, y_tr_resampled = bsm.fit_resample(X_tr, y_tr)


# Refit every model on the Borderline-SMOTE data and report test performance.
for name, model in ml_models.items():
    model.fit(X_tr_resampled, y_tr_resampled)
    preds = model.predict(X_te)
    print("Confusion Matrix:")
    print(confusion_matrix(y_te, preds))
    print("Report:")
    print(classification_report(y_te, preds))

from imblearn.over_sampling import ADASYN

# Oversample the current training split (X_tr, y_tr) with ADASYN.
adasyn = ADASYN(random_state=42)
X_tr_res, y_tr_res = adasyn.fit_resample(X_tr, y_tr)

print("ADASYN Sampling")
# Refit each model on the ADASYN data and score it on the held-out split.
for model in ml_models.values():
    y_pred = model.fit(X_tr_res, y_tr_res).predict(X_te)
    print("Confusion Matrix:", confusion_matrix(y_te, y_pred), sep="\n")
    print("Classification Report:", classification_report(y_te, y_pred), sep="\n")


from imblearn.combine import SMOTEENN

# Resample the current training split (X_tr, y_tr) with the combined
# SMOTE + ENN sampler.
smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X_tr, y_tr)

print("SMOTEENN Combination")
# Refit each model on the resampled data and report on the held-out split.
for model in ml_models.values():
    fitted = model.fit(X_res, y_res)
    y_pred = fitted.predict(X_te)
    matrix = confusion_matrix(y_te, y_pred)
    report = classification_report(y_te, y_pred)
    print("Confusion Matrix:")
    print(matrix)
    print("Report:")
    print(report)
from lime.lime_tabular import LimeTabularExplainer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

non_gene_cols = [
    "Sample_ID_full", "Sample_short", "Stage_raw",
    "Stage_main", "Patient_ID", "Diagnosis_Age"
]

# Build the feature frame from df. df also carries the derived string column
# "Stage_for_ML" added earlier in the script; it is a label, not a gene, so it
# is dropped as well. Left in place it would leak the target into the features
# and make the resampler fail on non-numeric data. errors="ignore" keeps this
# working when the column is absent.
X_lime = df.drop(columns=non_gene_cols).drop(
    columns=["Stage_for_ML"], errors="ignore"
)
y_lime = df["Stage_main"]

# Keep only the three main stages.
keep_mask = y_lime.isin(["Stage I", "Stage II", "Stage III"])
X_lime = X_lime.loc[keep_mask]
y_lime = y_lime.loc[keep_mask]

X_tr, X_te, y_tr, y_te = train_test_split(
    X_lime, y_lime, test_size=0.2, stratify=y_lime, random_state=42
)

# Encode the stage strings as integers; the encoder is fitted on the training
# labels only and reused for the test labels.
encoder = LabelEncoder()
y_tr_enc = encoder.fit_transform(y_tr)
y_te_enc = encoder.transform(y_te)

# Balance the training split only.
smote = BorderlineSMOTE(random_state=42, k_neighbors=3)
X_bal, y_bal = smote.fit_resample(X_tr.values, y_tr_enc)

# `multi_class` is omitted: it is deprecated in recent scikit-learn releases
# and the default solver already fits a multinomial model for 3+ classes.
clf = LogisticRegression(max_iter=5000)
clf.fit(X_bal, y_bal)

# Tabular LIME explainer built on the balanced training matrix, with gene
# names as feature names and the encoder's classes as class names.
feature_labels = X_lime.columns.tolist()
class_labels = encoder.classes_.tolist()
lime = LimeTabularExplainer(
    training_data=X_bal,
    mode="classification",
    feature_names=feature_labels,
    class_names=class_labels,
    discretize_continuous=True,
)

# Explain the prediction for one test sample (row i of the test split).
i = 0
test_instance = X_te.values[i]
explanation = lime.explain_instance(
    test_instance,
    clf.predict_proba,
    num_features=200,
)
explanation.show_in_notebook(show_table=True)
import gseapy as gp

# Per-gene importance from the fitted coefficients: with a single coefficient
# row use its magnitudes directly, otherwise take the largest magnitude across
# the rows for each gene.
coef_magnitudes = np.abs(clf.coef_)
importance_scores = (
    coef_magnitudes[0]
    if coef_magnitudes.shape[0] == 1
    else coef_magnitudes.max(axis=0)
)

gene_names = X_lime.columns
importance_df = pd.DataFrame({'gene': gene_names, 'importance': importance_scores})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# The 500 highest-ranked genes feed the enrichment analysis.
top_genes = importance_df['gene'].head(500).tolist()
print("top genes", top_genes[:5])

# pyplot is used below but was never imported anywhere in the script.
import matplotlib.pyplot as plt

# Query Enrichr with the top-ranked genes against the KEGG and GO Biological
# Process libraries.
enr = gp.enrichr(
    gene_list=top_genes,
    gene_sets=['KEGG_2021_Human', 'GO_Biological_Process_2021'],
    organism='Human',
)


enriched = enr.results
# Keep terms that pass the adjusted p-value cut-off, then sort before taking
# the top ten so the selection does not depend on the order in which the
# per-library results were concatenated.
significant = (
    enriched[enriched['Adjusted P-value'] < 0.05]
    .sort_values('Adjusted P-value')
    .head(10)
)

if not significant.empty:
    plt.figure(figsize=(10, 6))
    plt.barh(significant['Term'], -np.log10(significant['Adjusted P-value']), color='skyblue')
    plt.xlabel('-log10(Adjusted P-value)')
    plt.title('Top Enriched Pathways')
    # Most significant term at the top of the chart.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()





