In [None]:
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np

# --- Load data ---------------------------------------------------------------
# The expression table is read with its first column as the index and then
# transposed, so each row of X is one column of the file and each column of X
# is one index entry of the file (later reported under the "Gene" heading).
tpm_df = pd.read_csv("pnas_tpm_96_nodup.txt", sep="\t", index_col=0)
meta_df = pd.read_csv("pnas_patient_info.csv")

X = tpm_df.T

# Binary target: "N" -> 0, "R" -> 1. Any other value would silently become NaN,
# so fail early with an explicit message instead of deep inside scikit-learn.
y = meta_df["recurStatus"].map({"N": 0, "R": 1})
if y.isna().any():
    raise ValueError(
        "recurStatus contains values other than 'N'/'R' (or missing values); "
        "cannot build a binary target."
    )

# NOTE(review): rows of X and rows of meta_df are paired purely by position.
# Confirm that both files list the samples in the same order.
X = X.iloc[:len(y), :]
if len(X) != len(y):
    raise ValueError(
        f"Expected one label per sample, got {len(X)} expression rows "
        f"and {len(y)} labels."
    )

# --- Split, then scale -------------------------------------------------------
# Split BEFORE fitting the scaler: the mean/variance must be learned from the
# training rows only, otherwise information about the held-out rows leaks into
# the preprocessing and the test metrics become optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name stays available: the full matrix scaled with the
# training-set statistics.
X_scaled = scaler.transform(X)

# --- Fit LASSO ---------------------------------------------------------------
# LassoCV is a regression model; the 0/1 labels are regressed directly and the
# continuous prediction is thresholded at 0.5 below.
lasso = LassoCV(cv=10, random_state=0, max_iter=100000)
lasso.fit(X_train, y_train)

# --- Evaluate on the held-out split ------------------------------------------
y_pred = lasso.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print("\n🔍 Classification Report:")
print(classification_report(y_test, y_pred_binary, zero_division=0))

# The raw (unthresholded) predictions are used as ranking scores for the ROC.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}", color='darkorange')
ax.plot([0, 1], [0, 1], '--', color='gray')  # chance diagonal
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# --- Selected features -------------------------------------------------------
# Features with a non-zero coefficient are the ones LASSO kept.
coef = lasso.coef_
selected = coef != 0
selected_genes = X.columns[selected]
selected_weights = coef[selected]

# Rank the kept features by absolute coefficient, largest first.
lasso_df = pd.DataFrame({
    "Gene": selected_genes,
    "Weight": selected_weights
}).sort_values(by="Weight", key=abs, ascending=False)

K = 9
top_k = lasso_df.head(K)  # may hold fewer than K rows if fewer were selected
# NOTE(review): the file name says "top15" while K is 9. The path is left
# unchanged in case something downstream reads it; rename once confirmed safe.
top_k.to_csv("lasso_top15_genes.csv", index=False)
print(f"\n✅ Top {K} LASSO genes saved to lasso_top15_genes.csv")

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(top_k["Gene"], top_k["Weight"])
ax.tick_params(axis="x", labelrotation=90)
ax.set_title(f"Top {K} LASSO Selected Genes")
ax.set_ylabel("Weight")
fig.tight_layout()
plt.show()

# --- Label distribution summary (overall / train / test) ---------------------
print("\n📊 样本标签分布:")
print(y.value_counts())
print("\n✅ 总计:", y.shape[0], "个样本")
print("\n📊 训练集标签:")
print(y_train.value_counts())
print("\n📊 测试集标签:")
print(y_test.value_counts())

#Reference:
# https://chatgpt.com/  This code was debugged and modified with assistance from ChatGPT (an AI assistant).

In [None]:
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np

# --- Load data ---------------------------------------------------------------
# The expression table is read with its first column as the index and then
# transposed, so each row of X is one column of the file and each column of X
# is one index entry of the file (later reported under the "Gene" heading).
tpm_df = pd.read_csv("pnas_tpm_96_nodup.txt", sep="\t", index_col=0)
meta_df = pd.read_csv("pnas_patient_info.csv")

X = tpm_df.T

# Binary target: "N" -> 0, "R" -> 1. Any other value would silently become NaN,
# so fail early with an explicit message instead of deep inside scikit-learn.
y = meta_df["recurStatus"].map({"N": 0, "R": 1})
if y.isna().any():
    raise ValueError(
        "recurStatus contains values other than 'N'/'R' (or missing values); "
        "cannot build a binary target."
    )

# NOTE(review): rows of X and rows of meta_df are paired purely by position.
# Confirm that both files list the samples in the same order.
X = X.iloc[:len(y), :]
if len(X) != len(y):
    raise ValueError(
        f"Expected one label per sample, got {len(X)} expression rows "
        f"and {len(y)} labels."
    )

# --- Split, then scale -------------------------------------------------------
# Split BEFORE fitting the scaler: the mean/variance must be learned from the
# training rows only, otherwise information about the held-out rows leaks into
# the preprocessing and the test metrics become optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so the name stays available: the full matrix scaled with the
# training-set statistics.
X_scaled = scaler.transform(X)

# --- Fit weighted LASSO ------------------------------------------------------
# Training samples labelled 1 ("R") count three times as much as samples
# labelled 0 ("N") in the fit.
POSITIVE_CLASS_WEIGHT = 3.0
weights = np.where(y_train == 1, POSITIVE_CLASS_WEIGHT, 1.0)
# LassoCV is a regression model; the 0/1 labels are regressed directly and the
# continuous prediction is thresholded at 0.5 below.
lasso = LassoCV(cv=10, random_state=0, max_iter=100000)
lasso.fit(X_train, y_train, sample_weight=weights)

# --- Evaluate on the held-out split ------------------------------------------
y_pred = lasso.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print("\n🔍 Classification Report:")
print(classification_report(y_test, y_pred_binary, zero_division=0))

# The raw (unthresholded) predictions are used as ranking scores for the ROC.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}", color='darkorange')
ax.plot([0, 1], [0, 1], '--', color='gray')  # chance diagonal
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# --- Selected features -------------------------------------------------------
# Features with a non-zero coefficient are the ones LASSO kept.
coef = lasso.coef_
selected = coef != 0
selected_genes = X.columns[selected]
selected_weights = coef[selected]

# Rank the kept features by absolute coefficient, largest first.
lasso_df = pd.DataFrame({
    "Gene": selected_genes,
    "Weight": selected_weights
}).sort_values(by="Weight", key=abs, ascending=False)

K = 9
top_k = lasso_df.head(K)  # may hold fewer than K rows if fewer were selected
# NOTE(review): the file name says "top15" while K is 9, and this is the same
# path the unweighted cell writes to, so running both overwrites the earlier
# result. The path is left unchanged in case something downstream reads it.
top_k.to_csv("lasso_top15_genes.csv", index=False)
print(f"\n✅ Top {K} LASSO genes saved to lasso_top15_genes.csv")

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(top_k["Gene"], top_k["Weight"])
ax.tick_params(axis="x", labelrotation=45)
ax.set_title(f"Top {K} LASSO Selected Genes")
ax.set_ylabel("Weight")
fig.tight_layout()
plt.show()

# --- Label distribution summary (overall / train / test) ---------------------
print("\n📊 样本标签分布:")
print(y.value_counts())
print("\n✅ 总计:", y.shape[0], "个样本")
print("\n📊 训练集标签:")
print(y_train.value_counts())
print("\n📊 测试集标签:")
print(y_test.value_counts())

#Reference:
# https://chatgpt.com/  This code was debugged and modified with assistance from ChatGPT (an AI assistant).