In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
## Load the raw mRNA and methylation tables (no preprocessing, no feature exclusion)
# Folder that holds the raw omics exports.
TEST_TRAIN_DIR = '/Users/mac1/Desktop/Test_Train'

# Integrated cohort: patients that have both methylation and mRNA
# (n=1417 according to the original note).
methy_sub = pd.read_csv(f'{TEST_TRAIN_DIR}/methylation_sub.csv')
mRNA_sub = pd.read_csv(f'{TEST_TRAIN_DIR}/mRNA_sub.csv')

# Overall cohort: mRNA only (n=1980 according to the original note).
mRNA_genetic = pd.read_csv(f'{TEST_TRAIN_DIR}/mRNA_genetic.csv')

In [5]:
## Patient-ID series that define each cohort
keep_ids_overall = mRNA_genetic['PATIENT_ID']    # overall cohort
keep_ids_integrate = methy_sub['PATIENT_ID']     # integrated cohort

## Feature names of each omics table: every column except the patient ID
methy_features = [col for col in methy_sub.columns if col != "PATIENT_ID"]
mRNA_features = [col for col in mRNA_sub.columns if col != "PATIENT_ID"]

In [6]:
## Clinical features (overall cohort)
# Read the clinical table; its shape is shown as the cell output.
clinical_csv = "/Users/mac1/Desktop/DATAFEST/clinical_imputed_1.csv"
clinical_impute_overall = pd.read_csv(clinical_csv)
clinical_impute_overall.shape

(1980, 29)

In [7]:
## Clinical features (integrated cohort)
# Keep only the rows whose PATIENT_ID belongs to the integrated cohort;
# .copy() detaches the subset from the overall table.
clinical_in_integrated = clinical_impute_overall['PATIENT_ID'].isin(keep_ids_integrate)
clinical_impute_integrate = clinical_impute_overall.loc[clinical_in_integrated].copy()
clinical_impute_integrate.shape

(1417, 29)

In [8]:
## GWAS-selected features (overall cohort)
# The first CSV column is consumed as the row index.
gwas = pd.read_csv(
    '/Users/mac1/Desktop/DATAFEST/genomics_selected_gwas_ensembl.csv',
    index_col=0,
)
# Restrict to patients of the overall cohort; .copy() detaches the subset from `gwas`.
gwas_in_overall = gwas['PATIENT_ID'].isin(keep_ids_overall)
gwas_overall = gwas.loc[gwas_in_overall].copy()

In [9]:
# Methylation columns present in the GWAS-selected set for the overall cohort (may not be used).
# The displayed column count includes PATIENT_ID.
methy_cols_overall = gwas_overall.columns.intersection(methy_features)
gwas_overall_methy = gwas_overall[["PATIENT_ID", *methy_cols_overall]]
gwas_overall_methy.shape

(1980, 99)

In [10]:
# mRNA columns present in the GWAS-selected set for the overall cohort.
# The displayed column count includes PATIENT_ID.
mRNA_cols_overall = gwas_overall.columns.intersection(mRNA_features)
gwas_overall_mRNA = gwas_overall[["PATIENT_ID", *mRNA_cols_overall]]
gwas_overall_mRNA.shape

(1980, 125)

In [11]:
## GWAS-selected features (integrated cohort)
# Keep only the patients of the integrated cohort; .copy() detaches the subset from `gwas`.
gwas_in_integrated = gwas['PATIENT_ID'].isin(keep_ids_integrate)
gwas_integrate = gwas.loc[gwas_in_integrated].copy()
gwas_integrate.shape

(1417, 224)

In [12]:
# Methylation columns of the GWAS-selected set, integrated cohort (PATIENT_ID kept as first column).
methy_cols_integrate = gwas_integrate.columns.intersection(methy_features)
gwas_integrate_methy = gwas_integrate[["PATIENT_ID", *methy_cols_integrate]]
gwas_integrate_methy.shape

(1417, 99)

In [13]:
# mRNA columns of the GWAS-selected set, integrated cohort (PATIENT_ID kept as first column).
mRNA_cols_integrate = gwas_integrate.columns.intersection(mRNA_features)
gwas_integrate_mRNA = gwas_integrate[["PATIENT_ID", *mRNA_cols_integrate]]
gwas_integrate_mRNA.shape

(1417, 125)

In [14]:
## Missingness of the GWAS-selected features before imputation
# Boolean masks of missing cells. Both frames still carry PATIENT_ID, so that
# column is part of the per-column statistics below.
mRNA_is_missing = gwas_overall_mRNA.isna()
meth_is_missing = gwas_integrate_methy.isna()

# Per-column number of missing values.
nan_counts_mRNA = mRNA_is_missing.sum()
nan_counts_meth = meth_is_missing.sum()

# Per-column share of missing values, in percent.
missing_pct_mRNA = mRNA_is_missing.mean() * 100
missing_pct_meth = meth_is_missing.mean() * 100

# How many columns contain at least one missing value.
n_cols_with_nan_mRNA = (nan_counts_mRNA > 0).sum()
n_cols_with_nan_meth = (nan_counts_meth > 0).sum()
print("Columns with NaN in mRNA in GWAS overall:", n_cols_with_nan_mRNA)
print("Columns with NaN in methylation in GWAS integrated:", n_cols_with_nan_meth)

# Distribution of the per-column missing percentages.
print("Percentage of missing in mRNA in GWAS overall", missing_pct_mRNA.describe())
print("Percentage of missing in methylation in GWAS integrated", missing_pct_meth.describe())
# The missingness level is judged acceptable, so the notebook proceeds to KNN imputation.

Columns with NaN in mRNA in GWAS overall: 0
Columns with NaN in methylation in GWAS integrated: 75
Percentage of missing in mRNA in GWAS overall count    125.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
Percentage of missing in methylation in GWAS integrated count    99.000000
mean      3.758830
std       6.782153
min       0.000000
25%       0.070572
50%       0.423430
75%       3.705011
max      28.863797
dtype: float64


In [15]:
## Data-driven feature lists
# Each importance file is read in full and the first N entries of its
# "feature_name" column are kept (assumes the files are already sorted by
# importance -- TODO confirm).
IMPORTANCE_DIR = '/Users/mac1/Desktop/DATAFEST'
case1_importance_csv = f'{IMPORTANCE_DIR}/strategy_3_Importance_Cohort1_Case1_SeparateCox.csv'
case2_importance_csv = f'{IMPORTANCE_DIR}/strategy_3_Importance_Cohort1_Case2_CombinedCox.csv'

# Integrated cohort, case 1: first 200 features.
var_data_driven_integrate_1 = pd.read_csv(case1_importance_csv)
top200_features_1 = var_data_driven_integrate_1["feature_name"].iloc[:200].tolist()

# Integrated cohort, case 2: first 300 features.
var_data_driven_integrate_2 = pd.read_csv(case2_importance_csv)
top300_features_2 = var_data_driven_integrate_2["feature_name"].iloc[:300].tolist()

# Overall cohort: first 100 features.
# NOTE(review): this reads the same Case2 (integrated, combined Cox) file as
# above, so the overall-cohort list is just the first 100 rows of the case-2
# ranking. Looks like a copy-paste slip -- confirm whether a dedicated
# overall-cohort importance file should be used here.
var_data_driven_overall = pd.read_csv(case2_importance_csv)
top100_features_3 = var_data_driven_overall["feature_name"].iloc[:100].tolist()

In [16]:
## Data-driven datasets
# Integrated cohort: inner join, so only patients present in BOTH the
# methylation and the mRNA table are kept.
genetic_sub = methy_sub.merge(mRNA_sub, on="PATIENT_ID", how="inner")

# Integrated cohort, case 1: PATIENT_ID plus the ranked features found in genetic_sub.
data_driven_integrate_1 = genetic_sub[["PATIENT_ID"] + list(genetic_sub.columns.intersection(top200_features_1))]

# Integrated cohort, case 2.
data_driven_integrate_2 = genetic_sub[["PATIENT_ID"] + list(genetic_sub.columns.intersection(top300_features_2))]

# Overall cohort: slice the overall mRNA table, not genetic_sub.
# genetic_sub only holds integrated-cohort patients, so taking the overall
# features from it made the later inner merges on PATIENT_ID shrink the
# "overall" cohort to the integrated patients. Ranked features that are not
# columns of mRNA_genetic (e.g. methylation probes) are skipped by the
# intersection, which matches the mRNA-only definition of this cohort.
data_driven_overall = mRNA_genetic[["PATIENT_ID"] + list(mRNA_genetic.columns.intersection(top100_features_3))]

In [17]:
ID_COL = "PATIENT_ID"


def drop_overlap_keep_gwas(gwas_df, dd_df, id_col=ID_COL):
    """Remove from the data-driven frame every column that the GWAS frame also has.

    Parameters
    ----------
    gwas_df : DataFrame whose columns take precedence.
    dd_df : data-driven DataFrame to be trimmed.
    id_col : name of the identifier column, which is never dropped.

    Returns
    -------
    `dd_df` itself when nothing overlaps, otherwise a new frame without the
    overlapping columns. A message with the number of dropped columns is
    printed in the latter case.
    """
    gwas_columns = set(gwas_df.columns)
    # dict.fromkeys collapses repeated labels so each name is counted once.
    shared = [
        col for col in dict.fromkeys(dd_df.columns)
        if col != id_col and col in gwas_columns
    ]
    if not shared:
        return dd_df
    print(f"Dropping {len(shared)} overlapping columns from data-driven set (keeping GWAS).")
    return dd_df.drop(columns=shared)

# Remove from each data-driven set the columns already supplied by the GWAS
# frame it will later be merged with (the GWAS copy wins).

# Integrated cohort, case 1.
data_driven_integrate_1 = drop_overlap_keep_gwas(
    gwas_df=gwas_integrate, dd_df=data_driven_integrate_1
)

# Integrated cohort, case 2.
data_driven_integrate_2 = drop_overlap_keep_gwas(
    gwas_df=gwas_integrate, dd_df=data_driven_integrate_2
)

# Overall cohort: compared against the mRNA-only GWAS frame.
data_driven_overall = drop_overlap_keep_gwas(
    gwas_df=gwas_overall_mRNA, dd_df=data_driven_overall
)

Dropping 3 overlapping columns from data-driven set (keeping GWAS).
Dropping 4 overlapping columns from data-driven set (keeping GWAS).
Dropping 2 overlapping columns from data-driven set (keeping GWAS).


In [18]:
## Combine clinical + GWAS + data-driven features (inner joins on PATIENT_ID)
# Integrated cohort: the clinical+GWAS part is shared by both cases.
clinical_gwas_integrate = clinical_impute_integrate.merge(
    gwas_integrate, on="PATIENT_ID", how="inner"
)
# Case 1: add the case-1 data-driven features.
data_all_integrate_1 = clinical_gwas_integrate.merge(
    data_driven_integrate_1, on="PATIENT_ID", how="inner"
)
# Case 2: add the case-2 data-driven features.
data_all_integrate_2 = clinical_gwas_integrate.merge(
    data_driven_integrate_2, on="PATIENT_ID", how="inner"
)

# Overall cohort: clinical + GWAS (mRNA only) + data-driven.
# NOTE(review): both merges are inner joins, so only patients present in every
# input survive. Check data_all_overall.shape against the intended cohort size.
clinical_gwas_overall = clinical_impute_overall.merge(
    gwas_overall_mRNA, on="PATIENT_ID", how="inner"
)
data_all_overall = clinical_gwas_overall.merge(
    data_driven_overall, on="PATIENT_ID", how="inner"
)

In [24]:
## Split every combined dataset into training (70%) and testing (30%)
_NON_FEATURE_COLS = ['PATIENT_ID', 'OS_MONTHS', 'OS_STATUS']
_OUTCOME_COLS = ['OS_MONTHS', 'OS_STATUS']


def _stratified_split(data):
    """Return (train, test): a 70/30 split stratified on OS_STATUS with a fixed seed."""
    return train_test_split(
        data,
        test_size=0.30,
        stratify=data['OS_STATUS'],
        random_state=123,
    )


def _features_and_outcome(part):
    """Return (X, y): X drops the ID and outcome columns, y keeps the two outcome columns."""
    return part.drop(columns=_NON_FEATURE_COLS), part[_OUTCOME_COLS]


# Integrated cohort, case 1.
train_all_integrate_1, test_all_integrate_1 = _stratified_split(data_all_integrate_1)
X_train_integrate_1, y_train_integrate_1 = _features_and_outcome(train_all_integrate_1)
X_test_integrate_1, y_test_integrate_1 = _features_and_outcome(test_all_integrate_1)

# Integrated cohort, case 2.
train_all_integrate_2, test_all_integrate_2 = _stratified_split(data_all_integrate_2)
X_train_integrate_2, y_train_integrate_2 = _features_and_outcome(train_all_integrate_2)
X_test_integrate_2, y_test_integrate_2 = _features_and_outcome(test_all_integrate_2)

# Overall cohort.
train_all_overall, test_all_overall = _stratified_split(data_all_overall)
X_train_overall, y_train_overall = _features_and_outcome(train_all_overall)
X_test_overall, y_test_overall = _features_and_outcome(test_all_overall)

In [25]:
## KNN imputation of the genetic block (standardise first, fit on TRAIN only)
# Integrated cohort, case 1.
# The first 26 feature columns are treated as clinical, everything after as genetic.
# NOTE(review): 26 presumably equals the 29 clinical columns minus PATIENT_ID,
# OS_MONTHS and OS_STATUS, and relies on the clinical table being the left
# side of the merge -- confirm if the clinical table changes.
n_clinical = 26
Xc_train_integrate_1 = X_train_integrate_1.iloc[:, :n_clinical]
Xg_train_integrate_1 = X_train_integrate_1.iloc[:, n_clinical:]
Xc_test_integrate_1 = X_test_integrate_1.iloc[:, :n_clinical]
Xg_test_integrate_1 = X_test_integrate_1.iloc[:, n_clinical:]

# Standardise the genetic block; the scaler is fitted on TRAIN and reused for TEST.
scaler = StandardScaler()
Xg_train_integrate_1_scaled = scaler.fit_transform(Xg_train_integrate_1)
Xg_test_integrate_1_scaled = scaler.transform(Xg_test_integrate_1)

# Distance-weighted 5-nearest-neighbour imputer, fitted on the scaled TRAIN block.
imputer = KNNImputer(n_neighbors=5, weights="distance", metric="nan_euclidean")
# The imputer returns a bare array, so index and column labels are re-attached.
Xg_train_integrate_1_imp = pd.DataFrame(
    imputer.fit_transform(Xg_train_integrate_1_scaled),
    index=Xg_train_integrate_1.index,
    columns=Xg_train_integrate_1.columns,
)
Xg_test_integrate_1_imp = pd.DataFrame(
    imputer.transform(Xg_test_integrate_1_scaled),
    index=Xg_test_integrate_1.index,
    columns=Xg_test_integrate_1.columns,
)

# Put the untouched clinical block back in front; the genetic values stay on the standardised scale.
X_train_integrate_1_imp = pd.concat([Xc_train_integrate_1, Xg_train_integrate_1_imp], axis=1)
X_test_integrate_1_imp = pd.concat([Xc_test_integrate_1, Xg_test_integrate_1_imp], axis=1)

In [26]:
## KNN imputation of the genetic block (standardise first, fit on TRAIN only)
# Integrated cohort, case 2.
# The first 26 feature columns are treated as clinical, everything after as genetic.
# NOTE(review): 26 presumably equals the 29 clinical columns minus PATIENT_ID,
# OS_MONTHS and OS_STATUS, and relies on the clinical table being the left
# side of the merge -- confirm if the clinical table changes.
n_clinical = 26
Xc_train_integrate_2 = X_train_integrate_2.iloc[:, :n_clinical]
Xg_train_integrate_2 = X_train_integrate_2.iloc[:, n_clinical:]
Xc_test_integrate_2 = X_test_integrate_2.iloc[:, :n_clinical]
Xg_test_integrate_2 = X_test_integrate_2.iloc[:, n_clinical:]

# Standardise the genetic block; the scaler is fitted on TRAIN and reused for TEST.
scaler = StandardScaler()
Xg_train_integrate_2_scaled = scaler.fit_transform(Xg_train_integrate_2)
Xg_test_integrate_2_scaled = scaler.transform(Xg_test_integrate_2)

# Distance-weighted 5-nearest-neighbour imputer, fitted on the scaled TRAIN block.
imputer = KNNImputer(n_neighbors=5, weights="distance", metric="nan_euclidean")
# The imputer returns a bare array, so index and column labels are re-attached.
Xg_train_integrate_2_imp = pd.DataFrame(
    imputer.fit_transform(Xg_train_integrate_2_scaled),
    index=Xg_train_integrate_2.index,
    columns=Xg_train_integrate_2.columns,
)
Xg_test_integrate_2_imp = pd.DataFrame(
    imputer.transform(Xg_test_integrate_2_scaled),
    index=Xg_test_integrate_2.index,
    columns=Xg_test_integrate_2.columns,
)

# Put the untouched clinical block back in front; the genetic values stay on the standardised scale.
X_train_integrate_2_imp = pd.concat([Xc_train_integrate_2, Xg_train_integrate_2_imp], axis=1)
X_test_integrate_2_imp = pd.concat([Xc_test_integrate_2, Xg_test_integrate_2_imp], axis=1)

In [27]:
## KNN imputation of the genetic block (standardise first, fit on TRAIN only)
# Overall cohort.
# The first 26 feature columns are treated as clinical, everything after as genetic.
# NOTE(review): 26 presumably equals the 29 clinical columns minus PATIENT_ID,
# OS_MONTHS and OS_STATUS, and relies on the clinical table being the left
# side of the merge -- confirm if the clinical table changes.
n_clinical = 26
Xc_train_overall = X_train_overall.iloc[:, :n_clinical]
Xg_train_overall = X_train_overall.iloc[:, n_clinical:]
Xc_test_overall = X_test_overall.iloc[:, :n_clinical]
Xg_test_overall = X_test_overall.iloc[:, n_clinical:]

# Standardise the genetic block; the scaler is fitted on TRAIN and reused for TEST.
scaler = StandardScaler()
Xg_train_overall_scaled = scaler.fit_transform(Xg_train_overall)
Xg_test_overall_scaled = scaler.transform(Xg_test_overall)

# Distance-weighted 5-nearest-neighbour imputer, fitted on the scaled TRAIN block.
imputer = KNNImputer(n_neighbors=5, weights="distance", metric="nan_euclidean")
# The imputer returns a bare array, so index and column labels are re-attached.
Xg_train_overall_imp = pd.DataFrame(
    imputer.fit_transform(Xg_train_overall_scaled),
    index=Xg_train_overall.index,
    columns=Xg_train_overall.columns,
)
Xg_test_overall_imp = pd.DataFrame(
    imputer.transform(Xg_test_overall_scaled),
    index=Xg_test_overall.index,
    columns=Xg_test_overall.columns,
)

# Put the untouched clinical block back in front; the genetic values stay on the standardised scale.
X_train_overall_imp = pd.concat([Xc_train_overall, Xg_train_overall_imp], axis=1)
X_test_overall_imp = pd.concat([Xc_test_overall, Xg_test_overall_imp], axis=1)


In [82]:
## Write every model-ready dataset to CSV
# Each frame is saved as <OUTPUT_DIR>/<name>.csv without its index. The patient
# ID was dropped from X/y earlier, so rows are identified by position only.
OUTPUT_DIR = "/Users/mac1/Desktop"
ready_datasets = {
    "X_train_integrate_1_imp": X_train_integrate_1_imp,
    "X_test_integrate_1_imp": X_test_integrate_1_imp,
    "y_train_integrate_1": y_train_integrate_1,
    "y_test_integrate_1": y_test_integrate_1,
    "X_train_integrate_2_imp": X_train_integrate_2_imp,
    "X_test_integrate_2_imp": X_test_integrate_2_imp,
    "y_train_integrate_2": y_train_integrate_2,
    "y_test_integrate_2": y_test_integrate_2,
    "X_train_overall_imp": X_train_overall_imp,
    "X_test_overall_imp": X_test_overall_imp,
    "y_train_overall": y_train_overall,
    "y_test_overall": y_test_overall,
}
for file_stem, frame in ready_datasets.items():
    frame.to_csv(f"{OUTPUT_DIR}/{file_stem}.csv", index=False)

In [81]:
## Save the patient-ID list of each cohort
# Each ID series becomes a one-column frame named PATIENT_ID and is written without the index.
cohort_id_lists = {
    "keep_ids_overall": keep_ids_overall,
    "keep_ids_integrate": keep_ids_integrate,
}
for list_name, id_series in cohort_id_lists.items():
    id_frame = id_series.to_frame(name="PATIENT_ID")
    id_frame.to_csv(f"/Users/mac1/Desktop/{list_name}.csv", index=False)