In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import sklearn.metrics as metric
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from xgboost import XGBClassifier

In [4]:
fpkm = pd.read_csv("../datasets/datasets_processed/fpkm/toden_fpkm.txt", sep="\t", index_col=0)
tpm = pd.read_csv("../datasets/datasets_processed/tpm/toden_tpm.txt", sep="\t", index_col=0)
raw_count = pd.read_csv("../datasets/datasets_raw/toden/toden_counts.txt", sep="\t")
meta = pd.read_excel("../datasets/datasets_raw/toden/toden_metadata.xlsx")

In [5]:
# Convert categorical variable ['Apoe.status','apoe_carrier','apoe_dose']
share_feat = ['Apoe.status','apoe_carrier','apoe_dose']
status_dic = {'None': 0, 'E2/E3': 1, 'E2/E4': 2, 'E3/E3': 3, 'E3/E4': 4, 'E4/E4': 5}
carrier_dic = {'None': 0, 'no_apoe4': 0, 'apoe4': 1}
dose_dic = {'apoe4': 1, 'None': 0, 'apoe44': 2, 'no_apoe4': 0}
com_dic = [status_dic, carrier_dic, dose_dic]

meta_feat = meta[share_feat].copy()
for i in range(len(share_feat)):
    meta_feat[share_feat[i]] = meta_feat[share_feat[i]].map(com_dic[i])
meta_feat.fillna(0, inplace=True)

In [6]:
tab1 = pd.read_excel("../datasets/abb1654_data_file_s1.xlsx", sheet_name="Upregulated genes", header=1)
tab2 = pd.read_excel("../datasets/abb1654_data_file_s1.xlsx", sheet_name="Downregulated genes", header=1)
select_gene = list(set(list(tab1['ID'])+list(tab2['ID'])))

In [7]:
# Prepare training data (x)
df1 = raw_count.T
df2 = df1.copy()[list(set(df1.columns) & set(select_gene))]
df3 = df1.copy()
df4 = fpkm.copy()
df5 = fpkm.copy()[list(set(df4.columns) & set(select_gene))]
df6 = fpkm.copy()
df7 = tpm.copy()
df8 = tpm.copy()[list(set(df7.columns) & set(select_gene))]
df9 = tpm.copy()
for i in meta_feat.columns:
    df3[i] = meta_feat[i].values
    df6[i] = meta_feat[i].values
    df9[i] = meta_feat[i].values

train_input=['count', 'select-gene count', 'count+apoe', 'FPKM', 'select-gene FPKM',
             'FPKM+apoe', 'TPM', 'select-gene TPM', 'TPM+apoe']
train_data = [df1, df2, df3, df4, df5, df6, df7, df8, df9]

In [8]:
import warnings
warnings.filterwarnings('ignore')

# extract y
y = meta['Disease']
ma = {'NCI': 0, 'AD': 1}
for i in range(len(y)):
    y.iloc[i] = ma[y.iloc[i]]
y = y.astype(int)

In [9]:
norm_data = {}
for i in ["l1", "l2"]:
    norm_data[i] = []
    norm_data[i].append(pd.DataFrame(normalize(train_data[0], norm=i, axis=1)))
    norm_data[i].append(pd.DataFrame(normalize(train_data[1], norm=i, axis=1)))
    temp = normalize(train_data[2].iloc[:, :60675], norm=i, axis=1)
    temp_df = pd.DataFrame(temp, columns=train_data[2].columns[:60675], index=train_data[2].index)
    norm_data[i].append(pd.concat([temp_df, train_data[2].iloc[:, 60675:]], axis=1))

In [80]:
def regression_models(train_data, train_input, y, model_type='linear', norm=None):
    for i in range(len(train_data) if norm == None else 3):
        dTrain = train_data[i]

        if norm != None:
            dTrain = norm_data[norm][i]
        
        accuracy = []
        roc_score = []
        f1_score = []
        mcc_score = []
        recall = []
        
        for k, (trainInd, valInd) in enumerate(KFold(shuffle=True, random_state=16).split(dTrain)):
            XTrain = dTrain.iloc[trainInd,]
            yTrain = y.iloc[trainInd,]
            XVal = dTrain.iloc[valInd,]
            yVal = y.iloc[valInd,]

            if model_type == 'linear':
                model = LinearRegression()
            else:
                model = LogisticRegression(max_iter=1000)
                
            model.fit(XTrain, yTrain)
            pred = model.predict(XVal)
            pred_binary = (pred >= 0.5).astype(int)
            
            if model_type == 'linear':
                accuracy.append(metric.r2_score(yVal, pred_binary))
            else:
                accuracy.append(metric.accuracy_score(yVal, pred_binary))
                roc_score.append(metric.roc_auc_score(yVal, model.predict_proba(XVal)[:,1]))
                f1_score.append(metric.f1_score(yVal, pred_binary))
                mcc_score.append(metric.matthews_corrcoef(yVal, pred_binary))
                recall.append(metric.recall_score(yVal, pred_binary))
        
        print(f"Train: {train_input[i]}, mean val accuracy: {sum(accuracy)/len(accuracy)}")
        if model_type != 'linear':
            print(f"Train: {train_input[i]}, mean val AUC: {sum(roc_score)/len(roc_score)}")
            print(f"Train: {train_input[i]}, mean val F1: {sum(f1_score)/len(f1_score)}")
            print(f"Train: {train_input[i]}, mean val MCC: {sum(mcc_score)/len(mcc_score)}")
            print(f"Train: {train_input[i]}, mean val Recall: {sum(recall)/len(recall)}\n")

In [98]:
for i in ['linear', 'logistic']:
    print(f"\n{i} regression without normalization\n")
    regression_models(train_data, train_input, y, model_type=i)
    for j in ['l1', 'l2']:
        print(f"\n{i} regression with {j} Norm\n")
        regression_models(train_data, train_input, y, model_type=i, norm=j)


linear regression without normalization

Train: count, mean val accuracy: 0.19101626052344617
Train: select-gene count, mean val accuracy: -0.08850260257031248
Train: count+apoe, mean val accuracy: 0.19101626052344617
Train: FPKM, mean val accuracy: 0.2670362297528549
Train: select-gene FPKM, mean val accuracy: 0.026382306954680645
Train: FPKM+apoe, mean val accuracy: 0.2670362297528549
Train: TPM, mean val accuracy: 0.31719172993742484
Train: select-gene TPM, mean val accuracy: 0.025165616918085987
Train: TPM+apoe, mean val accuracy: 0.31719172993742484

linear regression with l1 Norm

Train: count, mean val accuracy: 0.3797535818071419
Train: select-gene count, mean val accuracy: 0.07285675512625456
Train: count+apoe, mean val accuracy: 0.37936273399796383

linear regression with l2 Norm

Train: count, mean val accuracy: -0.21119268505622074
Train: select-gene count, mean val accuracy: -0.04828015027768125
Train: count+apoe, mean val accuracy: -0.2856778431979441

logistic regressio

In [10]:
def xgboost_model(train_data, train_input, y, norm=None):
    for i in range(len(train_data) if norm is None else 3):
        dTrain = train_data[i]

        if norm is not None:
            dTrain = norm_data[norm][i]
        
        accuracy = []
        roc_score = []
        f1_score = []
        mcc_score = []
        recall = []
        
        for k, (trainInd, valInd) in enumerate(KFold(shuffle=True, random_state=16).split(dTrain)):
            XTrain = dTrain.iloc[trainInd]
            yTrain = y.iloc[trainInd]
            XVal = dTrain.iloc[valInd]
            yVal = y.iloc[valInd]

            model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
            model.fit(XTrain, yTrain)
            pred = model.predict(XVal)
            pred_binary = (pred >= 0.5).astype(int)
            
            accuracy.append(metric.accuracy_score(yVal, pred_binary))
            roc_score.append(metric.roc_auc_score(yVal, model.predict_proba(XVal)[:,1]))
            f1_score.append(metric.f1_score(yVal, pred_binary))
            mcc_score.append(metric.matthews_corrcoef(yVal, pred_binary))
            recall.append(metric.recall_score(yVal, pred_binary))
        
        print(f"Train: {train_input[i]}, mean val accuracy: {sum(accuracy)/len(accuracy)}")
        print(f"Train: {train_input[i]}, mean val AUC: {sum(roc_score)/len(roc_score)}")
        print(f"Train: {train_input[i]}, mean val F1: {sum(f1_score)/len(f1_score)}")
        print(f"Train: {train_input[i]}, mean val MCC: {sum(mcc_score)/len(mcc_score)}")
        print(f"Train: {train_input[i]}, mean val Recall: {sum(recall)/len(recall)}\n")

In [11]:
print("XGBoost model without normalization\n")
xgboost_model(train_data, train_input, y)
for i in ['l1', 'l2']:
    print(f"\nXGBoost model with {i} Norm\n")
    xgboost_model(train_data, train_input, y, norm=i)

XGBoost model without normalization

Train: count, mean val accuracy: 0.7907281772953415
Train: count, mean val AUC: 0.8771866913898844
Train: count, mean val F1: 0.7922590226402988
Train: count, mean val MCC: 0.5749331621963462
Train: count, mean val Recall: 0.7888052596071842

Train: select-gene count, mean val accuracy: 0.7369968340117594
Train: select-gene count, mean val AUC: 0.8489370992749627
Train: select-gene count, mean val F1: 0.7450930867180825
Train: select-gene count, mean val MCC: 0.47026240749433346
Train: select-gene count, mean val Recall: 0.7702022632094806

Train: count+apoe, mean val accuracy: 0.7906829488919042
Train: count+apoe, mean val AUC: 0.8748143380370198
Train: count+apoe, mean val F1: 0.7872796895725995
Train: count+apoe, mean val MCC: 0.5743782842125127
Train: count+apoe, mean val Recall: 0.7862093999142916

Train: FPKM, mean val accuracy: 0.7724559023066485
Train: FPKM, mean val AUC: 0.85503702453139
Train: FPKM, mean val F1: 0.7728591118933936
Train: F