In [32]:
from scipy.stats import ranksums
import pandas as pd
import itertools

## Utils and Library for notebook
from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit

# Root directory holding the project's input data (relative to this notebook)
DATA_PATH = '../data/'

# --- Data loading ---
# Table from an earlier step; its index is later matched against MSigDB
# signature names to pick the signatures of interest.
df = pd.read_csv(
    "resultFiles/Ranksum.RFECV.CD8.csv",
    engine='c',
    index_col=0,
)

# Per-sample annotation table.
metadata_csv = DATA_PATH+'annotation_metadata/EPIC_HCvB_metadata_baseline_updated-share.csv'
meta_data = pd.read_csv(metadata_csv)

# Two groups of sample names keyed by 'HCVB_ID'.
# NOTE(review): presumably split on 'DiseaseDuration' at a cutoff of 50
# (long vs. short) - confirm against the toolkit implementation.
sample_groups = exttoolkit.get_sample_name_by_contValues(
    meta_data, 'HCVB_ID', 'DiseaseDuration', 50
)
longDD_samples, shortDD_samples = sample_groups

In [33]:
# GMT file with the MSigDB signatures. Built from DATA_PATH instead of a
# machine-specific absolute path so the notebook runs on any checkout.
# NOTE(review): assumes DATA_PATH resolves to the same data directory the former
# absolute path (.../multiple_sclerosis_proj/data/) pointed at - confirm.
MSIGDB_PATH = DATA_PATH + "MsigDB_list/msigdb.v7.4.entrez.gmt" #GMT file

# Each parsed record is [signature_name, gene_1, gene_2, ...]
gmt_arr = [] # gmt parsing array
with open(MSIGDB_PATH, 'r') as infile:
    for line in infile:
        stripped = line.strip()
        if not stripped:
            # A blank line would otherwise yield a record with an empty name.
            continue
        gmt_value = stripped.split("\t") # tab-separated columns
        sig_names = gmt_value[0] # column 1: signature name
        gene_list = gmt_value[2:] # columns 3+: genes (column 2 is skipped)
        gmt_arr.append([sig_names]+gene_list)

In [34]:
# Unique genes belonging to any signature whose name appears in df's index.
# The index is turned into a set once, so each membership test is O(1)
# instead of rebuilding and scanning a list for every GMT record.
selected_signatures = set(df.index.tolist())
gmt_ext_arr = list({
    gene
    for sig_name, *sig_genes in gmt_arr
    if sig_name in selected_signatures
    for gene in sig_genes
}) # duplicates removed by the set; element order is arbitrary

In [35]:
def _ordered_overlap(reference, candidates):
    """Return the unique items of `reference` that also occur in `candidates`.

    Items keep the order in which they first appear in `reference`, so the
    result is deterministic (unlike iterating over a set intersection).
    """
    wanted = set(candidates)
    return list(dict.fromkeys(item for item in reference if item in wanted))


print(len(gmt_ext_arr))
df_expr = pd.read_csv(DATA_PATH+"counts_normalized/IDConvertedFiles/counts_vst_CD8.csv", engine='c', index_col=0) # get expr

# Genes present both in the expression matrix and in the selected signatures,
# kept in expression-file order so downstream output has a reproducible row order.
# NOTE(review): GMT gene IDs are parsed as strings; if the expression index is
# read as integers nothing will match - confirm the index dtype.
gene_intersected = _ordered_overlap(df_expr.index.tolist(), gmt_ext_arr)
df_expr = df_expr.loc[gene_intersected] # selected expr only

# Restrict both sample groups to columns that exist in the expression matrix.
# pd.Index(...) accepts the original pandas object as well as the plain list
# this cell produces, so re-running the cell no longer fails on `.values`.
expr_columns = df_expr.columns.tolist()
longDD_samples = _ordered_overlap(expr_columns, pd.Index(longDD_samples).tolist())
shortDD_samples = _ordered_overlap(expr_columns, pd.Index(shortDD_samples).tolist())

650


In [36]:
# Nominal significance threshold for the per-gene rank-sum test.
# NOTE(review): p-values are not corrected for multiple testing - confirm this is intended.
P_VALUE_CUTOFF = 0.05

# Slice each sample group once; the original re-sliced the whole frame per gene.
long_expr = df_expr[longDD_samples]
short_expr = df_expr[shortDD_samples]

# Per-gene difference of group means (short - long); pandas' mean skips NaN.
mean_diff = short_expr.mean(axis=1).values - long_expr.mean(axis=1).values

significant_list = []
# Iterate row-wise by position so a duplicated gene ID is tested per row
# rather than yielding a 2-D selection from `.loc`.
for gene, long_values, short_values, fc in zip(
    df_expr.index.tolist(), long_expr.values, short_expr.values, mean_diff
):
    _stat, p = ranksums(long_values, short_values) # Wilcoxon rank-sum, long vs short

    if p < P_VALUE_CUTOFF:
        significant_list.append([gene, fc, p]) # sig list

sig_df = pd.DataFrame(significant_list, columns=["Names", "fc", "pval"])

In [37]:
# Write the expression rows of the significant genes to the result file.
significant_genes = sig_df['Names'].values.tolist()
significant_expr = df_expr.loc[significant_genes]
significant_expr.to_csv("resultFiles/Ranksum.CD8.gene.csv")