In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from natsort import natsorted
import scipy.stats as stats
import networkx as nx
import os

In [2]:
# Get the list of all files and directories
path = ".\\1 - Output\datasets"
dir_list = os.listdir(path)
pathwaysNames=[]
for file in dir_list:
    if file not in ["BP","CombinedDF.csv"]:
        pathwaysNames.append(file[:-4])

In [3]:
cols = ['Age_Discrete', 'Sex', 'Smoking','SPY_Discrete', 'Stage', 'Status', 'TMB_Discrete','PatientID']
datasets={}
patientInPathways={}
allPatients=[]
for sp in pathwaysNames:
    sp = sp.split(".csv")[0]
    
    spName = sp.split(" - ")[0]
    cols.append(sp)

    datasets[sp] = pd.read_csv(".\\1 - Output\datasets\\"+sp+".csv",sep="\t",usecols=cols)
    datasets[sp]=datasets[sp].rename(columns={'Age_Discrete': 'Age'})
    datasets[sp]=datasets[sp].rename(columns={'SPY_Discrete': 'SPY'})
    datasets[sp]=datasets[sp].rename(columns={'TMB_Discrete': 'TMB'})
    cols.pop()

    allPatients.extend(list(pd.read_csv(".\\1 - Output\datasets\\"+sp+".csv",sep="\t",usecols=["PatientID"])["PatientID"]))

In [4]:
print("Repeting Patient",len(allPatients))
print("Unique Patient",len(set(allPatients)))

Repeting Patient 4652
Unique Patient 1044


## Joining Dataframes

In [5]:
features=[]
features.extend(list(datasets['Cell-Cell communication'].columns)[1:-1])
features.extend(list(datasets.keys()))
len(features),features#all patient and tumor features + all SP or subpathways with significant intersections

(19,
 ['Age',
  'Sex',
  'Smoking',
  'SPY',
  'Stage',
  'Status',
  'TMB',
  'Cell-Cell communication',
  'Cellular responses to stimuli -- Oncogene Induced Senescence',
  'Chromatin organization -- PKMTs methylate histone lysines',
  'Developmental Biology',
  'Extracellular matrix organization',
  'Gene expression (Transcription) -- Transcriptional regulation by RUNX3',
  'Immune System -- Dectin-2 family',
  'Metabolism of proteins -- SUMOylation',
  'Neuronal System',
  'Programmed Cell Death -- Activation of NOXA and translocation to mitochondria',
  'Signal Transduction',
  'Vesicle-mediated transport -- Scavenging by Class H Receptors'])

In [6]:
combinedDF=pd.DataFrame(index=list(set(allPatients)), columns=features)
combinedDF = combinedDF.map(lambda x: np.nan)

# Set dtype for the patients and tomur variables
for column in combinedDF.columns[:7]:
    combinedDF[column] = combinedDF[column].astype('object')

for column in combinedDF.columns[7:]:
    combinedDF[column] = combinedDF[column].astype('Int64')

combinedDF.head()

Unnamed: 0,Age,Sex,Smoking,SPY,Stage,Status,TMB,Cell-Cell communication,Cellular responses to stimuli -- Oncogene Induced Senescence,Chromatin organization -- PKMTs methylate histone lysines,Developmental Biology,Extracellular matrix organization,Gene expression (Transcription) -- Transcriptional regulation by RUNX3,Immune System -- Dectin-2 family,Metabolism of proteins -- SUMOylation,Neuronal System,Programmed Cell Death -- Activation of NOXA and translocation to mitochondria,Signal Transduction,Vesicle-mediated transport -- Scavenging by Class H Receptors
LUAD-CHTN-MAD08-00104,,,,,,,,,,,,,,,,,,,
LUAD-D01751,,,,,,,,,,,,,,,,,,,
TCGA-44-7660,,,,,,,,,,,,,,,,,,,
TCGA-55-6983,,,,,,,,,,,,,,,,,,,
TCGA-55-6985,,,,,,,,,,,,,,,,,,,


### Iterate all dataframes and update the combined dataframe with the info from indivuduals dfs

In [7]:
for df in datasets.values():
    for index, row in df.iterrows():
        features = list(row.keys())
        pId = row[features[0]]
        for f in features[1:]:
            if f in ["Age","Sex","Smoking","SPY","Stage","Status","TMB"]:
                combinedDF.at[pId,f]=row[f]
            else:
                combinedDF.at[pId,f]=int(row[f])

In [8]:
row[f],f

(1, 'Vesicle-mediated transport -- Scavenging by Class H Receptors')

In [9]:
combinedDF

Unnamed: 0,Age,Sex,Smoking,SPY,Stage,Status,TMB,Cell-Cell communication,Cellular responses to stimuli -- Oncogene Induced Senescence,Chromatin organization -- PKMTs methylate histone lysines,Developmental Biology,Extracellular matrix organization,Gene expression (Transcription) -- Transcriptional regulation by RUNX3,Immune System -- Dectin-2 family,Metabolism of proteins -- SUMOylation,Neuronal System,Programmed Cell Death -- Activation of NOXA and translocation to mitochondria,Signal Transduction,Vesicle-mediated transport -- Scavenging by Class H Receptors
LUAD-CHTN-MAD08-00104,,Male,,,,,4-8,,,,,,,,4,,,15,
LUAD-D01751,,Female,Current Smoker,15-29,IB,,4-8,,,,13,,3,,,11,1,19,
TCGA-44-7660,69-74,Male,Current Smoker,118-132,IB,0:LIVING,8-13,,2,,,9,,,,,1,,
TCGA-55-6983,80-85,Male,Reformed <=15,59-74,IIB,0:LIVING,0-4,,,,,,,,,,,12,
TCGA-55-6985,54-59,Female,Reformed <=15,44-59,IB,0:LIVING,13-17,6,,,19,9,,2,,11,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-85-7696,59-64,Male,Reformed <=15,29-44,IA,0:LIVING,4-8,,,2,,,,2,5,,1,,
TCGA-22-4604,69-74,Male,Current Smoker,44-59,IIA,1:DECEASED,4-8,4,,,13,,,,,7,1,23,1
TCGA-97-7941,69-74,Female,Reformed >15,15-29,IA,0:LIVING,0-4,,,,6,,2,,,4,,10,1
TCGA-18-5592,54-59,Male,Reformed >15,29-44,IIB,0:LIVING,8-13,,,,21,10,4,,5,,1,31,


In [10]:
combinedDF.isnull().sum()

Age                                                                              168
Sex                                                                                3
Smoking                                                                           51
SPY                                                                              233
Stage                                                                             31
Status                                                                           144
TMB                                                                                0
Cell-Cell communication                                                          705
Cellular responses to stimuli -- Oncogene Induced Senescence                     767
Chromatin organization -- PKMTs methylate histone lysines                        807
Developmental Biology                                                            455
Extracellular matrix organization                                

In [11]:
combinedDF.to_csv("1 - Output/datasets/CombinedDF.csv",sep="\t",index=True)