In [11]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD

Import National Institute of Health Genomic Data Commons (GDA) and Human Protein Atlas (HPA) Data 

In [12]:
# Process the gdc dataset to include only desired columns and scores
gdc = pd.read_csv("../data/gdc_luad_genes.csv")

featureNameColumns = ['# SSM Affected Cases in Cohort', '# CNV Gain', '# CNV Loss']
for i in featureNameColumns:
    gdc[['1', '2', '3', '4']] = gdc[i].replace({',':''}, regex=True).str.split(' ', 3, expand=True)
    gdc[i] = gdc['1'].astype(float)/gdc['3'].astype(float)
    gdc = gdc.drop(['1', '2', '3', '4'], axis=1)

gdc.drop(['Symbol', 'Name', 'Cytoband', 'Type', 'Annotations', 'Survival'], axis=1, inplace=True)

gdc[['1', '2', '3']] = gdc['# SSM Affected Cases Across the GDC'].replace({',':''}, regex=True).str.split(' ', 2, expand=True)
gdc['# SSM Affected Cases Across the GDC'] = gdc['1'].astype(float)/gdc['3'].astype(float)
gdc = gdc.drop(['1', '2', '3'], axis=1)

gdc = gdc.rename({'# SSM Affected Cases in Cohort': 'nih_ssm_in_cohort', '# SSM Affected Cases Across the GDC':'nih_ssm_across_gdc',
    '# CNV Gain':'nih_cnv_gain', '# CNV Loss':'nih_cnv_loss', 'Gene ID':'ensembl', '# Mutations':'nih_tot_mutations'}, axis=1)

gdc

Unnamed: 0,ensembl,nih_ssm_in_cohort,nih_ssm_across_gdc,nih_cnv_gain,nih_cnv_loss,nih_tot_mutations
0,ENSG00000147481,0.144621,0.028155,0.076023,0.056530,92
1,ENSG00000105877,0.144621,0.084772,0.074074,0.007797,105
2,ENSG00000188107,0.144621,0.068585,0.111111,0.056530,95
3,ENSG00000125414,0.144621,0.062601,0.005848,0.042885,95
4,ENSG00000009694,0.144621,0.071653,0.052632,0.025341,110
...,...,...,...,...,...,...
21087,ENSG00000146648,0.146384,0.052244,0.105263,0.019493,51
21088,ENSG00000133958,0.146384,0.059916,0.025341,0.046784,110
21089,ENSG00000134376,0.146384,0.056617,0.085770,0.007797,93
21090,ENSG00000109061,0.146384,0.057691,0.009747,0.040936,93


In [13]:
hpa = pd.read_csv('../data/hpa_gene_features.tsv', sep='\t').drop_duplicates(subset='Gene')

identifiers = [
    "Gene",
    "Ensembl"
]
discrete_features = [
    "Protein class",
    "Biological process",
    "Molecular function",
    "Disease involvement",
    "Subcellular location",
]
continuous_features = [
    "Tissue RNA - lung [NX]",
    "Single Cell Type RNA - Mucus-secreting cells [NX]"
]

hpa_features = hpa.iloc[:, hpa.columns.isin(identifiers+discrete_features+continuous_features)]


print("Feature Sparsity:\n", hpa_features.isna().sum())
hpa_features["Tissue RNA - lung [NX]"] = (hpa_features["Tissue RNA - lung [NX]"] - hpa_features["Tissue RNA - lung [NX]"].mean()) / hpa_features["Tissue RNA - lung [NX]"].std()
col = "Single Cell Type RNA - Mucus-secreting cells [NX]"
hpa_features[col] = (hpa_features[col] - hpa_features[col].mean()) / hpa_features[col].std()

def explode(feature) :
    return feature.apply(lambda x: x.replace(' ', '').split(','))

hpa_clean = hpa.fillna('')
for ft in discrete_features :
    hpa_clean[ft] = explode(hpa_clean[ft])

protein_class = hpa_clean["Protein class"].explode().unique()
biological_process = hpa_clean["Biological process"].explode().unique()
molecular_function = hpa_clean["Molecular function"].explode().unique()
disease_involvement = hpa_clean["Disease involvement"].explode().unique()
subcellular_location = hpa_clean["Subcellular location"].explode().unique()
GO_features = np.concatenate([protein_class, biological_process, molecular_function, disease_involvement, subcellular_location])

RowFeatures = pd.DataFrame(data = 0,index = hpa_clean['Ensembl'],columns=GO_features)
counter = 0

hpa_clean

for index, row in RowFeatures.iterrows() :
    features = hpa_clean.iloc[counter][['Protein class', 'Biological process', 'Molecular function', 'Disease involvement', 'Subcellular location']].to_list()
    flattened = [item for sublist in features for item in sublist if item]
    for t in flattened :
        row[t] = 1
    counter +=1 

n_comp = 100
svd = TruncatedSVD(n_components = n_comp)
svdModel = svd.fit(RowFeatures)
visits_emb = svdModel.transform(RowFeatures)
hpa = pd.DataFrame(data=visits_emb, index=RowFeatures.index)

Feature Sparsity:
 Gene                                                     0
Ensembl                                                  0
Protein class                                            0
Biological process                                    6858
Molecular function                                    6651
Disease involvement                                  10548
Subcellular location                                  3666
Tissue RNA - lung [NX]                                   0
Single Cell Type RNA - Mucus-secreting cells [NX]        0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpa_features["Tissue RNA - lung [NX]"] = (hpa_features["Tissue RNA - lung [NX]"] - hpa_features["Tissue RNA - lung [NX]"].mean()) / hpa_features["Tissue RNA - lung [NX]"].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpa_features[col] = (hpa_features[col] - hpa_features[col].mean()) / hpa_features[col].std()


Import Disease Gene Network (DGN) data 

In [15]:
dgn = pd.read_csv("../data/gda_disease_summary_luad.csv")

dgn_threshold = 0.02

dgn = dgn[['ensembl', 'EI_gda', 'Score_gda']].loc[dgn['EI_gda'] > dgn_threshold]
dgn.rename({'Score_gda':'gda_score'}, axis=1, inplace=True)


dgn

KeyError: "['ensembl'] not in index"