# Why some genes have differential expression but are not in PPI network?

In [1]:
import numpy as np
import pandas as pd
import h5py

In [2]:
def load_diff_expr(file_name_up, file_name_down):
    """Loads differential up and down-regulated genes."""
    differential_expression_up = pd.read_html(file_name_up,
                                           index_col=2,
                                           header=0
                                          )[0]
    differential_expression_down = pd.read_html(file_name_down,
                                               index_col=2,
                                               header=0
                                               )[0]
    print ("Loaded Differential Expression from html...")

    # concatenate the up and down-regulated genes
    differential_expression_down.drop('Ensembl', inplace=True)
    differential_expression_down = differential_expression_down.convert_objects(convert_numeric=True)
    de = pd.concat([differential_expression_up, differential_expression_down])
    return de

In [4]:
# load network
with h5py.File('../data/preprocessing/ppi_networks.h5', 'r') as f:
    gene_expression_data = f['gene_expression'][:]
    ppi_network = f['consensusPathDB_ppi'][:]
    gene_names = f['gene_names'][:]

de = load_diff_expr('../data/differential_expression/DEanalysis/gpp1605-up.html',
                    '../data/differential_expression/DEanalysis/gpp1605-down.html'
                   )
gene_names_df = pd.DataFrame(gene_names, columns=['Ensembl', 'Gene-name'])
gene_names_df = gene_names_df.set_index('Ensembl')

Loaded Differential Expression from html...




In [62]:
common = de.join(gene_names_df, lsuffix='left', how='inner')

In [70]:
gene_names_df[~de.isin(gene_names_df)].dropna()

Unnamed: 0_level_0,Gene-name
Ensembl,Unnamed: 1_level_1


In [64]:
n = gene_names[:,0]
'ENSG00000000457' in n

True

In [65]:
'ENSG00000000457' in de.index

False

In [66]:
common.shape

(4754, 9)

In [67]:
gene_names_df.shape[0] - common.shape[0]

5613

In [68]:
gene_names_df[gene_names_df['Gene-name'] == 'SCYL3']

Unnamed: 0_level_0,Gene-name
Ensembl,Unnamed: 1_level_1
ENSG00000000457,SCYL3


In [69]:
de[de['Gene-name'] == 'SCYL3']

Unnamed: 0_level_0,Gene-type,Gene-name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [72]:
gene_names_df['Gene-name'].unique().shape

(10355,)

In [73]:
gene_names_df.shape

(10367, 1)

In [75]:
de['Gene-name'].unique().shape

(8856,)

In [80]:
gene_names_df[de.isin(gene_names_df)].dropna().shape

(4754, 1)

In [81]:
de.isin(gene_names_df)

Unnamed: 0_level_0,Gene-type,Gene-name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000140403,False,True,False,False,False,False,False,False
ENSG00000132002,False,True,False,False,False,False,False,False
ENSG00000162616,False,True,False,False,False,False,False,False
ENSG00000120694,False,True,False,False,False,False,False,False
ENSG00000149257,False,True,False,False,False,False,False,False
ENSG00000178381,False,True,False,False,False,False,False,False
ENSG00000110172,False,False,False,False,False,False,False,False
ENSG00000151929,False,True,False,False,False,False,False,False
ENSG00000168685,False,True,False,False,False,False,False,False
ENSG00000109846,False,True,False,False,False,False,False,False


In [86]:
gene_names_df.join(de, how='inner', lsuffix='_l').shape

(4754, 9)