In [1]:
import pickle
import random
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
N = 5
to_name = pd.read_csv('gene_data/genes.csv').set_index("code")["name"].to_dict()
to_ensembl = pd.read_csv('gene_data/genes.csv').set_index("name")["code"].to_dict()

### Suppose we select N genes and use their counts to make a case-ctrl prediction via logistic regression.  Is the prediction accuracy higher if the genes are on the same pathway?

First we load a dictionary that maps pathways (keys) to values (lists of genes)

In [2]:
with open('pairs.pkl', "rb") as f:
    _, _, _, _, pathway_dict = pickle.load(f)
pathways_with_min_N_genes = [key for key, values in list(pathway_dict.items()) if len(values) > N]
pathway_dict

{'Protein Synthesis: Asparagine': ['RPL17',
  'RPL23',
  'RPL26',
  'RPL27',
  'RPL34'],
 'Metabolic pathways': ['EPHX2', 'HK2', 'HPSE', 'MTHFR', 'NOS3'],
 'VEGF and VEGFR signaling network': ['FLT1',
  'PGF',
  'VEGFA',
  'VEGFB',
  'VEGFC'],
 'Insulin/IGF pathway-protein kinase B signaling cascade': ['IGF1', 'IGF2R'],
 'Protein Synthesis: Glycine': ['RPL17', 'RPL23', 'RPL26', 'RPL27', 'RPL34'],
 'Protein Synthesis: Serine': ['RPL17', 'RPL23', 'RPL26', 'RPL27', 'RPL34'],
 'Protein Synthesis: Leucine': ['RPL17', 'RPL23', 'RPL26', 'RPL27', 'RPL34'],
 'Protein Synthesis: Glutamine': ['RPL17', 'RPL23', 'RPL26', 'RPL27', 'RPL34'],
 'Protein Synthesis: Cysteine': ['RPL17', 'RPL23', 'RPL26', 'RPL27', 'RPL34'],
 'Protein Synthesis: Lysine': ['RPL17', 'RPL23', 'RPL26', 'RPL27', 'RPL34'],
 'Protein Synthesis: Proline': ['RPL17', 'RPL23', 'RPL26', 'RPL27', 'RPL34'],
 'Heterotrimeric GTP-binding protein coupled receptor signaling pathway (through_G_alpha_s,_cholera_toxin,_adenylate_cyclase_and_cA

Then we get N genes all from the same pathway

In [3]:
same_pathway_genes = random.sample(pathway_dict[random.choice(pathways_with_min_N_genes)], N)
same_pathway_genes

['BHLHE40', 'NDRG1', 'PLIN2', 'SERPINE1', 'LEP']

Then we get N genes from N different pathways

In [4]:
pathways = random.sample(pathways_with_min_N_genes, N)
different_pathway_genes = [random.choice(pathway_dict[p]) for p in pathways]
different_pathway_genes

['TNF', 'VEGFC', 'IGF1', 'TNF', 'LEP']

Then we load both of their counts

In [5]:
different_pathway_counts = np.log1p(pd.read_csv('gene_data/pe-rna-counts.csv', index_col=0)).loc[[to_ensembl[k] for k in different_pathway_genes], :]
different_pathway_counts

Unnamed: 0,PL1013,PL1015,PL1023,PL1043,PL1159,PL1182,PL1226,PL1362,PL1365,PL1383,...,PL2353,PL2360,PL2406,PL475,PL519,PL629,PL687,PL808,PL810,PL893
ENSG00000232810,5.4161,4.644391,4.532599,6.480045,0.0,6.871091,4.94876,5.31812,0.0,4.276666,...,6.70196,5.583496,6.109248,3.663562,1.098612,0.0,0.0,5.56452,2.70805,4.691348
ENSG00000150630,7.458186,4.744932,6.115892,5.609472,3.496508,6.610696,0.0,0.0,0.0,6.857514,...,5.283204,4.94876,6.523562,4.89784,2.302585,6.44254,0.0,6.833032,5.652489,0.0
ENSG00000017427,7.121252,4.744932,4.430817,3.912023,4.927254,6.895683,0.0,3.688879,4.962845,0.0,...,4.382027,3.73767,5.978886,0.0,0.0,0.0,0.0,5.97381,3.89182,6.51323
ENSG00000232810,5.4161,4.644391,4.532599,6.480045,0.0,6.871091,4.94876,5.31812,0.0,4.276666,...,6.70196,5.583496,6.109248,3.663562,1.098612,0.0,0.0,5.56452,2.70805,4.691348
ENSG00000174697,6.716595,3.931826,5.379897,0.0,0.0,0.0,0.0,5.545177,5.690359,4.356709,...,0.0,0.0,8.494334,4.382027,3.091042,6.246107,4.812184,6.717805,6.082219,0.0


In [6]:
same_pathway_counts = np.log1p(pd.read_csv('gene_data/pe-rna-counts.csv', index_col=0)).loc[[to_ensembl[k] for k in same_pathway_genes], :]
same_pathway_counts

Unnamed: 0,PL1013,PL1015,PL1023,PL1043,PL1159,PL1182,PL1226,PL1362,PL1365,PL1383,...,PL2353,PL2360,PL2406,PL475,PL519,PL629,PL687,PL808,PL810,PL893
ENSG00000134107,7.26403,6.997596,8.023552,7.855157,7.337588,6.727432,6.839476,7.078342,6.095825,7.387709,...,7.228388,7.956477,8.768885,6.708084,7.192181,5.866468,7.543803,8.359135,7.852828,6.042633
ENSG00000104419,7.865955,7.081708,7.36818,7.112548,7.589336,7.911691,7.646831,7.443078,8.034955,7.862206,...,7.766841,8.404472,8.65904,7.445418,6.132046,6.606954,6.904751,8.489443,7.312776,7.591357
ENSG00000147872,5.4161,5.533389,7.390799,6.745236,1.098612,0.0,7.136483,6.070738,0.0,5.141664,...,7.029088,7.109879,7.389564,7.360104,6.428105,5.252273,5.513429,5.826003,6.870053,5.631212
ENSG00000106366,7.262629,4.844187,5.686975,5.049856,0.0,6.536692,6.216606,4.927254,5.961005,3.258097,...,5.955837,7.351158,6.461468,6.113682,4.60517,3.044522,6.259581,0.0,4.828314,5.693732
ENSG00000174697,6.716595,3.931826,5.379897,0.0,0.0,0.0,0.0,5.545177,5.690359,4.356709,...,0.0,0.0,8.494334,4.382027,3.091042,6.246107,4.812184,6.717805,6.082219,0.0


And then we load the ground truth for the logistic regression model

In [7]:
raw_meta = pd.read_csv('gene_data/pe-rna-metadata.csv',header=None, names=["sample", "classification"]).T
catagorical_ground_truth  = pd.DataFrame(raw_meta.values[1:], columns=raw_meta.iloc[0])
catagorical_ground_truth.columns.name = None
catagorical_ground_truth

Unnamed: 0,PL1013,PL1015,PL1023,PL1043,PL1159,PL1182,PL1226,PL1362,PL1365,PL1383,...,PL2353,PL2360,PL2406,PL475,PL519,PL629,PL687,PL808,PL810,PL893
0,Control,Mild,Mild,Mild,Control,Control,Severe,Mild,Control,Severe,...,Mild,Severe,Severe,Severe,Severe,Severe,Mild,Mild,Mild,Severe


But this is not quite right because we are doing case-ctrl, while this is Control vs Severe vs Mild

In [8]:
ground_truth = [{"Control": "ctrl", "Severe": "case", "Mild": "case"}.get(item, item) for item in list(catagorical_ground_truth.loc[0])]
ground_truth

['ctrl',
 'case',
 'case',
 'case',
 'ctrl',
 'ctrl',
 'case',
 'case',
 'ctrl',
 'case',
 'case',
 'ctrl',
 'ctrl',
 'ctrl',
 'case',
 'ctrl',
 'case',
 'ctrl',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case']

As you can see, the sample names are in the same order for the counts and for the ground truth so we don't have to sort them

In [9]:
catagorical_ground_truth.columns == same_pathway_counts.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [10]:
catagorical_ground_truth.columns == different_pathway_counts.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

Then we create a logistic regression model

In [11]:
logreg = LogisticRegression(random_state=16)

What is the accuracy for counts of using only genes on the same pathway? Note that I am not doing any test/train splitting

In [12]:
logreg.fit(same_pathway_counts.T, ground_truth)
y_pred = logreg.predict(same_pathway_counts.T)
accuracy = y_pred==ground_truth
sum(accuracy)/len(accuracy)

0.972972972972973

What about the accuracy using only genes on different pathways?

In [13]:
logreg.fit(different_pathway_counts.T, ground_truth)
y_pred = logreg.predict(different_pathway_counts.T)
accuracy = y_pred==list(ground_truth)
sum(accuracy)/len(accuracy)

0.8378378378378378

This is promising!