In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from PathwayAnalysis import PathwayAnalysis
N = 5
to_name = pd.read_csv('gene_data/genes.csv').set_index("code")["name"].to_dict()
to_ensembl = pd.read_csv('gene_data/genes.csv').set_index("name")["code"].to_dict()

### Suppose we select N genes and use their counts to make a case-ctrl prediction via logistic regression.  Is the prediction accuracy higher if the genes are on the same pathway?

First we load a dictionary that maps pathways (keys) to values (lists of genes)

In [2]:
p = PathwayAnalysis(
    gene_path="gene_data/genes.csv",
    rnacounts_path="gene_data/pe-rna-counts.csv",
    metadata_path="gene_data/pe-rna-metadata.csv",
)

100%|██████████| 71/71 [00:26<00:00,  2.67it/s]
  mcc = ((tn*tp)-(fp*fn))/(((tn+fn)*(fp+tp)*(tn+fp)*(fn+tp))**.5)
  mcc = ((tn*tp)-(fp*fn))/(((tn+fn)*(fp+tp)*(tn+fp)*(fn+tp))**.5)
  mcc = ((tn*tp)-(fp*fn))/(((tn+fn)*(fp+tp)*(tn+fp)*(fn+tp))**.5)
  mcc = ((tn*tp)-(fp*fn))/(((tn+fn)*(fp+tp)*(tn+fp)*(fn+tp))**.5)
  mcc = ((tn*tp)-(fp*fn))/(((tn+fn)*(fp+tp)*(tn+fp)*(fn+tp))**.5)
  mcc = ((tn*tp)-(fp*fn))/(((tn+fn)*(fp+tp)*(tn+fp)*(fn+tp))**.5)


In [3]:
pathways_with_min_N_genes = [pathway for pathway, genes in p.pathways_to_genes.items() if len(genes)>N]

Then we get N genes all from the same pathway

In [4]:
same_pathway_genes = random.sample(p.pathways_to_genes[random.choice(pathways_with_min_N_genes)], N)
same_pathway_genes

['TNF', 'VEGFC', 'VEGFA', 'VEGFB', 'LEP']

Then we get N genes from N different pathways

In [5]:
pathways = random.sample(pathways_with_min_N_genes, N)
different_pathway_genes = [random.choice(p.pathways_to_genes[pathway]) for pathway in pathways]
different_pathway_genes

['VEGFA', 'LEP', 'VEGFA', 'IGF1', 'VEGFB']

Then we load both of their counts

In [6]:
different_pathway_counts = np.log1p(pd.read_csv('gene_data/pe-rna-counts.csv', index_col=0)).loc[[to_ensembl[k] for k in different_pathway_genes], :]
different_pathway_counts

Unnamed: 0,PL1013,PL1015,PL1023,PL1043,PL1159,PL1182,PL1226,PL1362,PL1365,PL1383,...,PL2353,PL2360,PL2406,PL475,PL519,PL629,PL687,PL808,PL810,PL893
ENSG00000112715,0.0,6.257668,0.0,5.192957,5.993961,0.0,0.0,5.802118,0.0,6.276643,...,6.352629,7.633854,0.0,5.347108,4.70953,6.272877,5.723585,5.652489,4.744932,5.402677
ENSG00000174697,6.716595,3.931826,5.379897,0.0,0.0,0.0,0.0,5.545177,5.690359,4.356709,...,0.0,0.0,8.494334,4.382027,3.091042,6.246107,4.812184,6.717805,6.082219,0.0
ENSG00000112715,0.0,6.257668,0.0,5.192957,5.993961,0.0,0.0,5.802118,0.0,6.276643,...,6.352629,7.633854,0.0,5.347108,4.70953,6.272877,5.723585,5.652489,4.744932,5.402677
ENSG00000017427,7.121252,4.744932,4.430817,3.912023,4.927254,6.895683,0.0,3.688879,4.962845,0.0,...,4.382027,3.73767,5.978886,0.0,0.0,0.0,0.0,5.97381,3.89182,6.51323
ENSG00000173511,7.381502,6.632002,7.677864,7.649216,6.317165,6.870053,7.63337,7.052721,6.739337,6.740519,...,7.930206,7.288244,7.198184,7.363914,3.637586,6.590301,7.608871,7.956477,7.7012,5.872118


In [7]:
same_pathway_counts = np.log1p(pd.read_csv('gene_data/pe-rna-counts.csv', index_col=0)).loc[[to_ensembl[k] for k in same_pathway_genes], :]
same_pathway_counts

Unnamed: 0,PL1013,PL1015,PL1023,PL1043,PL1159,PL1182,PL1226,PL1362,PL1365,PL1383,...,PL2353,PL2360,PL2406,PL475,PL519,PL629,PL687,PL808,PL810,PL893
ENSG00000232810,5.4161,4.644391,4.532599,6.480045,0.0,6.871091,4.94876,5.31812,0.0,4.276666,...,6.70196,5.583496,6.109248,3.663562,1.098612,0.0,0.0,5.56452,2.70805,4.691348
ENSG00000150630,7.458186,4.744932,6.115892,5.609472,3.496508,6.610696,0.0,0.0,0.0,6.857514,...,5.283204,4.94876,6.523562,4.89784,2.302585,6.44254,0.0,6.833032,5.652489,0.0
ENSG00000112715,0.0,6.257668,0.0,5.192957,5.993961,0.0,0.0,5.802118,0.0,6.276643,...,6.352629,7.633854,0.0,5.347108,4.70953,6.272877,5.723585,5.652489,4.744932,5.402677
ENSG00000173511,7.381502,6.632002,7.677864,7.649216,6.317165,6.870053,7.63337,7.052721,6.739337,6.740519,...,7.930206,7.288244,7.198184,7.363914,3.637586,6.590301,7.608871,7.956477,7.7012,5.872118
ENSG00000174697,6.716595,3.931826,5.379897,0.0,0.0,0.0,0.0,5.545177,5.690359,4.356709,...,0.0,0.0,8.494334,4.382027,3.091042,6.246107,4.812184,6.717805,6.082219,0.0


And then we load the ground truth for the logistic regression model

In [8]:
raw_meta = pd.read_csv('gene_data/pe-rna-metadata.csv',header=None, names=["sample", "classification"]).T
catagorical_ground_truth  = pd.DataFrame(raw_meta.values[1:], columns=raw_meta.iloc[0])
catagorical_ground_truth.columns.name = None
catagorical_ground_truth

Unnamed: 0,PL1013,PL1015,PL1023,PL1043,PL1159,PL1182,PL1226,PL1362,PL1365,PL1383,...,PL2353,PL2360,PL2406,PL475,PL519,PL629,PL687,PL808,PL810,PL893
0,Control,Mild,Mild,Mild,Control,Control,Severe,Mild,Control,Severe,...,Mild,Severe,Severe,Severe,Severe,Severe,Mild,Mild,Mild,Severe


But this is not quite right because we are doing case-ctrl, while this is Control vs Severe vs Mild

In [9]:
ground_truth = [{"Control": "ctrl", "Severe": "case", "Mild": "case"}.get(item, item) for item in list(catagorical_ground_truth.loc[0])]
ground_truth

['ctrl',
 'case',
 'case',
 'case',
 'ctrl',
 'ctrl',
 'case',
 'case',
 'ctrl',
 'case',
 'case',
 'ctrl',
 'ctrl',
 'ctrl',
 'case',
 'ctrl',
 'case',
 'ctrl',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case',
 'case']

As you can see, the sample names are in the same order for the counts and for the ground truth so we don't have to sort them

In [10]:
catagorical_ground_truth.columns == same_pathway_counts.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [11]:
catagorical_ground_truth.columns == different_pathway_counts.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

Then we create a logistic regression model

In [19]:
logreg = LogisticRegression()

What is the accuracy for counts of using only genes on the same pathway? Note that I am not doing any test/train splitting

In [20]:
logreg.fit(same_pathway_counts.T, ground_truth)
y_pred = logreg.predict(same_pathway_counts.T)
accuracy = y_pred==ground_truth
sum(accuracy)/len(accuracy)

0.8918918918918919

What about the accuracy using only genes on different pathways?

In [21]:
logreg.fit(different_pathway_counts.T, ground_truth)
y_pred = logreg.predict(different_pathway_counts.T)
accuracy = y_pred==list(ground_truth)
sum(accuracy)/len(accuracy)

0.8918918918918919

This is promising!