TRANSCISTOR_GOLDLAB.py

import os
import argparse
import math
import random
import matplotlib.pyplot as plt
import itertools
import csv
  
parser=argparse.ArgumentParser(description = "Calculating test statistic to classify cis interaction of UMLILO with nominal p-value as test of significance")

parser.add_argument("-g",nargs = 1,metavar = "GENCODE",type = str, help = "The location of GENCODE File containing positions of genes - which is used to calculate the distance of the TSS of each gene from the respective lncrna")
parser.add_argument("-meta",nargs = 1,type = str, help = "The location of the metadata file")
parser.add_argument("-fantom",nargs = 1,metavar="FANTOM",type = str, help = "The location of the FANTOM data files corresponding to each lncrna")
parser.add_argument("-o",nargs = 1,metavar = "csv file",type = str, help = "The location of the output file ")

args = parser.parse_args()

distance_dict_up={}
distance_dict_down={}
distance_dict_nontargs={}

''' load_meta() runs through the entire lncrna metadata file and stores different attributes as dictionaries with keys corresponding to the ENSEMBL gene id.
    The following information is collected -
    1) Experiment name
    2) Ensembl id
    3) Chromosome
    4) lncrna gene name
    5) TSS of the lncrna
    
    OUTPUT OF FUNCTION: 1 list and 4 dictionaries.
'''

def load_meta():
    global tss_dict
    global chr_dict
    global ensembl_dict
    global lncrna_name
    global exp_meta
    
    tss_dict={}
    chr_dict={}
    ensembl_dict={}
    lncrna_name={}
    with open(args.meta[0],'r') as f:
        exp_meta=[]
        next(f)
        f=f.readlines()
        for line in f:
            exp_name=line.split('.txt')[0]
            ensembl=line.split('\t')[1]
            chrom=line.split('\t')[2]
            lnc_gene_name=line.split('\t')[4]
            tss_start=line.split('\t')[5]
            
            exp_meta.append(str(exp_name))
            tss_dict.update({exp_name:tss_start})
            chr_dict.update({exp_name:chrom})
            ensembl_dict.update({exp_name:ensembl})
            lncrna_name.update({exp_name:lnc_gene_name})

''' load_gencode_data() runs through the GENCODE file and stores different attributes as dictionaries with keys corresponding to the ENSEMBL gene id.
    The following information is collected -
    1) Chromosome of gene
    2) TSS of gene
    
    OUTPUT OF FUNCTION: Two dictionaries - both with format {ENSEMBL GENE ID:DATA}
'''

def load_gencode_data():

    with open(args.g[0],'r') as g:

        p=1
        global chrom_gencode
        global dist_gencode
        chrom_gencode={}
        dist_gencode={}
        
        next(g)
        for line in g:
            line=line.split()
            chrom_gencode.update({line[0]:line[1]})
            dist_gencode.update({line[0]:line[2]})
  
'''
    Read filenames from the FANTOM data folder. Filenames are the Experiment Ids
    
    OUTPUT OF FUNCTION: A list with Experiment names
'''

def load_FANTOM_files():
    
    arr = os.listdir(args.fantom[0])
    exp_fantom=[name.split('.txt')[0] for name in arr]
    

'''
    The simulation(mode) function generates gene lists consisting of non-target genes from the corresponding lncrna experiment file.
    Since any lncrna experiment file will most likely contain different number of upregulated and downregulated genes, when simulating
    with non-targets, the correct NUMBER of genes must be chosen to replace the dysregulated genes.
    For example, if 150 genes were upregulated, then simulations must also contain 150 genes from the non-target set. This is necessary
    since the test-statistic calculation involves the number of genes in that dataset.
    
    There are three modes in which this function can be run - i)'UP' ii)'DOWN' iii)'BOTH'.
    'UP' corresponds to the case when non-target genes are to replace the upregulated genes.
    'DOWN' corresponds to the case when non-target genes are to replace the downregulated genes.
    'BOTH' corresponds to the case when non-target genes are to replace the both upregulated and downregulated genes.
    
    OUTPUT OF FUNCTION: A list consisting of appropriate number of non-target ENSEMBL gene ids
'''


def simulation(mode):
    other=list(distance_dict_nontargs.keys())
    random.shuffle(other)
    if mode=='UP':
        sim=other[:len(distance_dict_up.values())]
        return sim#,number#len(sim)
    elif mode=='DOWN':
        sim=other[:len(distance_dict_down.values())]
        return sim#,number#len(sim)

    elif mode=='BOTH':
        sim=other[:len(dist_ent.values())]
        return sim#,number#len(sim)
    

'''
    The test_stat() function calculates the average of the dataset. It reads off the list generated by simulation(mode),
    and gets test-statistic values by referring to the dictionary 'distance_dict_nontargs' which contains 1/d values.
    
    OUTPUT OF FUNCTION: The value of the test-statistic corresponding to the entire dataset
'''

iter=10000 #Number of simulations
def test_stat(inp):

    statistic=[]
    for gene in inp:
        statistic.append(distance_dict_nontargs[gene])
    return sum(statistic)/len(statistic)
    
'''
    index_hist_val(og_stat,mode) calculates the fraction of simulations with test-statistic values greater than the orginal
    lncrna experiment data.
    og_stat is the test-statistic value corresponding to the orginal dataset of a particular experiment file
    mode can assume three parameters - 'UP', 'DOWN' or 'BOTH'. The specified mode generates the respective p-value
    
    OUTPUT OF FUNCTION: Directional or overall p-value depending on the mode.
'''


def index_hist_val(og_stat,mode):
    counter=0
    if mode=='DOWN':
        for i in test_stat_val:
            if i>og_stat:
                counter=counter+1
        print('p-val (Down Reg.)',counter/len(test_stat_val))
        return counter/len(test_stat_val)
    
    elif mode=='UP':
        for i in test_stat_val:
            if i>og_stat:
                counter=counter+1
        print('p-val (Up Reg.)',counter/len(test_stat_val))
        return counter/len(test_stat_val)

    elif mode=='BOTH':
        for i in test_stat_val:
            if i>og_stat:
                counter=counter+1
        print('p-val (Both)',counter/len(test_stat_val))
        return counter/len(test_stat_val)
        
'''
    The evaluator() function is the main fucntion that takes each experiment file - parses through it to separate upregulated,
    downregulated, and non-target genes, then proceeds to calculate 1/d values. For each dictionary generated (distance_dict_up,
    distance_dict_down, and distance_dict_nontargs) in the format {ENSEMBL Gene ID:1/d}, the test-statistic is calculated and
    printed.
    Next, a .csv file is created to contain the output. Each dataset is then sent over to generate 10,000 simulations, compare the
    dataset's test-statistic value with those, and finally report p-values.

    OUTPUT OF FUNCTION: .csv file at the location that the user specifies and contains the following columns:
                         Experiment_ID, lncrna gene name, p-value(UPREGULATED), p-value(DOWNREGULATED), p-value(BOTH)
'''
    
def evaluator():

    with open(args.o[0]+'FANTOM_OUTPUT.csv','a') as w:
        header = ['Experiment_id' ,'lncRNA_name', 'pvalue_up','pvalue_down','pvalue_both']
        writer = csv.writer(w)
        writer.writerow(header)


    for i in exp_meta:
        with open(args.fantom[0]+'/'+i+'.txt','r') as f:
            
            global distance_dict_up
            global distance_dict_down
            global distance_dict_nontargs
            global test_stat_val
            global dist_ent
            distance_dict_up={}
            distance_dict_down={}
            distance_dict_nontargs={}
            
            p=1
            next(f)
            for line in f:
                line=line.split()
                #Each dictionary in this step will be setup as {GENCODE Gene ID : 1/d} where d - distance of gene from lncrna
                #A gene is only included if it is on the same chromosome as the lncrna as read off from the GENCODE file. The lncrna
                #knocked-out in the experiment file is not included in the list as it will introduce zero-error.
                
                if line[1]=='1' and line[0]!=ensembl_dict[i] and chrom_gencode.get(line[0])==chr_dict[i]:
                    d=abs(int(dist_gencode.get(line[0])) - int(tss_dict[i]))
                    distance_dict_up.update({line[0]:1/int(d**p)})
                    
                elif line[1]=='-1' and line[0]!=ensembl_dict[i] and chrom_gencode.get(line[0])==chr_dict[i]:
                    d=abs(int(dist_gencode.get(line[0])) - int(tss_dict[i]))
                    distance_dict_down.update({line[0]:1/int(d**p)})
                    
                elif line[1]=='0' and line[0]!=ensembl_dict[i] and chrom_gencode.get(line[0])==chr_dict[i]:
                    d=abs(int(dist_gencode.get(line[0])) - int(tss_dict[i]))
                    distance_dict_nontargs.update({line[0]:1/int(d**p)})
    
       
        dist_ent = {**distance_dict_up, **distance_dict_down}
        avg_up=sum(list(distance_dict_up.values()))/len(distance_dict_up.values())
        avg_down=sum(list(distance_dict_down.values()))/len(distance_dict_down.values())
        avg_entire=(sum(list(distance_dict_up.values()))+sum(list(distance_dict_down.values())))/(len(list(distance_dict_up.values()))+len(list(distance_dict_down.values())))
        print('Avg_up:',avg_up)
        print('Avg_down:',avg_down)
        print('Avg_entire:',avg_entire)
        
        
        with open(args.o[0]+'FANTOM_OUTPUT.csv','a') as w:
            writer = csv.writer(w)
            
            test_stat_val=[]
            for _ in range(iter):
                test_stat_val.append(test_stat(simulation('DOWN')))
            down=index_hist_val(avg_down,'DOWN')
            
            test_stat_val=[]
            for _ in range(iter):
                test_stat_val.append(test_stat(simulation('UP')))
            up=index_hist_val(avg_up,'UP')
            
            test_stat_val=[]
            for _ in range(iter):
                test_stat_val.append(test_stat(simulation('BOTH')))
            both=index_hist_val(avg_entire,'BOTH')
            
            data=[i,lncrna_name[i],up,down,both]
            writer.writerow(data)
    
 
load_meta()
load_gencode_data()
load_FANTOM_files()
evaluator()