In [3]:
import pandas as pd
import numpy as np
import requests
from anytree import Node, LevelOrderGroupIter
from intervaltree import Interval, IntervalTree
import yaml
import sys

# Load the project configuration from the YAML file one directory above the notebook.
with open("../configuration.yaml", "r") as yml_file:
    # NOTE(review): yaml.Loader can construct arbitrary Python objects from tagged YAML.
    # If the configuration only holds plain scalars/lists/maps, yaml.safe_load is the
    # safer choice — confirm before switching.
    config = yaml.load(yml_file, yaml.Loader)

# Put the configured scripts folder on the import path so the project-local helper
# module below can be imported. (The imported converter is not called in the cells
# visible here.)
sys.path.insert(1, config['SCRIPTS_FOLDER'])
from assembly_converter import convert_assembly_hg19_to_hg38

In [4]:
def read_data(train_path='../data/dataset_uncensored.csv',
              test_path='../data/test_data_final.csv'):
    """Load the mutation dataset and append the test mutations to it.

    Parameters
    ----------
    train_path : str or path-like
        CSV with the main mutation table. All of its columns are kept.
    test_path : str or path-like
        CSV with the test mutations. Only the columns listed in
        ``test_columns`` below are kept; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``train_path`` followed by rows of ``test_path``, with a fresh
        0..n-1 index.
    """
    test_columns = ['id', 'chr', 'start', 'end', 'ref', 'alt', 'driver', 'data_source']

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)[test_columns]

    # ignore_index=True renumbers the rows, so no separate reset_index is needed.
    return pd.concat([train, test], ignore_index=True)

def read_lncrna_data(bed_path="../data/lncRNA/lncipedia_5_2_hg19.bed"):
    """Load a 12-column, tab-separated, header-less BED file of lncRNA records.

    Parameters
    ----------
    bed_path : str or path-like
        Location of the BED file. Defaults to the LNCipedia file used by
        this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per BED record. The first three columns are named ``chr``,
        ``start`` and ``end``; the remaining nine keep their BED names.
        Every occurrence of the substring ``'chr'`` is removed from the
        ``chr`` column (e.g. ``'chr1'`` becomes ``'1'``).

    Raises
    ------
    ValueError
        If the file does not have exactly 12 columns (raised by pandas when
        the column names are assigned).
    """
    bed_columns = ['chr', 'start', 'end', 'name', 'score', 'strand',
                   'thickStart', 'thickEnd', 'itemRgb',
                   'blockCount', 'blockSizes', 'blockStarts']

    lncdf = pd.read_csv(bed_path, sep="\t", header=None)
    lncdf.columns = bed_columns

    # Literal (non-regex) replacement, matching plain str.replace semantics.
    lncdf['chr'] = lncdf['chr'].str.replace('chr', '', regex=False)
    return lncdf

def find_overlaps(df, lncdf):
    """Annotate each mutation with one lncRNA region it overlaps.

    Every lncRNA in ``lncdf`` contributes seven regions: its body (``'0kb'``)
    and three distance bands on each side (``'2kb'``, ``'10kb'``, ``'100kb'``,
    suffixed ``' upstream'`` or ``' downstream'``). A mutation is matched
    against the regions on its own chromosome using the closed range
    ``[start, end]``.

    When a mutation touches several regions, the closest band wins
    (``0kb`` before ``2kb`` before ``10kb`` before ``100kb``); remaining ties
    are broken by lncRNA name and then by label, so the result does not
    depend on set iteration order.

    Parameters
    ----------
    df : pandas.DataFrame
        Mutations with ``chr``, ``start`` and ``end`` columns. Modified in
        place: the three output columns are added (or overwritten).
    lncdf : pandas.DataFrame
        lncRNA records with ``chr``, ``start``, ``end`` and ``name`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with three columns set:
        ``overlaps`` (1 if any region was hit, else 0),
        ``overlap_lncrna`` (name of the selected lncRNA, NaN if none) and
        ``overlap_info`` (label of the selected region, NaN if none).

    Raises
    ------
    ValueError
        From intervaltree, if a region is empty (e.g. a record whose ``end``
        is not greater than its ``start``).
    """
    # (inner offset, outer offset, label prefix, priority rank) for the flanks.
    flank_bands = [
        (0, 2000, '2kb', 1),
        (2000, 10000, '10kb', 2),
        (10000, 100000, '100kb', 3),
    ]

    # One tree per chromosome, so a query never has to discard hits that
    # merely share coordinates on a different chromosome.
    trees = {}
    for chrom, start, end, name in zip(lncdf['chr'], lncdf['start'],
                                       lncdf['end'], lncdf['name']):
        chrom = str(chrom)
        tree = trees.get(chrom)
        if tree is None:
            tree = trees[chrom] = IntervalTree()

        # Interval payload is (priority rank, lncRNA name, region label).
        tree.add(Interval(start, end, (0, name, '0kb')))

        # NOTE(review): "upstream" is always the lower-coordinate side and
        # "downstream" the higher one; the BED strand column is not consulted.
        # Confirm this is intended for minus-strand transcripts.
        for inner, outer, band, rank in flank_bands:
            tree.add(Interval(start - outer, start - inner,
                              (rank, name, band + ' upstream')))
            tree.add(Interval(end + inner, end + outer,
                              (rank, name, band + ' downstream')))

    hit_flags = []
    hit_names = []
    hit_labels = []

    for chrom, start, end in zip(df['chr'], df['start'], df['end']):
        tree = trees.get(str(chrom))
        hits = set()
        if tree is not None:
            # Tree intervals are half-open, so overlap(start, end) alone would
            # miss regions that begin exactly at `end`, and returns nothing
            # when start == end; the two point queries close the range.
            hits = set().union(tree.overlap(start, end),
                               tree.at(start),
                               tree.at(end))

        if hits:
            best = min(hits, key=lambda iv: (iv.data[0], str(iv.data[1]), iv.data[2]))
            hit_flags.append(1)
            hit_names.append(best.data[1])
            hit_labels.append(best.data[2])
        else:
            hit_flags.append(0)
            hit_names.append(np.nan)
            hit_labels.append(np.nan)

    # Assign whole columns positionally: no dependence on df's index labels and
    # no string writes into a float (NaN-initialised) column.
    df['overlaps'] = hit_flags
    df['overlap_lncrna'] = hit_names
    df['overlap_info'] = hit_labels
    return df

In [5]:
# Load the mutations and the lncRNA records, then annotate the mutations.
# find_overlaps adds its columns to `df` itself and returns that same object,
# so `df1` and `df` refer to the same annotated frame.
df = read_data()
lncdf = read_lncrna_data()
df1 = find_overlaps(df, lncdf)

In [6]:
df1  # annotated mutations; for a per-region tally evaluate df1['overlap_info'].value_counts()

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,overlaps,overlap_lncrna,overlap_info
0,mut0,3,101578254,101578255,CAGTT,C,1,ICGC,1,lnc-CEP97-4:1,100kb downstream
1,mut1,3,101578285,101578286,GACCATTTGCCTT,G,1,ICGC,1,lnc-CEP97-4:1,100kb downstream
2,mut2,3,101578250,101578251,CTG,C,1,ICGC,1,lnc-CEP97-4:1,100kb downstream
3,mut3,17,48940015,48940016,CTAAAT,C,1,ICGC,1,lnc-TOB1-1:1,100kb downstream
4,mut4,17,48939986,48939987,AT,A,1,ICGC,1,lnc-TOB1-1:1,100kb downstream
...,...,...,...,...,...,...,...,...,...,...,...
2143,test1116,9,125026995,125026996,C,G,1,Dr.Nod 2023,1,lnc-MRRF-2:1,100kb upstream
2144,test1117,9,124049461,124049462,T,A,1,Dr.Nod 2023,1,lnc-GSN-2:1,100kb upstream
2145,test1118,X,70338403,70338404,G,A,1,Dr.Nod 2023,1,lnc-ZMYM3-1:2,100kb upstream
2146,test1119,X,70401837,70401838,G,T,1,Dr.Nod 2023,1,lnc-IL2RG-5:1,0kb
