In [1]:
import pandas as pd
import numpy as np
import requests
from anytree import Node, LevelOrderGroupIter
from intervaltree import Interval, IntervalTree
import yaml
import sys

with open("./configuration.yaml", "r") as yml_file:
    config = yaml.load(yml_file, yaml.Loader)

sys.path.insert(1, config['SCRIPTS_FOLDER'])
from assembly_converter import convert_assembly_hg19_to_hg38

In [16]:
lncdf = pd.read_csv("../data/lncRNA/lncipedia_5_2_hg19.bed", sep="\t", header = None)
lncdf.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
lncdf.rename({'chrom': 'chr', 'chromStart': 'start', 'chromEnd': 'end'}, inplace=True, axis=1)
lncdf

Unnamed: 0,chr,start,end,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts
0,chr1,84267198,84326229,LINC01725:44,0,-,84267198,84267198,000,3,1736116139,04839158892
1,chr16,74226290,74249420,lnc-ZFHX3-27:11,0,-,74226290,74226290,000,4,335261200170,0135131791422960
2,chrX,129611042,129658231,lnc-GPR119-1:1,0,-,129611042,129611042,000,12,181619212396784271001147790207,"0,8502,10397,14263,16666,18060,33346,34220,352..."
3,chr17,48634534,48636771,CACNA1G-AS1:3,0,-,48634534,48634534,000,2,28583,02154
4,chr1,2481358,2488450,TNFRSF14-AS1:18,0,-,2481358,2481358,000,6,200929189196197335,018553706418148047057
...,...,...,...,...,...,...,...,...,...,...,...,...
127427,chrX,102785694,102809881,LINC02589:3,0,-,102785694,102785694,000,4,1315093378,07140736923809
127428,chr5,180688212,180691568,TRIM52-AS1:36,0,+,180688212,180688212,000,3,173139516,024162840
127429,chr3,182216077,182224912,LINC01995:3,0,-,182216077,182216077,000,4,1343764342,0578963828793
127430,chr7,112262435,112348127,lnc-LSMEM1-3:4,0,+,112262435,112262435,000,8,23910888855719371336,010177128981345717781769497898685356


In [2]:
def read_data():
    df = pd.read_csv('../data/dataset_uncensored.csv')
    tst = pd.read_csv('../data/test_data_final.csv')
    tst = tst[['id', 'chr', 'start', 'end', 'ref', 'alt', 'driver', 'data_source']]
    df = pd.concat([df, tst]).reset_index(drop=True)
    return df

def read_lncrna_data():
    lncdf = pd.read_csv("../data/lncRNA/lncipedia_5_2_hg19.bed", sep="\t", header = None)
    lncdf.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
    lncdf.rename({'chrom': 'chr', 'chromStart': 'start', 'chromEnd': 'end'}, inplace=True, axis=1)
    lncdf['chr'] = lncdf['chr'].apply(lambda x: x.replace('chr', ''))
    return lncdf

def find_overlaps(df, lncdf):
    """This function divides the regions around the lncRNA into categories
    and finds the overlap of the given mutations with these regions"""
    lnc_tree = IntervalTree()

    for index, row in lncdf.iterrows():
        lnc_tree.add(Interval(row['start'], row['end'], (row['chr'], row['name'], '0kb')))

        lnc_tree.add(Interval(row['start']-2000, row['start'], (row['chr'], row['name'], '2kb upstream')))
        lnc_tree.add(Interval(row['start']-10000, row['start']-2000, (row['chr'], row['name'], '10kb upstream')))
        lnc_tree.add(Interval(row['start']-100000, row['start']-10000, (row['chr'], row['name'], '100kb upstream')))

        lnc_tree.add(Interval(row['end'], row['end']+2000, (row['chr'], row['name'], '2kb downstream')))
        lnc_tree.add(Interval(row['end']+2000, row['end']+10000, (row['chr'], row['name'], '10kb downstream')))
        lnc_tree.add(Interval(row['end']+10000, row['end']+100000, (row['chr'], row['name'], '100kb downstream')))

    df_tree = []

    for index, row in df.iterrows():
        df_tree.append(Node(name = row['chr'] + ':' + str(row['start']) + '-' + str(row['end']), chr = row['chr'], start = row['start'], end = row['end']))

    df['overlaps'] = 0
    df['overlap_lncrna'] = np.nan
    df['overlap_info'] = np.nan

    for index in range(len(df_tree)):
        child_list = [node for node in LevelOrderGroupIter(df_tree[index])][0]
        for node in child_list:
            found_list = list(set(list(lnc_tree.overlap(node.start, node.end)) + list(lnc_tree.at(node.start)) + list(lnc_tree.at(node.end))))  # use its coordinates to find interactions in the IntervalTree
            listindex = 0
            while listindex < len(found_list) and found_list[listindex].data[0] != node.chr:
                listindex += 1
            if listindex < len(found_list):
                children_left = True
                found_interaction = found_list[listindex].data
                Node(name = found_interaction[0] + ':' +  str(found_list[listindex].begin) + '-' +  str(found_list[listindex].end), chr = found_interaction[0], start = found_list[listindex].begin, end = found_list[listindex].end, checked = 0, parent = node)
                df.at[index, 'overlaps'] = len(df_tree[index].descendants)
                df.at[index, 'overlap_lncrna'] = found_interaction[1]
                df.at[index, 'overlap_info'] = found_interaction[2]
    return df

In [6]:
df = read_data()
lncdf = read_lncrna_data()
df1 = find_overlaps(df, lncdf)

In [15]:
df1['overlap_info'].value_counts()

100kb upstream      1265
100kb downstream    1113
0kb                  243
10kb downstream      168
10kb upstream        143
2kb downstream        88
2kb upstream          62
Name: overlap_info, dtype: int64

In [14]:
df1['data_source'].value_counts()

COSMIC                 1599
Rheinbay et al 2020     865
ICGC                    345
Dr.Nod 2023             197
TCGA                    142
Name: data_source, dtype: int64

In [12]:
df1[df1['overlaps'] == 0]

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,overlaps,overlap_lncrna,overlap_info
123,mut123,5,112440053,112440053,C,A,1,ICGC,0,,
450,mut450,11,48177352,48177352,G,C,1,TCGA,0,,
451,mut451,11,48185171,48185171,G,T,1,TCGA,0,,
515,mut515,10,111061663,111061663,A,T,0,COSMIC,0,,
519,mut519,1,75327940,75327940,G,C,0,COSMIC,0,,
...,...,...,...,...,...,...,...,...,...,...,...
2043,mut2043,13,65021556,65021556,G,T,0,COSMIC,0,,
2053,mut2053,11,101873252,101873252,A,G,0,COSMIC,0,,
2073,mut2073,2,102450926,102450926,G,A,0,COSMIC,0,,
2256,test170,3,102155761,102155761,G,C,1,Rheinbay et al 2020,0,,


In [13]:
df1.to_csv('../data/lncrna.csv', index=False)