In [1]:
import pandas as pd
import numpy as np
import os
import io

In [2]:
def calculate_total_interactions(metadata):
    """Count the raw ChIA-PET interactions available for CTCF and for POLR2A.

    Rows of ``metadata`` whose 'Biosample term name' contains 'positive',
    'activated' or 'T-cell' are left out.  For every remaining row whose
    'Experiment target' contains the target name, the file
    'ChIA-PET data/<File accession>.bedpe' is read (tab separated, no header)
    and its number of rows is added to that target's total.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must provide the columns 'Biosample term name', 'Experiment target'
        and 'File accession'.

    Returns
    -------
    tuple of (int, int)
        ``(sum_ctcf, sum_polr2a)`` - total row counts over the selected
        CTCF files and the selected POLR2A files respectively.

    Notes
    -----
    Missing values in the two text columns are treated as "does not contain"
    (``na=False``); without that, negating the mask fails with a TypeError as
    soon as one biosample name is missing.  An empty .bedpe file contributes
    zero interactions instead of raising ``EmptyDataError``.
    """
    biosample = metadata['Biosample term name'].str
    excluded = (
        biosample.contains('positive', na=False)
        | biosample.contains('activated', na=False)
        | biosample.contains('T-cell', na=False)
    )

    def _count_rows(target):
        """Sum the row counts of every retained .bedpe file for one target."""
        is_target = metadata['Experiment target'].str.contains(target, na=False)
        total = 0
        for accession in metadata.loc[~excluded & is_target, 'File accession']:
            try:
                bedpe = pd.read_csv('ChIA-PET data/' + accession + '.bedpe', sep='\t', header=None)
            except pd.errors.EmptyDataError:
                # A file with no content simply holds no interactions.
                continue
            total = total + len(bedpe)
        return total

    return _count_rows('CTCF'), _count_rows('POLR2A')

def add_interactions(metadata, all_files, sum_ctcf, sum_polr2a):
    """Sum the per-file CTCF / POLR2A counts and normalize them by the totals.

    Rows of ``metadata`` whose 'Biosample term name' contains 'positive',
    'activated' or 'T-cell' are left out; the 'File accession' values of the
    remaining rows decide which entries of ``all_files`` are used.  Each used
    file 'Results/<file>' is read and its four count columns are added,
    element-wise, onto a running total.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must provide the columns 'Biosample term name' and 'File accession'.
    all_files : iterable of str
        File names inside 'Results' (e.g. 'processed_<accession>.csv').
    sum_ctcf, sum_polr2a : number
        Denominators for the CTCF_* and POLR2A_* columns respectively.

    Returns
    -------
    pandas.DataFrame
        Columns 'chr', 'start', 'end', 'pos_37', 'driver' taken from the first
        retained file, plus the four normalized count columns.

    Notes
    -----
    Missing biosample names are treated as "does not contain" (``na=False``)
    so that negating the mask cannot fail on NaN.
    NOTE(review): the accumulation aligns rows on the DataFrame index, so it
    presumably relies on every processed file listing the same mutations in
    the same order - confirm; a file with fewer rows would turn the missing
    rows into NaN.
    """
    count_cols = ['CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']

    biosample = metadata['Biosample term name'].str
    excluded = (
        biosample.contains('positive', na=False)
        | biosample.contains('activated', na=False)
        | biosample.contains('T-cell', na=False)
    )
    files_to_keep = list(metadata.loc[~excluded, 'File accession'])
    keep_lookup = set(files_to_keep)   # constant-time membership test inside the loop

    # using an arbitrary file as starting dataframe (only its coordinate columns are kept)
    df = pd.read_csv('Results/processed_' + files_to_keep[0] + '.csv')
    df = df[['chr', 'start', 'end', 'pos_37', 'driver']].copy()
    for col in count_cols:
        df[col] = 0    # running totals start at zero

    for file in all_files:
        accession = file.replace('processed_', '').replace('.csv', '')
        if accession in keep_lookup:
            f = pd.read_csv('Results/' + file)
            df[count_cols] = df[count_cols].add(f[count_cols])

    df['CTCF_interactions'] = df['CTCF_interactions'] / sum_ctcf
    df['CTCF_chains'] = df['CTCF_chains'] / sum_ctcf
    df['POLR2A_interactions'] = df['POLR2A_interactions'] / sum_polr2a
    df['POLR2A_chains'] = df['POLR2A_chains'] / sum_polr2a
    return df

def long_range_interactions_results():
    """Attach the normalized CTCF / POLR2A ChIA-PET features to the mutation dataset.

    Reads the list of processed files in 'Results', the ChIA-PET metadata
    table and '../data/dataset_uncensored.csv', computes the per-target
    interaction totals, builds the combined normalized table and joins it to
    the mutation dataset on (chr, start/pos_37, driver).

    Returns
    -------
    pandas.DataFrame
        The joined table without the helper column 'pos_37'.

    NOTE(review): the join is a *right* join, i.e. every row of the combined
    interaction table is kept even when the dataset has no matching mutation -
    confirm that this is the intended direction.
    """
    feature_cols = ['CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']

    result_files = os.listdir('Results')
    chia_metadata = pd.read_csv('ChIA-PET data/metadata.tsv', sep='\t')
    mutations = pd.read_csv('../data/dataset_uncensored.csv')

    total_ctcf, total_polr2a = calculate_total_interactions(chia_metadata)
    interactions = add_interactions(chia_metadata, result_files, total_ctcf, total_polr2a)

    annotated = mutations.merge(
        interactions[['chr', 'pos_37', 'driver'] + feature_cols],
        how='right',
        left_on=['chr', 'start', 'driver'],
        right_on=['chr', 'pos_37', 'driver'],
    )
    # 'pos_37' only served as the join key on the interaction side.
    return annotated.drop('pos_37', axis=1)

In [34]:
# Inputs shared by the exploratory cells: the names of the processed per-file
# results, the ChIA-PET metadata table and the mutation dataset.
all_files = os.listdir('Results')
metadata = pd.read_csv('ChIA-PET data/metadata.tsv', sep='\t')
df = pd.read_csv('../data/dataset_uncensored.csv')

# Total raw interaction counts per target; passed to add_interactions() as the
# normalization denominators.
sum_ctcf, sum_polr2a = calculate_total_interactions(metadata)

In [5]:
# Keep only the coordinate / label columns of one processed result file.
dfraw = pd.read_csv('Results/processed_ENCFF986WPD.csv') # using an arbitrary file as starting dataframe
dfraw = dfraw[['chr', 'start', 'end', 'pos_37', 'driver']]
# dfraw.rename({'start_hg19': 'pos_37'}, axis = 1, inplace = True)
dfraw

Unnamed: 0,chr,start,end,pos_37,driver
0,1,43981392,43981392,44447064,0
1,1,193142068,193142068,193111198,1
2,1,239913497,239913497,240076797,0
3,1,189107161,189107161,189076292,0
4,1,212035724,212035724,212209066,1
...,...,...,...,...,...
1043,X,54527681,54527681,54554114,0
1044,X,75622499,75622499,74842334,0
1045,X,50192749,50192749,49957400,0
1046,X,73034552,73034552,72254391,0


In [17]:
# Build the labelled mutation table: drivers (driver == 1) from ICGC/TCGA
# stacked on top of the COSMIC negatives (driver == 0).
# Both reader functions must already be defined in the kernel when this runs.
positive_set = read_ICGC_TCGA_data()
negative_set = read_COSMIC_data()

raw_data = pd.concat([positive_set, negative_set])
raw_data.reset_index(inplace=True, drop=True)

In [29]:
# Match every raw mutation to the processed table via (chr, position, driver);
# the left join keeps raw mutations that have no match (pos_37 becomes NaN).
# Not idempotent: the in-place drop raises KeyError when the cell is re-run,
# because 'start' and 'end' are already gone from dfraw.
dfraw.drop(['start', 'end'], inplace=True, axis=1)
merged = raw_data.merge(dfraw, left_on=['chr', 'start', 'driver'], right_on=['chr', 'pos_37', 'driver'], how='left')
merged.drop_duplicates(inplace=True)
merged.reset_index(inplace=True, drop=True)
merged

Unnamed: 0,chr,start,ref,alt,data_source,driver,pos_37
0,3,101578255,AGTT,-,ICGC,1,101578255.0
1,3,101578286,ACCATTTGCCTT,-,ICGC,1,101578286.0
2,3,101578251,TG,-,ICGC,1,101578251.0
3,17,48940016,TAAAT,-,ICGC,1,48940016.0
4,17,48939987,T,-,ICGC,1,48939987.0
...,...,...,...,...,...,...,...
1081,4,95496939,C,T,COSMIC,0,95496939.0
1082,6,53026396,C,T,COSMIC,0,53026396.0
1083,2,220077160,G,T,COSMIC,0,220077160.0
1084,16,31471209,C,T,COSMIC,0,31471209.0


In [31]:
# calculate_end_coordinates() looks up `requests` as a global, so the import
# has to happen before the call.
import requests
# The function modifies the frame it receives and returns that same object,
# so `merged` and `merged1` refer to the same, updated DataFrame afterwards.
merged1 = calculate_end_coordinates(merged)

Calling UCSC Genome Browser API to extract end coordinates. This may take some time...
Extraction complete!


In [41]:
# Display the frame returned by calculate_end_coordinates().
merged1

Unnamed: 0,chr,new_pos37,ref,alt,data_source,driver,pos_37,end
0,3,101578254,CAGTT,C,ICGC,1,101578255.0,101578255
1,3,101578285,GACCATTTGCCTT,G,ICGC,1,101578286.0,101578286
2,3,101578250,CTG,C,ICGC,1,101578251.0,101578251
3,17,48940015,CTAAAT,C,ICGC,1,48940016.0,48940016
4,17,48939986,AT,A,ICGC,1,48939987.0,48939987
...,...,...,...,...,...,...,...,...
1081,4,95496939,C,T,COSMIC,0,95496939.0,95496939
1082,6,53026396,C,T,COSMIC,0,53026396.0,53026396
1083,2,220077160,G,T,COSMIC,0,220077160.0,220077160
1084,16,31471209,C,T,COSMIC,0,31471209.0,31471209


In [215]:
# Spot check: look up a single position (chr 3, pos_37 176755883) in dfraw.
dfraw[(dfraw['chr'] == '3') & (dfraw['pos_37'] == 176755883)]
# df[(df['chr'] == '2') & (df['start'] == 242760774)]

Unnamed: 0,chr,start,end,pos_37,driver
700,3,177038095,177038096,176755883,1


In [226]:
# Spot check: look up a single position (chr 20, pos_37 60758100) in combined_df.
combined_df[(combined_df['chr'] == '20') & (combined_df['pos_37'] == 60758100)]

Unnamed: 0,chr,start,end,pos_37,driver,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
612,20,62183044,62183044,60758100,1,0.011898,0.009185,0.016204,0.01169


In [40]:
# Summed and normalized interaction / chain counts over all retained files.
combined_df = add_interactions(metadata, all_files, sum_ctcf, sum_polr2a)
# Remove rows that repeat the same position, label and feature values
# ('start' and 'end' are not part of the comparison).
combined_df.drop_duplicates(subset=['chr', 'pos_37', 'driver', 'CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains'], inplace=True)
combined_df

Unnamed: 0,chr,start,end,pos_37,driver,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
0,1,43981392,43981392,44447064,0,1.301264e-02,9.945691e-03,2.827225e-02,1.903515e-02
1,1,193142068,193142068,193111198,1,4.731307e-02,3.595848e-02,3.163439e-02,2.096407e-02
2,1,239913497,239913497,240076797,0,4.034180e-02,2.997513e-02,1.273936e-02,9.145570e-03
3,1,189107161,189107161,189076292,0,5.857861e-03,4.473829e-03,3.840686e-07,2.560458e-07
4,1,212035724,212035724,212209066,1,8.985486e-02,6.714920e-02,6.975467e-02,4.664053e-02
...,...,...,...,...,...,...,...,...,...
1043,X,54527681,54527681,54554114,0,2.826022e-03,1.681075e-03,7.670491e-04,4.919279e-04
1044,X,75622499,75622499,74842334,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1045,X,50192749,50192749,49957400,0,3.708701e-03,2.369070e-03,6.401144e-08,6.401144e-08
1046,X,73034552,73034552,72254391,0,1.021294e-07,5.106469e-08,0.000000e+00,0.000000e+00


In [44]:
# Replace the original 'pos_37' of every feature row by the adjusted position
# stored in merged1['new_pos37'], and bring in ref / alt / data_source.
# NOTE(review): merged1 must already carry a 'new_pos37' column; the only
# rename that would create it is the commented-out line below - confirm where
# that rename actually happens.
# NOTE(review): this cell overwrites combined_df, so it can only be run once
# per fresh combined_df.
# combined_df[['chr', 'start', 'end', 'pos_37', 'driver']] = dfraw[['chr', 'start', 'end', 'pos_37', 'driver']]#[(combined_df['chr'] == '20') & (combined_df['pos_37'] == 60758100)]
# merged1.rename({'start': 'new_pos37'}, inplace=True, axis=1)
# Inner join on (chr, pos_37); columns present on both sides ('driver', 'end')
# come back with _x (combined_df) and _y (merged1) suffixes.
combined_df = combined_df.merge(merged1, left_on=['chr', 'pos_37'], right_on=['chr', 'pos_37'])
# Drop the old join key and merged1's duplicate columns ...
combined_df.drop(['pos_37', 'driver_y', 'end_y'], inplace=True, axis=1)
# ... then restore the plain names; 'new_pos37' becomes the new 'pos_37'.
combined_df.rename({'end_x': 'end', 'driver_x': 'driver', 'new_pos37': 'pos_37'}, axis=1,inplace=True)

In [45]:
# Display combined_df after the position update.
combined_df

Unnamed: 0,chr,start,end,driver,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains,pos_37,ref,alt,data_source
0,1,43981392,43981392,0,1.301264e-02,9.945691e-03,2.827225e-02,1.903515e-02,44447064,G,T,COSMIC
1,1,193142068,193142068,1,4.731307e-02,3.595848e-02,3.163439e-02,2.096407e-02,193111198,T,C,ICGC
2,1,239913497,239913497,0,4.034180e-02,2.997513e-02,1.273936e-02,9.145570e-03,240076797,C,T,COSMIC
3,1,189107161,189107161,0,5.857861e-03,4.473829e-03,3.840686e-07,2.560458e-07,189076292,T,G,COSMIC
4,1,212035724,212035724,1,8.985486e-02,6.714920e-02,6.975467e-02,4.664053e-02,212209066,G,C,TCGA
...,...,...,...,...,...,...,...,...,...,...,...,...
1079,X,54527681,54527681,0,2.826022e-03,1.681075e-03,7.670491e-04,4.919279e-04,54554114,A,T,COSMIC
1080,X,75622499,75622499,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,74842334,A,C,COSMIC
1081,X,50192749,50192749,0,3.708701e-03,2.369070e-03,6.401144e-08,6.401144e-08,49957400,C,T,COSMIC
1082,X,73034552,73034552,0,1.021294e-07,5.106469e-08,0.000000e+00,0.000000e+00,72254391,C,T,COSMIC


In [246]:
# For every indel in df (ref and alt of different length), move the matching
# combined_df row from the indel's 'end' coordinate to its 'start' coordinate
# so that the later join on df['start'] finds it.
df.reset_index(drop=True,inplace=True)
combined_df.reset_index(drop=True,inplace=True)
for index, row in df.iterrows():
    if len(row['ref']) != len(row['alt']):
        # Debug marker for two specific mutations.
        if (row['id'] == 'mut39') or (row['id'] == 'mut41'):
            print("CAUGHT")
        # print(combined_df.loc[(combined_df['chr'] == row['chr']) & (combined_df['pos_37'] == row['end'])])
        # Rows are matched on chromosome and on pos_37 == end; nothing is
        # changed when no combined_df row matches.
        combined_df.loc[(combined_df['chr'] == row['chr']) & (combined_df['pos_37'] == row['end']), 'pos_37'] = row['start']
        # Log every indel that was processed.
        print(row['id'], row['chr'], row['start'], row['end'])
        


mut0 3 101578254 101578255
mut1 3 101578285 101578286
mut2 3 101578250 101578251
mut3 17 48940015 48940016
mut4 17 48939986 48939987
mut5 17 48939977 48939978
mut6 17 48940335 48940334
mut16 14 38060229 38060230
mut17 14 38060017 38060018
mut18 14 38059225 38059226
mut19 14 38059425 38059424
mut20 14 38060342 38060341
mut21 14 38060441 38060442
mut22 14 38059231 38059230
mut23 14 38060496 38060497
mut38 20 60758099 60758100
mut40 8 97274001 97274000
mut53 14 20811285 20811284
mut59 9 35657809 35657808
mut60 9 35657926 35657927
mut78 4 775698 775699
mut79 4 775701 775702
mut83 9 35658032 35658031
mut107 16 68846034 68846035
mut110 10 8111432 8111433
mut118 17 11984670 11984671
mut130 3 176755883 176755884
mut131 3 176755882 176755883
mut136 4 74280750 74280751
mut145 10 89692754 89692755
mut168 11 64575359 64575360
mut183 13 49047486 49047487
mut206 18 48586227 48586228
mut220 3 10191461 10191462
mut221 3 10191463 10191464
mut231 19 45284141 45284142
mut238 11 108190668 108190669
mut252

In [46]:
# Sanity check with a right join: feature rows of combined_df that have no
# matching mutation in df show up with a missing 'start'.
tmpdfr = df.merge(combined_df[['chr', 'pos_37', 'driver', 'CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']], left_on=['chr', 'start', 'driver'], right_on=['chr', 'pos_37', 'driver'], how='right')
tmpdfr[tmpdfr['start'].isna()]

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,pos_37,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains


In [48]:
# Left join: keep every mutation of df and attach the features where the
# (chr, start, driver) key matches; unmatched mutations get NaN features.
tmpdfl = df.merge(combined_df[['chr', 'pos_37', 'driver', 'CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']], left_on=['chr', 'start', 'driver'], right_on=['chr', 'pos_37', 'driver'], how='left')
tmpdfl#[tmpdfl['pos_37'].isna()]

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,pos_37,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
0,mut0,3,101578254,101578255,CAGTT,C,1,ICGC,101578254.0,0.038831,0.028762,3.049831e-02,1.940539e-02
1,mut1,3,101578285,101578286,GACCATTTGCCTT,G,1,ICGC,101578285.0,0.038625,0.028612,3.093142e-02,1.968601e-02
2,mut2,3,101578250,101578251,CTG,C,1,ICGC,101578250.0,0.038831,0.028762,3.050286e-02,1.940820e-02
3,mut3,17,48940015,48940016,CTAAAT,C,1,ICGC,48940015.0,0.044100,0.034074,3.473907e-02,2.470957e-02
4,mut4,17,48939986,48939987,AT,A,1,ICGC,48939986.0,0.043593,0.033675,3.473907e-02,2.470970e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1169,mut1081,4,95496939,95496939,C,T,0,COSMIC,95496939.0,0.016053,0.011297,2.572620e-03,1.701488e-03
1170,mut1082,6,53026396,53026396,C,T,0,COSMIC,53026396.0,0.035686,0.025806,1.069791e-02,6.956507e-03
1171,mut1083,2,220077160,220077160,G,T,0,COSMIC,220077160.0,0.049747,0.035302,2.426974e-02,1.519382e-02
1172,mut1084,16,31471209,31471209,C,T,0,COSMIC,31471209.0,0.025523,0.019007,2.272476e-02,1.528875e-02


In [50]:
# Persist the joined table (without 'data_source' and 'pos_37').
# index=False is not passed, so the row index is written as an extra first column.
tmpdfl[['id', 'chr', 'start', 'end', 'ref', 'alt', 'driver', 'CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']].to_csv('../data/interactions_result.csv')

In [51]:
# List the mutations that received no interaction features from the left join.
# tmpdf['id'].value_counts()
tmpdfl[tmpdfl['CTCF_chains'].isna()]

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,pos_37,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
680,mut592,2,242760774,242760774,G,A,0,COSMIC,,,,,
796,mut708,1,142858610,142858610,T,C,0,COSMIC,,,,,


In [82]:
# Inspect the rows for mutation 'mut41'.
# NOTE(review): `tmpdf` is not assigned in any cell shown here (only `tmpdfl`
# and `tmpdfr` are) - this relies on leftover kernel state; confirm which
# frame was meant.
tmpdf[tmpdf['id'] == 'mut41']

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,pos_37,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
41,mut41,8,97274000,97274000,T,C,1,ICGC,97274000.0,0.02274,0.017067,0.01838,0.011893
42,mut41,8,97274000,97274000,T,C,1,ICGC,97274000.0,0.02274,0.017067,0.01838,0.011893


In [10]:
# Load and display the previously saved 'final_interactions_result.csv'.
pd.read_csv('final_interactions_result.csv')

Unnamed: 0,chr,start,end,pos_37,driver,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
0,1,43981392,43981392,44447064,0,509653,389533,441675,297371
1,1,193142068,193142068,193111198,1,1549463,1169254,173754,107885
2,1,239913497,239913497,240076797,0,460811,328873,83415,59100
3,1,189107161,189107161,189076292,0,87071,70547,6,4
4,1,212035724,212035724,212209066,1,912001,670970,408752,278048
...,...,...,...,...,...,...,...,...,...
1043,X,54527681,54527681,54554114,0,3,3,66,39
1044,X,75622499,75622499,74842334,0,0,0,0,0
1045,X,50192749,50192749,49957400,0,2693,1882,1,1
1046,X,73034552,73034552,72254391,0,4,2,0,0


In [16]:
def read_ICGC_TCGA_data(
    icgc_path='../data/TableS3_panorama_driver_mutations_ICGC_samples.controlled.tsv',
    tcga_path='D:/Users/26092539/OneDrive - ARÇELİK A.Ş/Downloads/TCGA_data.csv',
):
    """
    This function reads data provided by PCAWG for driver mutations and filters it to retain only non-coding driver mutations.
    Data source: https://dcc.icgc.org/releases/PCAWG/driver_mutations

    Parameters
    ----------
    icgc_path : str
        Tab-separated ICGC driver-mutation table.
    tcga_path : str
        Comma-separated TCGA driver-mutation table. This file is not publicly
        available and hence is not provided in this repository; the default
        is the original author's local path, so pass your own location.

    Both files must provide the columns 'chr', 'pos', 'ref', 'alt' and
    'category'; any other column is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per unique (chr, pos, ref, alt) with the columns 'chr',
        'start' (int64, taken from 'pos'), 'ref', 'alt', 'data_source'
        ('ICGC' or 'TCGA') and 'driver' (always 1).  When the same mutation is
        present in both files the ICGC row is the one that is kept.
    """
    icgc = pd.read_csv(icgc_path, sep='\t')
    icgc = icgc[icgc['category'] == 'noncoding'].copy()    # filter for noncoding mutations
    icgc['data_source'] = 'ICGC'

    tcga = pd.read_csv(tcga_path)
    tcga = tcga[tcga['category'] == 'noncoding'].copy()
    tcga['data_source'] = 'TCGA'

    # Stray 'Unnamed: N' columns of the source files need no explicit drop:
    # only the five columns selected below survive, and dropping them by name
    # would fail on any file that does not contain them.
    df = pd.concat([icgc, tcga]).reset_index(drop=True)
    df = df.drop_duplicates(subset=['chr', 'pos', 'ref', 'alt']).reset_index(drop=True)
    df = df[['chr', 'pos', 'ref', 'alt', 'data_source']].rename(columns={'pos': 'start'})
    df['start'] = df['start'].astype('int64')
    df['driver'] = 1
    return df

def read_COSMIC_data(path='../data/negative_samples.vcf'):
    """
    This function reads non-coding mutations data provided by COSMIC
    Data source: https://cancer.sanger.ac.uk/cosmic/download
    File name: Cosmic_NonCodingVariants_Vcf_v98_GRCh37.tar
    This file contains a huge amount of non-coding mutations.
    Using BCFTools in a WSL-2 environment, 599 mutations were randomly selected and saved as a VCF file called negative_samples.vcf.
    These mutations were first confirmed to not overlap with our positive set, and then made to be used as our negative set.

    Parameters
    ----------
    path : str
        Tab-separated file without a header line and with exactly four
        columns, read as chr, start, ref, alt (in that order).

    Returns
    -------
    pandas.DataFrame
        Columns 'chr', 'start' (int64), 'ref', 'alt', 'data_source' (always
        'COSMIC') and 'driver' (always 0).
    """
    df = pd.read_csv(path, sep='\t', header=None)
    # Assigning the names afterwards keeps the hard failure when the file does
    # not have exactly four columns.
    df.columns = ['chr', 'start', 'ref', 'alt']
    df['data_source'] = 'COSMIC'
    df = df[['chr', 'start', 'ref', 'alt', 'data_source']].copy()
    df['start'] = df['start'].astype('int64')
    df['driver'] = 0
    return df

def _fetch_reference_base(chrom, start, end, timeout=30):
    """Return the hg19 sequence the UCSC REST API reports for chrom:start-end.

    ``start`` / ``end`` are passed to the API unchanged (this API is 0 based,
    while the dataset is 1 based, so callers shift the coordinates).  Raises
    ``requests.HTTPError`` for an unsuccessful response instead of failing
    later on a missing 'dna' key, and gives up after ``timeout`` seconds
    rather than waiting indefinitely.
    NOTE(review): the returned string is used as-is; confirm whether the API
    can return lower-case (soft-masked) bases for this genome.
    """
    # Local import so the function does not depend on a notebook cell having
    # imported `requests` beforehand.
    import requests

    url = """https://api.genome.ucsc.edu/getData/sequence?genome=hg19;chrom={};start={};end={}""".format(chrom, start, end)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()['dna']


def calculate_end_coordinates(df):
    """
    This function calls the UCSC Genome Browser API to extract the end coordinate based on insertions or deletions

    Indels are expected with '-' as the empty allele.  They are rewritten with
    a leading reference base fetched from the API:

    * neither allele is '-'  -> ``end = start``; nothing else changes.
    * ``ref == '-'`` (insertion) -> the base at ``start`` is fetched;
      ``ref`` becomes that base, ``alt`` becomes base + alt,
      ``end = start`` and ``start = start + 1``.
    * ``alt == '-'`` (deletion) -> the base at ``start - 1`` is fetched;
      ``ref`` becomes base + ref, ``alt`` becomes that base,
      ``start = start - 1`` and ``end`` is the original ``start``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'chr', 'start', 'ref', 'alt' and unique index
        labels.  The frame is modified IN PLACE (an 'end' column is added and
        'start' / 'ref' / 'alt' are rewritten for indels).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    requests.RequestException
        When a lookup times out or the API answers with an error status.
    """
    print("Calling UCSC Genome Browser API to extract end coordinates. This may take some time...")
    df['end'] = 0
    for index, row in df.iterrows():
        start_pos = row['start']
        ref = row['ref']
        alt = row['alt']
        if ref != '-' and alt != '-':
            df.at[index, 'end'] = start_pos
        elif ref == '-':                 # insertion
            # 1-based position start_pos == 0-based interval [start_pos - 1, start_pos)
            anchor = _fetch_reference_base(row['chr'], start_pos - 1, start_pos)
            df.at[index, 'end'] = start_pos
            df.at[index, 'start'] = start_pos + 1
            df.at[index, 'ref'] = anchor
            df.at[index, 'alt'] = anchor + alt
        elif alt == '-':                 # deletion
            # base immediately before the deleted sequence
            anchor = _fetch_reference_base(row['chr'], start_pos - 1 - 1, start_pos - 1)
            df.at[index, 'start'] = start_pos - 1
            df.at[index, 'end'] = start_pos
            df.at[index, 'ref'] = anchor + ref
            df.at[index, 'alt'] = anchor
    print("Extraction complete!")
    return df