In [1]:
import pandas as pd
import numpy as np
import os
import io

In [65]:
def calculate_total_interactions(metadata):
    """This function calculates the total number of interactions present in ChIA-PET data for CTCF and POLR2A separately"""
    ctcf_files = list(metadata[~(metadata['Biosample term name'].str.contains('positive')) & ~(metadata['Biosample term name'].str.contains('activated')) & ~(metadata['Biosample term name'].str.contains('T-cell')) & (metadata['Experiment target'].str.contains('CTCF'))]['File accession'])
    polr_files = list(metadata[~(metadata['Biosample term name'].str.contains('positive')) & ~(metadata['Biosample term name'].str.contains('activated')) & ~(metadata['Biosample term name'].str.contains('T-cell')) & (metadata['Experiment target'].str.contains('POLR2A'))]['File accession'])

    sum_ctcf = 0
    for i in ctcf_files:
        f = pd.read_csv('ChIA-PET data/' + i + '.bedpe', sep = '\t',  header = None)
        sum_ctcf = sum_ctcf + len(f)

    sum_polr2a = 0
    for i in polr_files:
        f = pd.read_csv('ChIA-PET data/' + i + '.bedpe', sep = '\t',  header = None)
        sum_polr2a = sum_polr2a + len(f)
    return sum_ctcf, sum_polr2a

def add_interactions(metadata, all_files, sum_ctcf, sum_polr2a):
    """This function adds together all CTCF and POLR2A interactions and chains (separately)
    and then normalizes them using sum of total interactions"""
    files_to_keep = list(metadata[~(metadata['Biosample term name'].str.contains('positive')) & ~(metadata['Biosample term name'].str.contains('activated')) & ~(metadata['Biosample term name'].str.contains('T-cell'))]['File accession'])

    df = pd.read_csv('Results/processed_'+files_to_keep[0]+'.csv') # using an arbitrary file as starting dataframe
    df = df[['chr', 'start', 'end', 'pos_37', 'driver']]
    df[['CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']] = 0    # making its columns 0

    for file in all_files:
        if file.replace('processed_', '').replace('.csv', '') in files_to_keep:
            f = pd.read_csv('Results/' + file)
            df[['CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']] = df[['CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']].add(f[['CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']])

    df['CTCF_interactions'] = df['CTCF_interactions']/sum_ctcf
    df['CTCF_chains'] = df['CTCF_chains']/sum_ctcf
    df['POLR2A_interactions'] = df['POLR2A_interactions']/sum_polr2a
    df['POLR2A_chains'] = df['POLR2A_chains']/sum_polr2a
    return df

def long_range_interactions_results():
    all_files = os.listdir('Results')
    metadata = pd.read_csv('ChIA-PET data/metadata.tsv', sep='\t')
    df = pd.read_csv('../data/dataset_uncensored.csv')

    sum_ctcf, sum_polr2a = calculate_total_interactions(metadata)
    combined_df = add_interactions(metadata, all_files, sum_ctcf, sum_polr2a)

    df = df.merge(combined_df[['chr', 'pos_37', 'driver', 'CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']], left_on=['chr', 'start', 'driver'], right_on=['chr', 'pos_37', 'driver'], how='right')
    df.drop('pos_37', inplace=True, axis=1)
    return df

In [248]:
all_files = os.listdir('Results')
metadata = pd.read_csv('ChIA-PET data/metadata.tsv', sep='\t')
df = pd.read_csv('../data/dataset_uncensored.csv')

# sum_ctcf, sum_polr2a = calculate_total_interactions(metadata)

In [250]:
dfraw = pd.read_csv('Results/processed_ENCFF986WPD.csv') # using an arbitrary file as starting dataframe
dfraw = dfraw[['chr', 'start', 'end', 'start_hg19', 'driver']]
dfraw.rename({'start_hg19': 'pos_37'}, axis = 1, inplace = True)
dfraw

Unnamed: 0,chr,start,end,pos_37,driver
0,1,35743531,35743531,36209132,1
1,1,1406995,1406995,1342375,0
2,1,222665600,222665600,222838942,0
3,1,193142068,193142068,193111198,1
4,1,16150965,16150965,16477460,1
...,...,...,...,...,...
1080,X,45110078,45110078,44969323,1
1081,X,45107408,45107408,44966653,1
1082,X,128775992,128775992,127909970,0
1083,X,109743024,109743024,108986253,0


In [215]:
dfraw[(dfraw['chr'] == '3') & (dfraw['pos_37'] == 176755883)]
# df[(df['chr'] == '2') & (df['start'] == 242760774)]

Unnamed: 0,chr,start,end,pos_37,driver
700,3,177038095,177038096,176755883,1


In [226]:
combined_df[(combined_df['chr'] == '20') & (combined_df['pos_37'] == 60758100)]

Unnamed: 0,chr,start,end,pos_37,driver,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
612,20,62183044,62183044,60758100,1,0.011898,0.009185,0.016204,0.01169


In [249]:
combined_df = add_interactions(metadata, all_files, sum_ctcf, sum_polr2a)
combined_df.drop_duplicates(subset=['chr', 'pos_37', 'driver', 'CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains'], inplace=True)
combined_df

Unnamed: 0,chr,start,end,pos_37,driver,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
0,1,43981392,43981392,44447064,0,1.301264e-02,9.945691e-03,2.827219e-02,1.903508e-02
1,1,193142068,193142068,193111198,1,4.731307e-02,3.595848e-02,3.163439e-02,2.096407e-02
2,1,239913497,239913497,240076797,0,4.034180e-02,2.997513e-02,1.273936e-02,9.145570e-03
3,1,189107161,189107161,189076292,0,5.857861e-03,4.473829e-03,3.840686e-07,2.560458e-07
4,1,212035724,212035724,212209066,1,8.985486e-02,6.714920e-02,6.975461e-02,4.664046e-02
...,...,...,...,...,...,...,...,...,...
1043,X,54527681,54527681,54554114,0,2.826022e-03,1.681075e-03,7.670491e-04,4.919279e-04
1044,X,75622499,75622499,74842334,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1045,X,50192749,50192749,49957400,0,3.708701e-03,2.369070e-03,6.401144e-08,6.401144e-08
1046,X,73034552,73034552,72254391,0,1.021294e-07,5.106469e-08,0.000000e+00,0.000000e+00


In [191]:
# combined_df[['chr', 'start', 'end', 'pos_37', 'driver']] = dfraw[['chr', 'start', 'end', 'pos_37', 'driver']]#[(combined_df['chr'] == '20') & (combined_df['pos_37'] == 60758100)]
combined_df

Unnamed: 0,chr,start,end,pos_37,driver,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
0,1,43981392,43981392,44447064,0,1.301264e-02,9.945691e-03,2.827219e-02,1.903508e-02
1,1,193142068,193142068,193111198,1,4.731307e-02,3.595848e-02,3.163439e-02,2.096407e-02
2,1,239913497,239913497,240076797,0,4.034180e-02,2.997513e-02,1.273936e-02,9.145570e-03
3,1,189107161,189107161,189076292,0,5.857861e-03,4.473829e-03,3.840686e-07,2.560458e-07
4,1,212035724,212035724,212209066,1,8.985486e-02,6.714920e-02,6.843565e-02,4.579878e-02
...,...,...,...,...,...,...,...,...,...
1043,X,54527681,54527681,54554114,0,2.826022e-03,1.681075e-03,7.670491e-04,4.919279e-04
1044,X,75622499,75622499,74842334,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1045,X,50192749,50192749,49957400,0,3.708701e-03,2.369070e-03,6.401144e-08,6.401144e-08
1046,X,73034552,73034552,72254391,0,1.021294e-07,5.106469e-08,0.000000e+00,0.000000e+00


In [245]:
combined_df

Unnamed: 0,chr,start,end,pos_37,driver,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
0,1,43981392,43981392,44447064,0,1.301264e-02,9.945691e-03,2.827219e-02,1.903508e-02
1,1,193142068,193142068,193111198,1,4.731307e-02,3.595848e-02,3.163439e-02,2.096407e-02
2,1,239913497,239913497,240076797,0,4.034180e-02,2.997513e-02,1.273936e-02,9.145570e-03
3,1,189107161,189107161,189076292,0,5.857861e-03,4.473829e-03,3.840686e-07,2.560458e-07
4,1,212035724,212035724,212209066,1,8.985486e-02,6.714920e-02,6.975461e-02,4.664046e-02
...,...,...,...,...,...,...,...,...,...
1043,X,54527681,54527681,54554114,0,2.826022e-03,1.681075e-03,7.670491e-04,4.919279e-04
1044,X,75622499,75622499,74842334,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1045,X,50192749,50192749,49957400,0,3.708701e-03,2.369070e-03,6.401144e-08,6.401144e-08
1046,X,73034552,73034552,72254391,0,1.021294e-07,5.106469e-08,0.000000e+00,0.000000e+00


In [246]:
df.reset_index(drop=True,inplace=True)
combined_df.reset_index(drop=True,inplace=True)
for index, row in df.iterrows():
    if len(row['ref']) != len(row['alt']):
        if (row['id'] == 'mut39') or (row['id'] == 'mut41'):
            print("CAUGHT")
        # print(combined_df.loc[(combined_df['chr'] == row['chr']) & (combined_df['pos_37'] == row['end'])])
        combined_df.loc[(combined_df['chr'] == row['chr']) & (combined_df['pos_37'] == row['end']), 'pos_37'] = row['start']
        print(row['id'], row['chr'], row['start'], row['end'])
        


mut0 3 101578254 101578255
mut1 3 101578285 101578286
mut2 3 101578250 101578251
mut3 17 48940015 48940016
mut4 17 48939986 48939987
mut5 17 48939977 48939978
mut6 17 48940335 48940334
mut16 14 38060229 38060230
mut17 14 38060017 38060018
mut18 14 38059225 38059226
mut19 14 38059425 38059424
mut20 14 38060342 38060341
mut21 14 38060441 38060442
mut22 14 38059231 38059230
mut23 14 38060496 38060497
mut38 20 60758099 60758100
mut40 8 97274001 97274000
mut53 14 20811285 20811284
mut59 9 35657809 35657808
mut60 9 35657926 35657927
mut78 4 775698 775699
mut79 4 775701 775702
mut83 9 35658032 35658031
mut107 16 68846034 68846035
mut110 10 8111432 8111433
mut118 17 11984670 11984671
mut130 3 176755883 176755884
mut131 3 176755882 176755883
mut136 4 74280750 74280751
mut145 10 89692754 89692755
mut168 11 64575359 64575360
mut183 13 49047486 49047487
mut206 18 48586227 48586228
mut220 3 10191461 10191462
mut221 3 10191463 10191464
mut231 19 45284141 45284142
mut238 11 108190668 108190669
mut252

In [222]:
tmpdfr = df.merge(combined_df[['chr', 'pos_37', 'driver', 'CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']], left_on=['chr', 'start', 'driver'], right_on=['chr', 'pos_37', 'driver'], how='right')
tmpdfr[tmpdfr['start'].isna()]

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,pos_37,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
522,,19,,,,,0,,45976823,0.022005,0.017053,0.018048,0.01248
1047,,Un_KI270742v1,,,,,0,,142858610,0.0,0.0,0.0,0.0


In [247]:
tmpdfl = df.merge(combined_df[['chr', 'pos_37', 'driver', 'CTCF_interactions', 'CTCF_chains', 'POLR2A_interactions', 'POLR2A_chains']], left_on=['chr', 'start', 'driver'], right_on=['chr', 'pos_37', 'driver'], how='left')
tmpdfl[tmpdfl['pos_37'].isna()]

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,pos_37,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
39,mut39,20,60758100,60758100,G,A,1,ICGC,,,,,
41,mut41,8,97274000,97274000,T,C,1,ICGC,,,,,
135,mut130,3,176755883,176755884,TACCTTTAAAGTCATGTCGTCAGAA,T,1,ICGC,,,,,
138,mut132,3,176755884,176755884,A,T,1,ICGC,,,,,
408,mut397,17,11984671,11984671,A,G,1,TCGA,,,,,
478,mut466,17,7579311,7579311,C,A,1,TCGA,,,,,
482,mut470,17,7579311,7579311,C,T,1,TCGA,,,,,
498,mut486,17,7590693,7590693,A,C,1,TCGA,,,,,
604,mut592,2,242760774,242760774,G,A,0,COSMIC,,,,,
720,mut708,1,142858610,142858610,T,C,0,COSMIC,,,,,


In [238]:
len(tmpdfl[tmpdfl['pos_37'].isna()].loc[39]['ref'])

KeyError: 39

In [86]:
# tmpdf['id'].value_counts()
tmpdf#.drop_duplicates()

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,pos_37,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
0,mut709,1,44447064.0,44447064.0,G,T,0,COSMIC,44447064,1.301264e-02,9.945691e-03,2.827225e-02,1.903515e-02
1,mut125,1,193111198.0,193111198.0,T,C,1,ICGC,193111198,4.731307e-02,3.595848e-02,3.163439e-02,2.096407e-02
2,mut1010,1,240076797.0,240076797.0,C,T,0,COSMIC,240076797,4.034180e-02,2.997513e-02,1.273936e-02,9.145570e-03
3,mut589,1,189076292.0,189076292.0,T,G,0,COSMIC,189076292,5.857861e-03,4.473829e-03,3.840686e-07,2.560458e-07
4,mut358,1,212209066.0,212209066.0,G,C,1,TCGA,212209066,8.985486e-02,6.714920e-02,6.843572e-02,4.579884e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1084,mut971,X,54554114.0,54554114.0,A,T,0,COSMIC,54554114,2.826022e-03,1.681075e-03,7.670491e-04,4.919279e-04
1085,mut727,X,74842334.0,74842334.0,A,C,0,COSMIC,74842334,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1086,mut968,X,49957400.0,49957400.0,C,T,0,COSMIC,49957400,3.708701e-03,2.369070e-03,6.401144e-08,6.401144e-08
1087,mut1055,X,72254391.0,72254391.0,C,T,0,COSMIC,72254391,1.021294e-07,5.106469e-08,0.000000e+00,0.000000e+00


In [82]:
tmpdf[tmpdf['id'] == 'mut41']

Unnamed: 0,id,chr,start,end,ref,alt,driver,data_source,pos_37,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
41,mut41,8,97274000,97274000,T,C,1,ICGC,97274000.0,0.02274,0.017067,0.01838,0.011893
42,mut41,8,97274000,97274000,T,C,1,ICGC,97274000.0,0.02274,0.017067,0.01838,0.011893


In [10]:
pd.read_csv('final_interactions_result.csv')

Unnamed: 0,chr,start,end,pos_37,driver,CTCF_interactions,CTCF_chains,POLR2A_interactions,POLR2A_chains
0,1,43981392,43981392,44447064,0,509653,389533,441675,297371
1,1,193142068,193142068,193111198,1,1549463,1169254,173754,107885
2,1,239913497,239913497,240076797,0,460811,328873,83415,59100
3,1,189107161,189107161,189076292,0,87071,70547,6,4
4,1,212035724,212035724,212209066,1,912001,670970,408752,278048
...,...,...,...,...,...,...,...,...,...
1043,X,54527681,54527681,54554114,0,3,3,66,39
1044,X,75622499,75622499,74842334,0,0,0,0,0
1045,X,50192749,50192749,49957400,0,2693,1882,1,1
1046,X,73034552,73034552,72254391,0,4,2,0,0
