In [1]:
'''
This file preprocesses the BioPlex data and returns a file in the format that is compatible with our preprocessed STRING DB file.
'''

'\nThis file preprocesses the BioPlex data and returns a file in the format that is compatible with our preprocessed STRING DB file.\n'

In [2]:
import pandas as pd 
from tqdm import tqdm

In [3]:
# BioPlex 3 interaction table; the file is tab-separated, which is what
# read_table assumes by default.
bioplex = pd.read_table('../data/BioPlex3.tsv')
print(bioplex.shape)
bioplex.head()

(118162, 9)


Unnamed: 0,GeneA,GeneB,UniprotA,UniprotB,SymbolA,SymbolB,pW,pNI,pInt
0,100,728378,P00813,A5A3E0,ADA,POTEF,6.881844e-10,0.000118,0.999882
1,222389,6137,Q8N7W2-2,P26373,BEND7,RPL13,1.34038e-18,0.225664,0.774336
2,222389,5928,Q8N7W2-2,Q09028-3,BEND7,RBBP4,7.221401e-21,6.4e-05,0.999936
3,222389,25873,Q8N7W2-2,Q9Y3U8,BEND7,RPL36,7.058372e-17,0.128183,0.871817
4,222389,6124,Q8N7W2-2,P36578,BEND7,RPL4,1.632313e-22,0.200638,0.799362


In [4]:
# HPA table (comma-separated); the preview shows gene name, gene identifier,
# reliability label and a set-like string of locations for each row.
hpa = pd.read_csv('../data/hpa.csv', sep=',')
print(hpa.shape)
hpa.head()

(11723, 4)


Unnamed: 0,Gene name,Gene,Reliability,Locations
0,TSPAN6,ENSG00000000003,Approved,"{'Nucleoli fibrillar center', 'Cytosol', 'Cell..."
1,C1orf112,ENSG00000000460,Approved,{'Mitochondria'}
2,FGR,ENSG00000000938,Approved,"{'Aggresome', 'Plasma membrane'}"
3,CFH,ENSG00000000971,Approved,{'Vesicles'}
4,GCLC,ENSG00000001084,Approved,"{'Cytosol', 'Nucleoplasm', 'Nucleoli'}"


In [5]:
# Largest tolerated p(Wrong) (BioPlex column `pW`) for an interaction to be kept.
MAX_P_WRONG = 0.05

# Keep rows whose p(Wrong) does not exceed the threshold. The mask is written as
# "not greater than" so that a row with a missing pW is retained rather than
# silently dropped (a NaN comparison is always False).
keep = ~(bioplex['pW'] > MAX_P_WRONG)

# Select the gene *symbols* (SymbolA/SymbolB), not GeneA/GeneB. In the loaded
# table GeneA/GeneB hold numeric gene IDs (e.g. 100, 728378) while the HPA lookup
# performed later is keyed on gene names such as 'TSPAN6'; numeric IDs can never
# match those keys, so every interaction would end up without HPA data.
# .copy() gives an independent frame so later column assignments are unambiguous.
bioplex = bioplex.loc[keep, ['SymbolA', 'SymbolB', 'pInt']].copy()

bioplex.columns = ['gene1_name', 'gene2_name', 'combined_score']
bioplex.shape

100%|██████████| 118162/118162 [00:02<00:00, 48006.41it/s]


(114205, 3)

In [6]:
# Map each HPA gene name to its (gene id, reliability, locations) triple.
# Rows are visited in table order, so if a gene name occurs more than once the
# last occurrence is the one that stays in the mapping.
gene2data = {
    name: (ensembl_id, rel, locs)
    for name, ensembl_id, rel, locs in tqdm(
        zip(hpa['Gene name'], hpa['Gene'], hpa['Reliability'], hpa['Locations']),
        total=len(hpa),
    )
}

len(gene2data)

100%|██████████| 11723/11723 [00:00<00:00, 12349.75it/s]


11716

In [7]:
def get_set(x):
    """Convert a string of the form '{location1, location2}' into a set of names.

    Parameters
    ----------
    x : str
        Text whose first and last characters are delimiters (normally braces)
        around a comma-separated list. Single or double quotes around the
        individual names are removed.

    Returns
    -------
    set of str
        The individual names with surrounding whitespace stripped. An empty
        collection, written either as '{}' or as Python's 'set()', yields an
        empty set.
    """
    text = x.strip()
    # Python prints an empty set as 'set()'; slicing off the first and last
    # character would otherwise leave the bogus name 'et('.
    if text == 'set()':
        return set()
    text = text[1:-1].replace('"', '').replace("'", "")
    # Skip empty tokens so that '{}' (or a stray trailing comma) does not
    # produce a set containing the empty string.
    stripped = (part.strip() for part in text.split(','))
    return {name for name in stripped if name}
    

In [8]:
def _hpa_lookup(gene):
    """Return ``(gene_id, reliability, location_set)`` for `gene` from `gene2data`.

    ``(None, None, None)`` is returned when `gene` is not a key of `gene2data`,
    or when the stored locations value is not a string (for example a missing
    value) and therefore cannot be parsed by `get_set`. Any other error is left
    to propagate instead of being swallowed.
    """
    entry = gene2data.get(gene)
    if entry is None:
        return None, None, None
    gene_id, reliability, raw_locations = entry
    if not isinstance(raw_locations, str):
        return None, None, None
    return gene_id, reliability, get_set(raw_locations)


def _interaction_locations(l1, l2):
    """Combine the location sets of the two genes of one interaction.

    Both known  -> their intersection.
    One known   -> that gene's locations on their own.
    None known  -> an empty frozenset.
    """
    if l1 is not None and l2 is not None:
        return frozenset(l1.intersection(l2))
    if l1 is not None:
        return frozenset(l1)
    if l2 is not None:
        return frozenset(l2)
    return frozenset()


# New columns, in the order in which they are appended to `bioplex`.
annotations = {
    name: []
    for name in ('gene1', 'gene2', 'reliability1', 'reliability2',
                 'location1', 'location2', 'locations')
}

# Extract information from HPA data using gene names from Bioplex data.
gene_pairs = zip(bioplex['gene1_name'], bioplex['gene2_name'])
for gene1, gene2 in tqdm(gene_pairs, total=len(bioplex)):
    id1, r1, l1 = _hpa_lookup(gene1)
    id2, r2, l2 = _hpa_lookup(gene2)

    record = (id1, id2, r1, r2, l1, l2, _interaction_locations(l1, l2))
    for column, value in zip(annotations.values(), record):
        column.append(value)

for name, values in annotations.items():
    bioplex[name] = values

100%|██████████| 114205/114205 [00:04<00:00, 23485.96it/s]


In [63]:
# bioplex.to_csv('../data/bio_test.csv', index=None)

In [64]:
# Collect the index labels of interactions whose combined location set is empty
# and remove those rows; they carry no location information.
drop_i = [label for label, locs in bioplex['locations'].items() if len(locs) == 0]

bioplex = bioplex.drop(index=drop_i)
bioplex.shape

(73664, 10)

In [65]:
# Store each interaction's locations as an alphabetically sorted list. Iterating
# a frozenset of strings has no stable order from one interpreter run to the
# next (string hashing is randomised), so a plain list(...) would make the
# written CSV differ between otherwise identical runs.
bioplex['locations'] = bioplex['locations'].apply(lambda locs: sorted(locs))

# Final column order: gene 1 fields, gene 2 fields, score, shared locations.
cols = ['gene1_name', 'gene1', 'reliability1', 'location1', 'gene2_name', 'gene2', 'reliability2', 'location2', 'combined_score', 'locations']
bioplex = bioplex[cols]

In [66]:
# Write the annotated interactions without the DataFrame index column.
bioplex.to_csv('../data/bioplex_locs.csv', index=False)