In [1]:
import pandas as pd
import pickle

In [2]:
hpa = pd.read_csv('../data/hpa.csv')                # Load the processed Human Protein Atlas data
string = pd.read_csv('../data/string_clean.csv')    # Load the processed STRING data
with open('../data/prot2gene.pickle', 'rb') as f:   # Load the protein to gene mapping
    prot2gene = pickle.load(f)

In [3]:
# stores a mapping from gene to a list of its subcellular locations, along with their reliability
gene2loc = dict()   

for i in hpa.index:
    gene = hpa['Gene'][i]
    reliability = hpa['Reliability'][i]
    loc = hpa['Locations'][i]
    gene2loc[gene] = (loc, reliability)

In [4]:
# Converts a string of template '{location1, location2}' to a set of locations.
def get_set(x):                 
    x = x[1:-1]
    return set(x.split(','))

# list to store locations to be added to the STRING data
locs = list()

# lists to store gene ids to be added to the STRING data
gene1 = list()                  
gene2 = list()

# lists to store reliability of locations to be added to the STRING data
reliability1 = list()
reliability2 = list()

# lists to store locations to be added to the STRING data
loc1 = list()
loc2 = list()

for i in string.index:

    # Get protein ids
    p1 = string['protein1'][i]           
    p2 = string['protein2'][i]      

    # Get gene from protein and (location, reliability) from gene    
    try:
        g1 = prot2gene[p1]
        l1, r1 = gene2loc[g1]    
        l1 = get_set(l1)
    except:
        g1 = None
        l1 = None
        r1 = None
    try:
        g2 = prot2gene[p2]
        l2, r2 = gene2loc[g2]
        l2 = get_set(l2)
    except:
        g2 = None
        l2 = None
        r2 = None

    # Add Gene ids that need to be added to STRING data
    gene1.append(g1)
    gene2.append(g2)

    # Add reliability of locations that need to be added to STRING data
    reliability1.append(r1)
    reliability2.append(r2)

    # Add Locations that need to be added to STRING data
    loc1.append(l1)
    loc2.append(l2)

    # If both proteins match with gene in HPA add intersection of their locations.
    # If exactly one of the protein matches, use its location to annotate the other protein.
    # Otherwise add an empty set.
    if (l1 != None) and (l2 != None):
        locs.append(frozenset(l1.intersection(l2)))     
    elif l1 != None:
        locs.append(frozenset(l1))                      
    elif l2 != None:
        locs.append(frozenset(l2))
    else:                                               
        locs.append(frozenset({}))

In [5]:
# Add the above computed lists to the STRING dataframe
string['locations'] = locs
string['gene1'] = gene1
string['gene2'] = gene2
string['reliability1'] = reliability1
string['reliability2'] = reliability2
string['location1'] = loc1
string['location2'] = loc2

# Drop the rows that have empty locations
drop_i = list()
for i in string.index:
    if len(string['locations'][i]) == 0:
        drop_i.append(i)
print(len(drop_i))


2752910


In [6]:
string = string.drop(drop_i)
string.to_csv('../data/string_locs.csv', index=None)

In [1]:
import pandas as pd

#formatting the location column in STRING data
def f1(l):
    l = l.split('{')[-1]
    l = l.split('}')[0]
    l = l.split(',')
    l = [x[2:-2].strip() for x in l]
    l1 = list()
    l1.append(l[0])
    for i in l[1:]:
        l1.append(i[1:])
    return l1

def f2(l):
    try:
        l = l[1:-1]
        l = l.replace('"','')
        l = l.replace("'", "")
        return l.split(',')
    except:
        return None

string = pd.read_csv('../data/string_locs.csv')
string['locations'] = string['locations'].apply(lambda x: f1(x))
string['location1'] = string['location1'].apply(lambda x: f2(x))
string['location2'] = string['location2'].apply(lambda x: f2(x))
string.head()

Unnamed: 0,protein1,protein2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score,locations,gene1,gene2,reliability1,reliability2,location1,location2
0,ENSP00000000233,ENSP00000253401,0,0,0,0,186,0,56,198,[Cytosol],,ENSG00000131089,,Supported,,[Cytosol]
1,ENSP00000000233,ENSP00000418915,0,0,0,61,158,0,542,606,[Nucleoli],,ENSG00000147889,,Supported,,[Nucleoli]
2,ENSP00000000233,ENSP00000327801,0,0,0,88,78,0,89,167,[Endoplasmic reticulum],,ENSG00000185624,,Supported,,[Endoplasmic reticulum]
3,ENSP00000000233,ENSP00000466298,0,0,0,141,131,0,98,267,[Golgi apparatus],,ENSG00000104915,,Approved,,[Golgi apparatus]
4,ENSP00000000233,ENSP00000232564,0,0,0,62,171,0,56,201,[Plasma membrane],,ENSG00000114450,,Supported,,[Plasma membrane]


In [2]:
# Rearrange the columns 
cols = string.columns.values.tolist()
p1_cols = [cols[0]] + ['gene1', 'reliability1', 'location1']
p2_cols = [cols[1]] + ['gene2', 'reliability2', 'location2']
rem_cols = cols[2:-6]
new_cols = p1_cols + p2_cols + rem_cols

string = string.reindex(columns=new_cols) 
string.to_csv('../data/string_locs2.csv', index=None)

In [3]:
import pandas as pd

string = pd.read_csv('../data/string_locs2.csv')


In [4]:
def f(l):
    try:
        l = l.replace('"','')
        return l.replace("'", "")
    except:
        return None
string['locations'] = string['locations'].apply(lambda x: f(x))
string['location1'] = string['location1'].apply(lambda x: f(x))
string['location2'] = string['location2'].apply(lambda x: f(x))
string.to_csv('../data/string_locs2.csv', index=None)

In [5]:
string.shape

(3126817, 17)