In [112]:
import pandas as pd

In [113]:
# Load the file downloaded from Uniprot (https://www.uniprot.org/uploadlists/).
df = pd.read_csv('../data/bioplex2ensemblt.tab', sep='\t')
df.dropna(inplace=True)
df.columns = ['uniprot', 'entry', 'id', 'enst']    # Rename columns
df.head()

Unnamed: 0,uniprot,entry,id,enst
0,P20700,P20700,4001;,ENST00000261366;
1,P28715-2,P28715,2073;,ENST00000535557 [P28715-3];ENST00000652225 [P2...
2,O60266,O60266,109;,ENST00000260600 [O60266-1];
3,P23470-2,P23470,5793;,ENST00000295874 [P23470-2];ENST00000474889 [P2...
4,Q05823,Q05823,6041;,ENST00000367559 [Q05823-1];ENST00000539397 [Q0...


In [114]:
def clean(x):               # Clean the format of ensembl IDs and gene IDs
    x = x.split(';')[0]
    x = x.split('[')[0].strip()
    return x

In [115]:
df['enst'] = df['enst'].apply(lambda x: clean(x))
df['id'] = df['id'].apply(lambda x: clean(x))
df.head()

Unnamed: 0,uniprot,entry,id,enst
0,P20700,P20700,4001,ENST00000261366
1,P28715-2,P28715,2073,ENST00000535557
2,O60266,O60266,109,ENST00000260600
3,P23470-2,P23470,5793,ENST00000295874
4,Q05823,Q05823,6041,ENST00000367559


In [116]:
def get_unique(df, col):        # Extract unique gene IDs or uniprot IDs from BioPlex data
    colA = col+'A'
    colB = col+'B'

    s1 = set(df[colA].to_list())
    s2 = set(df[colB].to_list())
    return s1.union(s2)

In [117]:
bioplex = pd.read_csv('../data/BioPlex3.tsv', sep='\t')
uniprots = get_unique(bioplex, 'Uniprot')
ids = get_unique(bioplex, 'Gene')
# symbols = get_unique(bioplex, 'Symbol')

drop_l = list()

# Pick common (wrt uniprot and gene IDs) rows from Bioplex and downloaded file from uniprot.
for i in df.index:           
    temp_id = ids
    temp_uniprot = uniprots
    # temp_symbol = symbols

    l1 = len(temp_id)
    gene_id = int(df['id'][i])
    temp_id.add(gene_id)
    ll1 = len(temp_id)
    
    l2 = len(temp_uniprot)
    gene = df['uniprot'][i]
    temp_uniprot.add(gene)
    ll2 = len(temp_uniprot)

    # if gene == "Q00887":
        # print(gene_id)

    # l3 = len(temp_symbol)
    # gene = df['gene_symbol'][i]
    # temp_symbol.add(gene)
    # ll3 = len(temp_symbol)
    
    if l1==ll1 and l2==ll2:
        continue
    else:
        # print(gene)
        
        drop_l.append(i)

print(df.shape, len(genes))
df = df.drop(drop_l) 
df.shape

(13177, 4) 13689


(13099, 4)

In [118]:
gene2id = dict()        # gene name to gene ID

for i in bioplex.index:
    gene1 = bioplex['SymbolA'][i]
    gene2 = bioplex['SymbolB'][i]
    id1 = bioplex['GeneA'][i]
    id2 = bioplex['GeneB'][i]

    gene2id[gene1] = id1
    gene2id[gene2] = id2

In [119]:
l = df['enst'].unique().tolist()

# write all the unique ensembl transcript IDs into a txt file
with open('../data/enst.txt', 'w') as f:    
    for val in l:
        f.write(f'{val}\n')

In [120]:
# Upload the generated txt file and get the ENST -> ENSG,ENSP mappings from biomart

In [121]:
mapping = pd.read_csv('../data/transcript2gp.csv')
mapping.columns = ['enst', 'ensg', 'ensp']
mapping.head()

Unnamed: 0,enst,ensg,ensp
0,ENST00000361390,ENSG00000198888,ENSP00000354687
1,ENST00000361453,ENSG00000198763,ENSP00000355046
2,ENST00000361624,ENSG00000198804,ENSP00000354499
3,ENST00000361739,ENSG00000198712,ENSP00000354876
4,ENST00000361851,ENSG00000228253,ENSP00000355265


In [123]:
gene2enst = dict()      # gene ID to transcript ensembl ID

for i in df.index:
    gene_id = df['id'][i]
    enst = df['enst'][i]
    gene2enst[gene_id] = enst
len(gene2enst)

13089

In [124]:
enst2ensgp = dict()     # transcript ensembl ID to gene, protein ensembl IDs

for i in mapping.index:
    enst = mapping['enst'][i]
    ensg = mapping['ensg'][i]
    ensp = mapping['ensp'][i]
    enst2ensgp[enst] = (ensg, ensp)
len(enst2ensgp)

13067

In [125]:
df = pd.read_csv('../data/bioplex_locs.csv')    # Load processed BioPlex data
print(df.shape)
df.head()

(73664, 10)


Unnamed: 0,gene1_name,gene1,reliability1,location1,gene2_name,gene2,reliability2,location2,combined_score,locations
0,ADA,ENSG00000196839,Supported,"{'Plasma membrane', 'Cytosol'}",POTEF,,,,0.999882,"['Plasma membrane', 'Cytosol']"
1,BEND7,ENSG00000165626,Approved,"{'Nucleoli fibrillar center', 'Vesicles', 'Nuc...",RBBP4,ENSG00000162521,Enhanced,{'Nucleoplasm'},0.999936,['Nucleoplasm']
2,BEND7,ENSG00000165626,Approved,"{'Nucleoli fibrillar center', 'Vesicles', 'Nuc...",RPL36,,,,0.871817,"['Nucleoli fibrillar center', 'Vesicles', 'Nuc..."
3,BEND7,ENSG00000165626,Approved,"{'Nucleoli fibrillar center', 'Vesicles', 'Nuc...",LRP4,,,,0.999669,"['Nucleoli fibrillar center', 'Vesicles', 'Nuc..."
4,BEND7,ENSG00000165626,Approved,"{'Nucleoli fibrillar center', 'Vesicles', 'Nuc...",KPNA1,ENSG00000114030,Enhanced,"{'Cytosol', 'Nucleoplasm'}",0.999971,['Nucleoplasm']


In [133]:
def do_map(x, mode):    # maps gene name to gene, protein ensembl IDs
    try:
        gp = enst2ensgp[gene2enst[str(gene2id[x])]]
    except:
        return None
    if mode == 'p':
        return gp[1]
    return gp[0]

In [134]:
df['protein1'] = df['gene1_name'].apply(lambda x: do_map(x, 'p'))
df['protein2'] = df['gene2_name'].apply(lambda x: do_map(x, 'p'))

df['gene1'] = df['gene1_name'].apply(lambda x: do_map(x, 'g'))
df['gene2'] = df['gene2_name'].apply(lambda x: do_map(x, 'g'))

df.head(2)

Unnamed: 0,gene1_name,gene1,reliability1,location1,gene2_name,gene2,reliability2,location2,combined_score,locations,protein1,protein2
0,ADA,ENSG00000196839,Supported,"{'Plasma membrane', 'Cytosol'}",POTEF,ENSG00000196604,,,0.999882,"['Plasma membrane', 'Cytosol']",ENSP00000361965,ENSP00000386786
1,BEND7,ENSG00000165626,Approved,"{'Nucleoli fibrillar center', 'Vesicles', 'Nuc...",RBBP4,ENSG00000162521,Enhanced,{'Nucleoplasm'},0.999936,['Nucleoplasm'],ENSP00000345773,ENSP00000362584


In [135]:
# Extract appropriate columns
cols = ['protein1', 'gene1', 'reliability1', 'location1', 'protein2', 'gene2', 'reliability2', 'location2', 'combined_score', 'locations']
df = df[cols]
df.head(2)

Unnamed: 0,protein1,gene1,reliability1,location1,protein2,gene2,reliability2,location2,combined_score,locations
0,ENSP00000361965,ENSG00000196839,Supported,"{'Plasma membrane', 'Cytosol'}",ENSP00000386786,ENSG00000196604,,,0.999882,"['Plasma membrane', 'Cytosol']"
1,ENSP00000345773,ENSG00000165626,Approved,"{'Nucleoli fibrillar center', 'Vesicles', 'Nuc...",ENSP00000362584,ENSG00000162521,Enhanced,{'Nucleoplasm'},0.999936,['Nucleoplasm']


In [137]:
df.to_csv('../data/bioplex_locs2.csv', index=None)