In this notebook, we map the Ensembl gene IDs of the Human Protein Atlas RNA expression (consensus) data to UniProt IDs, and those UniProt IDs to PDB IDs, using the ID mapping service at https://www.uniprot.org/.

## Load packages 

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import numpy as np

## Load Original Expression Consensus data 

In [21]:
#%%
# Parse the HPA RNA consensus TSV by hand into a DataFrame with the columns
# Gene, Gene_name, Tissue and NX. Because the file is split manually, every
# column (including NX) is kept as a string.
ENTRY_LENGTH = 4  # stride, in lines, between the rows that are kept

# The context manager closes the file even if parsing fails.
with open("HPA_RNA_CONSENSUS.tsv", 'r') as raw_data:
    temp = [line.strip().split("\t") for line in raw_data]

# Keep rows 1, 5, 9, ... of the file. Row 0 (presumably the header line) is
# never selected, and a row is only taken while a complete group of
# ENTRY_LENGTH lines is available.
# NOTE(review): this keeps only every 4th line of the file, so most
# (gene, tissue) rows are dropped - confirm that this subsampling is intended.
expression = [
    temp[i - 3]
    for i in range(ENTRY_LENGTH, len(temp) + 1, ENTRY_LENGTH)
]

# make dataframe
df = pd.DataFrame(expression, columns=['Gene', 'Gene_name', 'Tissue', 'NX'])
df


Unnamed: 0,Gene,Gene_name,Tissue,NX
0,ENSG00000000003,TSPAN6,adipose tissue,27.0
1,ENSG00000000003,TSPAN6,B-cells,0.3
2,ENSG00000000003,TSPAN6,cerebellum,2.9
3,ENSG00000000003,TSPAN6,corpus callosum,12.6
4,ENSG00000000003,TSPAN6,endometrium,11.1
...,...,...,...,...
295111,ENSG00000285509,AP000646.1,ovary,0.8
295112,ENSG00000285509,AP000646.1,prostate,0.6
295113,ENSG00000285509,AP000646.1,skeletal muscle,0.0
295114,ENSG00000285509,AP000646.1,spleen,0.0


## Write a .txt file containing all gene IDs, to be uploaded to UniProt to obtain the corresponding UniProt IDs

In [4]:
# Unique gene IDs, in order of first appearance in the expression table.
Genes = df["Gene"].drop_duplicates()

# Write one gene ID per line. The context manager guarantees the file is
# flushed and closed even if a write fails part-way through.
with open("ExpressionGeneNames.txt", "w") as textfile:
    textfile.writelines(element + "\n" for element in Genes)


In [5]:
# Display the de-duplicated gene IDs (one entry per distinct value of df["Gene"]).
Genes

0         ENSG00000000003
16        ENSG00000000005
31        ENSG00000000419
46        ENSG00000000457
61        ENSG00000000460
               ...       
295061    ENSG00000285471
295072    ENSG00000285472
295083    ENSG00000285480
295094    ENSG00000285508
295105    ENSG00000285509
Name: Gene, Length: 19670, dtype: object

## Load Uniprot ID file obtained from Uniprot

In [6]:
# Read the tab-separated gene -> UniProt mapping file: column 0 is the gene ID
# field, column 1 the UniProt ID.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on the original author's machine - consider a path relative to the project.
mapping_path = "/Users/priya/Documenten/GitHub/BioinformaticsMajorProject/Data/Raw_Data/GeneToUniprotExpression.txt"

ExpressionUniprot = []
Expression_GeneID = []
# The context manager closes the file even if a malformed line raises.
with open(mapping_path, "r") as f:
    # Skip the first line (presumably the header); the default keeps an empty
    # file from raising StopIteration.
    next(f, None)
    for x in f:
        if not x.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        fields = x.split('\t')  # split once, reuse for both columns
        Expression_GeneID.append(fields[0].strip())
        ExpressionUniprot.append(fields[1].strip())

# Two parallel columns, one row per line of the mapping file.
df_Expression = pd.DataFrame({"Gene_ID":Expression_GeneID,"Uniprot_ID":ExpressionUniprot})


## Build a dictionary that maps gene IDs (keys) to their UniProt IDs (values)

In [7]:
# Lookup table: gene ID -> list of every UniProt ID mapped to that gene.
# A gene field may hold several comma-joined gene IDs (the cell that writes
# ExpressionUniprot.txt splits Gene_ID on ','), so each ID gets its own key;
# otherwise those genes would never match a row of df and silently get NaN.
new_dict = {}
for gene_field, uniprot_id in zip(Expression_GeneID, ExpressionUniprot):
    for gene_id in gene_field.split(','):
        new_dict.setdefault(gene_id.strip(), []).append(uniprot_id)

# One entry per row of df: the list of UniProt IDs for that row's gene, or NaN
# when the gene has no mapping. Iterating over the column values avoids slow
# per-row label lookups and does not depend on df having a 0..n-1 index.
uniprot_per_row = [new_dict.get(gene, np.nan) for gene in df["Gene"]]

# Reuse df's index so the column-wise concat lines the rows up one-to-one.
df_uni = pd.DataFrame({"Uniprot_ID": uniprot_per_row}, index=df.index)
df_Complete = pd.concat([df, df_uni], axis=1, join='inner')

In [19]:
# Expand each list of UniProt IDs into one row per (expression row, UniProt ID)
# pair, then save the result together with its row index.
output_csv = 'ExpressionGenesAndUniprot.csv'
test = df_Complete.explode(column='Uniprot_ID')
test.to_csv(output_csv)

In [17]:
# Display the exploded table (one row per expression row and UniProt ID).
test

Unnamed: 0,Gene,Gene_name,Tissue,NX,Uniprot_ID
0,ENSG00000000003,TSPAN6,adipose tissue,27.0,O43657
1,ENSG00000000003,TSPAN6,B-cells,0.3,O43657
2,ENSG00000000003,TSPAN6,cerebellum,2.9,O43657
3,ENSG00000000003,TSPAN6,corpus callosum,12.6,O43657
4,ENSG00000000003,TSPAN6,endometrium,11.1,O43657
...,...,...,...,...,...
295111,ENSG00000285509,AP000646.1,ovary,0.8,
295112,ENSG00000285509,AP000646.1,prostate,0.6,
295113,ENSG00000285509,AP000646.1,skeletal muscle,0.0,
295114,ENSG00000285509,AP000646.1,spleen,0.0,


## Write a .txt file containing all UniProt IDs. This file will be uploaded to UniProt to obtain the corresponding PDB IDs

In [8]:
# Split comma-joined entries of the Gene_ID column into one row each; the
# Uniprot_ID travels along as the index, so it is repeated on every new row.
by_uniprot = df_Expression.set_index(["Uniprot_ID"])
split_rows = by_uniprot.apply(lambda column: column.str.split(',').explode())
new_df = split_rows.reset_index()

# Move Gene_ID back in front of Uniprot_ID.
new_df = new_df.set_index(["Gene_ID"]).reset_index()

# Only the Uniprot_ID column is written, without the row index.
# NOTE(review): the IDs are not de-duplicated and to_csv writes a header line
# by default - confirm that this is what the UniProt upload expects.
new_df["Uniprot_ID"].to_csv(r'ExpressionUniprot.txt', index=False)


## Load the file obtained from UniProt that contains the UniProt IDs and their corresponding PDB IDs

In [9]:
# Read the tab-separated UniProt -> PDB mapping file: column 0 is the UniProt
# ID, column 1 the PDB ID.
Uni=[]
PDB=[]
# The context manager closes the file even if a malformed line raises.
with open("ExpressionUniPDB.txt","r") as f:
    # Skip the first line (presumably the header); the default keeps an empty
    # file from raising StopIteration.
    next(f, None)
    for x in f:
        if not x.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        fields = x.split('\t')  # split once, reuse for both columns
        Uni.append(fields[0].strip())
        PDB.append(fields[1].strip())

# One row per (expression row, UniProt ID). reset_index() is called without
# drop=True, so the previous row index is kept as an extra column.
ex_df = df_Complete.explode('Uniprot_ID').reset_index()

## Build a dictionary whose keys are UniProt IDs and whose values are the corresponding PDB IDs

In [10]:
# Lookup table: UniProt ID -> list of every PDB ID mapped to it.
dic = {}
for uniprot_id, pdb_id in zip(Uni, PDB):
    dic.setdefault(uniprot_id, []).append(pdb_id)

# One entry per row of ex_df: the list of PDB IDs for that row's UniProt ID,
# or [nan] when there is none (this also covers rows whose Uniprot_ID is NaN).
# Iterating over the column values avoids slow per-row label lookups and does
# not depend on ex_df having a 0..n-1 index.
temp = [
    dic[uniprot_id] if uniprot_id in dic else [np.nan]
    for uniprot_id in ex_df["Uniprot_ID"]
]


In [11]:
# Wrap the per-row PDB lists in a one-column frame and append it to ex_df
# column-wise; the inner join keeps only index labels present in both frames.
# Running this cell a second time appends a second PDB column.
df_PDB = pd.DataFrame(data={"PDB": temp})
ex_df = pd.concat(objs=[ex_df, df_PDB], axis="columns", join="inner")


In [16]:
# Expand each list of PDB IDs into one row per PDB ID. reset_index() is called
# without drop=True, so the pre-explode row index is kept as an extra column.
pdb_exploded = ex_df.explode(column='PDB')
ex_df = pdb_exploded.reset_index()

In [18]:
# Save the final gene / UniProt / PDB table without the row index.
# `index` is a boolean option, so pass False explicitly instead of relying on
# None being treated as falsy.
ex_df.to_csv(r'ExpressionComplete.csv', index=False)