In [1]:
# Read the gene list file line by line, trimming whitespace from each entry
with open('../data/processed_data/primekg_gene_list.txt', 'r') as f:
    primekg_gene_list = [line.strip() for line in f]

In [2]:
import requests
import pandas as pd
from tqdm import tqdm
import io
# Batch lookup: resolve the first 100 gene names with one UniProt search query
gene_list = primekg_gene_list[:100]

# Taxonomy ID used to restrict the search to human entries
taxonomy_id = "9606"

# OR the per-gene clauses together, then AND the organism filter onto the group
query = " OR ".join([f"gene:{gene_name}" for gene_name in gene_list])
query = f"({query}) AND taxonomy_id:{taxonomy_id}"

# Build the UniProt search URL. `size=500` asks for the largest page the search
# endpoint serves; without it only the default first page of hits is returned.
uniprot_url = f"https://rest.uniprot.org/uniprotkb/search?query={query}&fields=accession,gene_names,sequence&format=tsv&size=500"

# Bind `results` before any request so the DataFrame below can always be built,
# even when the request fails (it used to be defined only on the success path).
results = []
wanted_genes = set(gene_list)  # constant-time membership tests while scanning synonyms

# Read every result page: the next page's URL is advertised in the response's
# `Link: <...>; rel="next"` header, which requests exposes as `response.links`.
page_url = uniprot_url
while page_url:
    # Bounded wait so a stalled connection cannot hang the cell indefinitely
    response = requests.get(page_url, timeout=60)

    if response.status_code != 200:
        print(f"Failed to retrieve data, status code: {response.status_code}")
        break

    data = response.text
    if data.strip():
        # Parse the TSV payload into a DataFrame
        df = pd.read_csv(io.StringIO(data), sep='\t')

        for index, row in df.iterrows():
            gene_names = row['Gene Names']
            if not isinstance(gene_names, str):
                continue  # entry without gene names: the cell is NaN, not a string
            # One entry may list several whitespace-separated gene names;
            # keep only those that were actually asked for.
            for gene_name in gene_names.split():
                if gene_name in wanted_genes:
                    results.append([gene_name, row['Entry'], row['Sequence']])

    page_url = response.links.get("next", {}).get("url")

# Collect the matches into a DataFrame
results_df = pd.DataFrame(results, columns=["Gene Name", "UniProt ID", "Sequence"])

# Print or save the result
print(results_df)
# results_df.to_csv("uniprot_results.csv", index=False)  # save as a CSV file

   Gene Name UniProt ID                                           Sequence
0     ZNF578     Q96N58  MLHEEAAQKRKGKEPGMALPQGRLTFRDVAIEFSLAEWKFLNPAQR...
1     TRIM48     Q8IWZ4  MSRRIIVGTLQRTQRNMNSGISQVFQRELTCPICMNYFIDPVTIDC...
2      LCE2A     Q5TA79  MSCQQNQQQCQPPPKCPPKCPPKCPPKCRPQCPAPCPPPVSSCCGP...
3      KDM4E     B2RXH2  MKSVHSSPQNTSHTIMTFYPTMEEFADFNTYVAYMESQGAHQAGLA...
4      H2BS1     P57053  MPEPAKSAPAPKKGSKKAVTKAQKKDGRKRKRSRKESYSVYVYKVL...
5     B3GNT6     Q6ZMB0  MAFPCRRSLTAKTLACLLVGVSFLALQQWFLQAPRSPREERSPQEE...
6     GPR101     Q96P66  MTSTCTNSTRESNSSHTCMPLSKMPISLAHGIIRSTVLVIFLAASF...
7     CAPN11     Q9UMQ6  MLYSPGPSLPESAESLDGSQEDKPRGSCAEPTFTDTGMVAHINNSR...
8      CD207     Q9UJ71  MTVEKEAPDAHFTVDKQNISLWPREPPPKSGPSLVPGKTPTVRAAL...
9      HABP4     Q5JVS0  MKGALGSPVAAAGAAMQESFGCVVANRFHQLLDDESDPFDILREAE...
10  SERPINA6     P08185  MPLLLYTCLLWLPTSGLWTVQAMDPNAAYVNMSNHHRGLASANVDF...
11    SCNN1B     P51168  MHVKKYLLKGLHRLQKGPGYTYKELLVWYCDNTNTHGPKRIICEGP...
12    ELOVL3     Q9HB03  

In [3]:
# Dump the raw body of the `response` object left by the previous request cell
print(response.text)

Entry	Gene Names	Sequence
Q96N58	ZNF578	MLHEEAAQKRKGKEPGMALPQGRLTFRDVAIEFSLAEWKFLNPAQRALYREVMLENYRNLEAVDISSKRMMKEVLSTGQGNTEVIHTGMLQRHESYHTGDFCFQEIEKDIHDFEFQSQKDERNGHEASMPKIKELMGSTDRHDQRHAGNKPIKDQLGLSFHLHLPELHIFQPEEKIANQVEKSVNDASSISTSQRISCRPETHTPNNYGNNFFHSSLLTQKQEVHMREKSFQCNETGEAFNCSSFVRKHQIIHLGEKQYKFDICGKVFNEKRYLARHRRCHTSEKPYKCNECGKSFSYKSSLTCHRRCHTGEKPYKCNECGKSFSYKSSLTCHHRCHTGEKPYKCNECGKSFSYKSSLRCHRRLHTGIKPYKCNECGKMFGQNSTLVIHKAIHTGEKPYKCNECGKAFNQQSHLSRHHRLHTGEKPYKCNDCGKAFIHQSSLARHHRLHTGEKSYKCEECDRVFSQKSNLERHKIIHTGEKPYKCNECHKTFSHRSSLPCHRRLHSGEKPYKCNECGKTFNVQSHLSRHHRLHTGEKPYKCKVCDKAFMCHSYLANHTRIHSGEKPYKCNECGKAHNHLIDSSIKPCMSS
Q8IWZ4	TRIM48 RNF101	MSRRIIVGTLQRTQRNMNSGISQVFQRELTCPICMNYFIDPVTIDCGHSFCRPCFYLNWQDIPILTQCFECIKTIQQRNLKTNIRLKKMASLARKASLWLFLSSEEQMCGIHRETKKMFCEVDRSLLCLLCSSSQEHRYHRHCPAEWAAEEHWEKLLKKMQSLWEKACENQRNLNVETTRISHWKAFGDILYRSESVLLHMPQPLNLALRAGPITGLRDRLNQF
Q5TA79	LCE2A LEP9	MSCQQNQQQCQPPPKCPPKCPPKCPPKCRPQCPAPCPPPVSSCCGPSSGGCCGSSSGGCCSSGGGGCCLSHHRPRLFHRHRHQSPDCCECEPSGGSGCCHSSGDC

In [4]:
import requests
import pandas as pd
from tqdm import tqdm
import io
from concurrent.futures import ThreadPoolExecutor

# Run against the complete gene list
gene_list = primekg_gene_list

# Taxonomy ID for human, used as the organism filter in each query
taxonomy_id = "9606"

# Accumulator for the rows gathered from all requests
results = []

# Worker function: look up a single gene name in UniProt
def fetch_data(gene_name):
    """Query UniProtKB for one gene name and list every gene name on each hit.

    Args:
        gene_name: Gene name placed in the ``gene:`` clause of the search
            query, combined with the module-level ``taxonomy_id`` filter.

    Returns:
        A list of ``[gene, accession, sequence]`` rows, one for each
        whitespace-separated token in a hit's "Gene Names" column. The list is
        empty when the server answers with a non-200 status, when the body is
        empty, or when no hit carries a gene name.

    Raises:
        requests.RequestException: On connection errors or when the request
            exceeds the timeout.
    """
    # Build the UniProt API URL
    uniprot_url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene_name}+AND+taxonomy_id:{taxonomy_id}&fields=accession,gene_names,sequence&format=tsv"

    # Bounded wait so one stalled connection cannot block a worker forever
    response = requests.get(uniprot_url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to retrieve data for gene: {gene_name}, status code: {response.status_code}")
        return []

    data = response.text
    if not data.strip():
        return []  # pd.read_csv raises EmptyDataError on an empty body

    # Parse the TSV payload into a DataFrame
    df = pd.read_csv(io.StringIO(data), sep='\t')

    local_results = []
    for index, row in df.iterrows():
        gene_names = row['Gene Names']
        if not isinstance(gene_names, str):
            continue  # hit without gene names: the cell is NaN, not a string
        # Emit one row per gene name listed on the entry
        for gene in gene_names.split():
            local_results.append([gene, row['Entry'], row['Sequence']])
    return local_results

# Fan the per-gene lookups out over a thread pool.
# `as_completed` is imported here because this cell's import block only pulls
# in ThreadPoolExecutor; the former `concurrent.futures.as_completed(...)` call
# raised NameError since the name `concurrent` was never bound.
from concurrent.futures import as_completed

with ThreadPoolExecutor(max_workers=8) as executor:
    # Progress bar advanced once per finished request
    with tqdm(total=len(gene_list), desc="Fetching data from UniProt") as pbar:
        # Map each future back to its gene so failures can be attributed
        future_to_gene = {executor.submit(fetch_data, gene_name): gene_name for gene_name in gene_list}
        for future in as_completed(future_to_gene):
            gene_name = future_to_gene[future]
            try:
                result = future.result()
                results.extend(result)
            except Exception as exc:
                # Report the failing gene and carry on with the others
                print(f"{gene_name} generated an exception: {exc}")
            pbar.update(1)

# Collect the gathered rows into a DataFrame
results_df = pd.DataFrame(results, columns=["Gene Name", "UniProt ID", "Sequence"])

# Print and save the result
print(results_df)
results_df.to_csv("../data/processed_data/uniprot_results.csv", index=False)  # save as a CSV file

Fetching data from UniProt:   0%|          | 0/27671 [00:00<?, ?it/s]


In [1]:
import requests
import pandas as pd
from tqdm import tqdm
import io
from concurrent.futures import ThreadPoolExecutor

# Load the gene list: one entry per line, surrounding whitespace removed
with open('../data/processed_data/primekg_gene_list.txt', 'r') as f:
    primekg_gene_list = list(map(str.strip, f))

# 假设你的基因名称列表如下
import requests
import pandas as pd
from tqdm import tqdm
import io
# Query only the first 100 genes in this sequential run
gene_list = primekg_gene_list[:100]

# Taxonomy ID for human, used as the organism filter in each query
taxonomy_id = "9606"

# Accumulator for [gene name, UniProt ID, sequence] rows
results = []

# Query the UniProt search endpoint once per gene
for gene_name in tqdm(gene_list, desc="Fetching data from UniProt"):
    # Build the UniProt API URL
    uniprot_url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene_name}+AND+taxonomy_id:{taxonomy_id}&fields=accession,sequence&format=tsv"

    # The timeout keeps a stalled connection from hanging the loop; a network
    # error is reported and the gene skipped so rows gathered so far are kept.
    try:
        response = requests.get(uniprot_url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve data for gene: {gene_name}, error: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve data for gene: {gene_name}, status code: {response.status_code}")
        continue

    data = response.text
    if not data.strip():
        continue  # pd.read_csv raises EmptyDataError on an empty body

    # Parse the TSV payload into a DataFrame
    df = pd.read_csv(io.StringIO(data), sep='\t')

    # Store one row per hit, labelled with the gene that was queried
    for index, row in df.iterrows():
        results.append([gene_name, row['Entry'], row['Sequence']])

# Collect the gathered rows into a DataFrame
results_df = pd.DataFrame(results, columns=["Gene Name", "UniProt ID", "Sequence"])

print(results_df)
results_df.to_csv("../data/processed_data/uniprot_results.csv", index=False)  # save as a CSV file

Fetching data from UniProt: 100%|██████████| 100/100 [02:28<00:00,  1.49s/it]

    Gene Name UniProt ID                                           Sequence
0      PHYKPL     Q8IUZ5  MAADQRPKADTLALRQRLISSSCRLFFPEDPVKIVRAQGQYMYDEQ...
1      PHYKPL     D6RAR0  MAADQRPKADTLALRQRLISSSCRLFFPEDPVKIVRAQGQYMYDEQ...
2      PHYKPL     D6RD89  MAADQRPKADTLALRQRLIRHAALWSPHGTGQPPFLRIRETDADTC...
3      PHYKPL     H0Y9N3  QAAHEQNQVLNTNSRYLHDNIVDYAQRLSETLPEQLCVFYFLNSGS...
4      PHYKPL     H0YAK5                       IAAFFAESLPSVGGQIIPPAGYFSQVAE
..        ...        ...                                                ...
320    INO80B     C9JKY0  MSKLWRRGSTSGAMEAPEPGEALELSLAGAHGHGVHKKKHKKHKKK...
321    INO80B     B8ZZ93  MSKLWRRGSTSGAMEAPEPGEALELSLAGAHGHGVHKKKHKKHKKK...
322    INO80B     H7C171  KKHHQEEDAGPTQPSPAKPQLKLKIKLGGQVLGTKSVPTFTVIPEG...
323    INO80B     F8WCL7  MSKLWRRGSTSGAMEAPEPGEALELSLAGAHGHGVHKKKHKKHKKK...
324    INO80B     J3KQ70  MSKLWRRGSTSGAMEAPEPGEALELSLAGAHGHGVHKKKHKKHKKK...

[325 rows x 3 columns]





In [7]:
import requests
import pandas as pd
from tqdm import tqdm
import io
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load the gene list file, one entry per line with surrounding whitespace removed
with open('../data/processed_data/primekg_gene_list.txt', 'r') as f:
    primekg_gene_list = [line.strip() for line in f]

# Query the complete list in this run
gene_list = primekg_gene_list

# Taxonomy ID for human, used as the organism filter in each query
taxonomy_id = "9606"

# Accumulator for the rows gathered from all requests
results = []

def fetch_data(gene_name, min_length=30):
    """Fetch the UniProtKB hits for one gene and keep sufficiently long sequences.

    Args:
        gene_name: Gene name placed in the ``gene:`` clause of the search
            query, combined with the module-level ``taxonomy_id`` filter.
        min_length: Minimum sequence length (inclusive) for a hit to be kept.

    Returns:
        A list of ``[gene_name, accession, sequence]`` rows. When the request
        fails, the body is empty, or no hit passes the length filter, the list
        holds a single placeholder row ``[gene_name, "", ""]`` so that every
        queried gene is represented in the output.
    """
    uniprot_url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene_name}+AND+taxonomy_id:{taxonomy_id}&fields=accession,sequence&format=tsv"
    placeholder = [[gene_name, "", ""]]

    try:
        # Bounded wait so one stalled connection cannot block a worker forever
        response = requests.get(uniprot_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to retrieve data for gene: {gene_name}, error: {e}")
        return placeholder

    data = response.text
    if not data.strip():
        return placeholder

    df = pd.read_csv(io.StringIO(data), sep='\t')
    result_list = []
    for accession, sequence in zip(df['Entry'], df['Sequence']):
        # A missing sequence is parsed as NaN (a float); calling len() on it
        # would raise TypeError, so accept only real strings.
        if isinstance(sequence, str) and len(sequence) >= min_length:
            result_list.append([gene_name, accession, sequence])

    return result_list or placeholder

def save_results(results, output_path="../data/processed_data/uniprot_results.csv"):
    """Write the collected rows to a CSV file.

    Args:
        results: Iterable of ``[gene name, UniProt ID, sequence]`` rows.
        output_path: Destination CSV path; defaults to the location this
            notebook has always written to.
    """
    results_df = pd.DataFrame(results, columns=["Gene Name", "UniProt ID", "Sequence"])
    # index=False keeps the positional index out of the file
    results_df.to_csv(output_path, index=False)
    print("Results saved to CSV.")

# Run the lookups concurrently and gather rows as each request finishes.
# NOTE(review): 100 simultaneous connections is aggressive for a public API;
# consider lowering max_workers if the server starts rejecting requests.
with ThreadPoolExecutor(max_workers=100) as executor:
    # Map each future back to its gene so a failure can be attributed
    futures = {executor.submit(fetch_data, gene_name): gene_name for gene_name in gene_list}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching data from UniProt"):
        gene_name = futures[future]
        try:
            results.extend(future.result())
        except Exception as e:
            # Handle the failure per gene: previously the first exception left
            # the loop and every result still pending was thrown away.
            print(f"{gene_name} generated an exception: {e}")
            results.append([gene_name, "", ""])

# Save once, after all requests have been processed
save_results(results)


Fetching data from UniProt: 100%|██████████| 27671/27671 [39:00<00:00, 11.82it/s]  


Results saved to CSV.


In [3]:
import requests
import pandas as pd
from tqdm import tqdm
import io

# Load the gene list file, one entry per line with surrounding whitespace removed
with open('../data/processed_data/primekg_gene_list.txt', 'r') as f:
    primekg_gene_list = list(map(str.strip, f))

# Query only the first 10 genes in this run
gene_list = primekg_gene_list[:10]

# Taxonomy ID for human, used as the organism filter in each query
taxonomy_id = "9606"

# Accumulator for the rows gathered from all requests
results = []

def save_results(results, output_path="../data/processed_data/uniprot_results.csv"):
    """Write the collected rows to a CSV file.

    Args:
        results: Iterable of ``[gene name, UniProt ID, sequence]`` rows.
        output_path: Destination CSV path; defaults to the location this
            notebook has always written to.
    """
    results_df = pd.DataFrame(results, columns=["Gene Name", "UniProt ID", "Sequence"])
    # index=False keeps the positional index out of the file
    results_df.to_csv(output_path, index=False)
    print("Results saved to CSV.")

# Query the UniProt search endpoint once per gene. Every gene contributes at
# least one row: genes without usable hits get an empty placeholder row.
for gene_name in tqdm(gene_list, desc="Fetching data from UniProt"):
    # Build the UniProt API URL
    uniprot_url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene_name}+AND+taxonomy_id:{taxonomy_id}&fields=accession,sequence&format=tsv"

    # Handle network errors per gene: the former try/except wrapped the whole
    # loop, so one failed request dropped every gene that came after it.
    try:
        response = requests.get(uniprot_url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        results.append([gene_name, "", ""])
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve data for gene: {gene_name}, status code: {response.status_code}")
        results.append([gene_name, "", ""])
        continue

    data = response.text
    if not data.strip():
        # Nothing came back at all: record an empty row
        results.append([gene_name, "", ""])
        continue

    # Parse the TSV payload into a DataFrame
    df = pd.read_csv(io.StringIO(data), sep='\t')
    if df.empty:
        # A header line with no data rows also means "no hits"; without this
        # branch such genes were silently left out of the results.
        results.append([gene_name, "", ""])
        continue

    # Store one row per hit, labelled with the gene that was queried
    for index, row in df.iterrows():
        sequence = row['Sequence']
        results.append([gene_name, row['Entry'], sequence])

# Save once, after all requests have been processed
save_results(results)

Fetching data from UniProt: 100%|██████████| 10/10 [00:14<00:00,  1.42s/it]

Results saved to CSV.





In [11]:
# Post-processing: keep the longest sequence per gene as its representative;
# sequences shorter than 30 are meant to be discarded
data_path = '../data/processed_data/uniprot_results_all.csv'
data = pd.read_csv(data_path)
data

Unnamed: 0,Gene Name,UniProt ID,Sequence
0,SERPINA6,P08185,MPLLLYTCLLWLPTSGLWTVQAMDPNAAYVNMSNHHRGLASANVDF...
1,SERPINA6,G3V4V7,MPLLLYTCLLWLPTSGLWTVQAMDPNAAYVNMSNHHRGLASANVDF...
2,SERPINA6,G3V350,MPLLLYTCLLWLPTSGLWTVQAMDPNAAYVNMSNHHRGLASANVDF...
3,RNU6-652P,,
4,MIR941-5,,
...,...,...,...
102261,PLXND1,H0YA64,LQPEQLDCGAAHLQHPLSILQPLKATPVFRAPGLTSVAVASVNNYT...
102262,PLXND1,H0YAM9,XSLHPGSLLKDLDTEKYFHLVLPTDELAEPKKSHRQSHRKKVLPEI...
102263,PLXND1,H0YAB2,XEEGISLFSSLLNNKHFLIVFVHALEQQKDFAVRDRCSLASLLTIA...
102264,PLXND1,Q6P657,MAEIYKYAKRYRPQIMAALEANPTARRTQLQHKFEQVVALMEDNIY...


In [13]:


# Pick one representative row per gene: the one with the longest sequence
def select_longest_sequence(df, min_length=30):
    """Return one row per gene, preferring the row with the longest sequence.

    For each distinct ``'Gene Name'`` the row whose ``'Sequence'`` is longest
    is kept, provided that length is at least ``min_length``. Otherwise (every
    sequence is shorter or missing) the gene's first row is kept unchanged, so
    the gene still appears in the output. Ties go to the earliest row.

    Args:
        df: DataFrame with ``'Gene Name'`` and ``'Sequence'`` columns.
        min_length: Smallest sequence length accepted as a representative.
            The bound is inclusive: the earlier ``> 30`` check passed over
            sequences of exactly 30 although only shorter ones are meant to
            be rejected.

    Returns:
        A new DataFrame with the same columns as ``df``, one row per gene,
        ordered by gene name, with a fresh 0..n-1 index. Rows whose gene name
        is missing are not included (groupby drops NaN keys by default).
    """
    # Missing sequences count as length 0 so they never win and never
    # propagate NaN into the comparisons below.
    seq_lengths = df['Sequence'].str.len().fillna(0)

    chosen_labels = []
    for _, gene_lengths in seq_lengths.groupby(df['Gene Name']):
        if gene_lengths.max() >= min_length:
            chosen_labels.append(gene_lengths.idxmax())
        else:
            chosen_labels.append(gene_lengths.index[0])

    # Select whole rows by label instead of rebuilding them via groupby.apply
    return df.loc[chosen_labels].reset_index(drop=True)

# Reduce the loaded table to one representative row per gene
filtered_data = select_longest_sequence(data)

In [14]:
# Display the per-gene selection for a visual check
filtered_data

Unnamed: 0,Gene Name,UniProt ID,Sequence
0,A1BG,P04217,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...
1,A1BG-AS1,,
2,A1CF,F8W9F8,MEAVCLGTCPEPEASMSTAIPGLKKGNNALQSIILQTLLEKENGQR...
3,A2M,P01023,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...
4,A2M-AS1,,
...,...,...,...
27666,ZYG11A,Q6WRX3,MVHFLHPGHTPRNIVPPDAQKDALGCCVVQEEASPYTLVNICLNVL...
27667,ZYG11B,Q9C0D3,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...
27668,ZYX,Q9BUS0,PSPPGWRLLRTGTQSLRTRRRGGHPRRGAHAPACARPGHGGPPPVS...
27669,ZZEF1,O43149,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...


In [15]:
# Write the filtered table to the training-data directory, without the index column
filtered_data.to_csv('../data/train_data/uniprot_results_filtered.csv', index=False)