In [None]:
pip install biopython




In [None]:
import csv
import os
import time
import xml.etree.ElementTree as ET

import pandas as pd
from Bio import Entrez
from tqdm import tqdm

In [None]:
# Credentials are read from the environment so real values never have to be
# saved in the notebook; the original placeholders are kept as fallbacks so
# the cell behaves as before when the variables are not set.
api_key = os.environ.get("NCBI_API_KEY", "xxxxx")
Entrez.email = os.environ.get("NCBI_EMAIL", "xxx")

# Provide the API key to Entrez
Entrez.api_key = api_key

In [None]:
excel_file = "All genes.xlsx"

# Read every sheet into a {sheet_name: DataFrame} mapping; the context manager
# closes the workbook as soon as the sheets are loaded.
with pd.ExcelFile(excel_file) as xls:
    sheets_dict = pd.read_excel(xls, sheet_name=None)

# Initialize a list to hold all the gene IDs from all sheets
gene_ids = []

# Extract GeneIDs from each sheet and append to the list.
# Empty cells (NaN) are dropped so they are never sent to NCBI as an ID.
for sheet_name, sheet_data in sheets_dict.items():
    if 'GeneID' in sheet_data.columns:
        gene_ids.extend(sheet_data['GeneID'].dropna().tolist())

# Remove duplicate gene IDs while keeping first-seen order, so repeated runs
# fetch the IDs in the same sequence (set() ordering is arbitrary).
gene_ids = list(dict.fromkeys(gene_ids))

In [None]:
# Number of gene IDs collected from the workbook after de-duplication
len(gene_ids)

6380

In [None]:
import time
from tqdm import tqdm
import xml.etree.ElementTree as ET
from Bio import Entrez

# Function to fetch gene information with exponential backoff and retries
def fetch_gene_info_with_retry(gene_ids, max_retries=5, base_delay=1):
    """Fetch a record from the NCBI ``gene`` database, retrying on failure.

    Args:
        gene_ids: Identifier forwarded to ``Entrez.efetch`` as ``id``. The
            plural parameter name is kept for backward compatibility.
        max_retries: Maximum number of fetch attempts.
        base_delay: Seconds to wait after the first failed attempt; the wait
            doubles after every further failure.

    Returns:
        ``(xml_data, None)`` on success, or ``(None, error_message)`` when
        every attempt failed.
    """
    # Use the argument itself rather than whatever global `gene_id` exists.
    gene_id = gene_ids
    retry_delay = base_delay
    for attempt in range(max_retries):
        try:
            handle = Entrez.efetch(db="gene", id=gene_id, rettype="xml")
            try:
                xml_data = handle.read()
            finally:
                # Release the connection even when read() raises.
                handle.close()
            return xml_data, None  # Return xml_data and no error
        except Exception as e:
            if attempt == max_retries - 1:
                # Last attempt: no point sleeping before giving up.
                print(f"Error occurred for gene ID {gene_id}: {e}. No retries left.")
                break
            print(f"Error occurred for gene ID {gene_id}: {e}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
            retry_delay *= 2  # Exponential backoff
    return None, f"Failed to fetch gene information for gene ID {gene_id} after {max_retries} retries."

# Initialize the list to store gene data
gene_data = []


def _child_text(parent, path, default=""):
    """Return the text of the first element under `parent` matching `path`.

    Returns `default` when no element matches. An element that exists but has
    no text yields None, which the caller treats as a missing attribute.
    """
    node = parent.find(path)
    return default if node is None else node.text


# Wrapping the iterable lets tqdm advance on every iteration, including the
# ones that bail out early with `continue`.
progress_bar = tqdm(gene_ids, desc="Fetching Gene Information")

# Fetch information for each geneID from NCBI
for gene_id in progress_bar:
    xml_data, error = fetch_gene_info_with_retry(gene_id)  # Fetch gene info with retries

    if error:
        print("Error occurred for gene ID {}: {}".format(gene_id, error))
        continue  # Skip to the next gene ID if an error occurs

    # Parse XML data; a malformed payload should not abort the whole run
    try:
        root = ET.fromstring(xml_data)
    except ET.ParseError as parse_error:
        print(f"Could not parse XML for gene ID {gene_id}: {parse_error}. Skipping.")
        continue

    # Navigate the XML to extract needed information
    for gene in root.findall('.//Entrezgene'):
        # Keep the ID reported by NCBI separate from the requested `gene_id`
        # so the loop variable is not overwritten.
        fetched_id = _child_text(gene, './/Gene-track_geneid', default=None)

        status_node = gene.find('.//Gene-track_status')
        status = status_node.attrib.get('value', '') if status_node is not None else ""

        gene_name = _child_text(gene, './/Gene-ref_locus')
        description = _child_text(gene, './/Gene-ref_desc')
        summary = _child_text(gene, './/Entrezgene_summary')

        # Check if any of the attributes is None, and if so, skip this gene
        if None in [fetched_id, status, gene_name, description, summary]:
            print(f"One or more attributes missing for gene ID {gene_id}. Skipping.")
            continue

        # <Gene-ref_syn> is only a container; the synonym strings live in its
        # <Gene-ref_syn_E> children (the container's own text is whitespace).
        synonym_container = gene.find('.//Gene-ref_syn')
        if synonym_container is None:
            synonyms = []
        else:
            synonyms = [
                syn.text
                for syn in synonym_container.findall('Gene-ref_syn_E')
                if syn.text
            ]
        synonym_text = ', '.join(synonyms)

        gene_data.append({
            "Gene ID": fetched_id,
            "Status": status,
            "Gene Name": gene_name,
            "Description": description,
            "Summary": summary,
            "Synonyms": synonym_text
        })

    time.sleep(0.1)  # Sleep for 0.1 second (10 requests per second)

# Close tqdm progress bar
progress_bar.close()

# Convert the list of dictionaries into a DataFrame
import pandas as pd
df_gene_info = pd.DataFrame(gene_data)
print(df_gene_info)


Fetching Gene Information:   1%|▏         | 80/6380 [00:40<53:50,  1.95it/s]

Error occurred for gene ID CDZ96283: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:   2%|▏         | 156/6380 [01:15<29:22,  3.53it/s]

Error occurred for gene ID CDZ97970: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:   3%|▎         | 206/6380 [01:45<40:50,  2.52it/s]

Error occurred for gene ID CDZ96242: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:   5%|▌         | 329/6380 [02:45<1:33:14,  1.08it/s]

Error occurred for gene ID CED82797: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:   6%|▌         | 354/6380 [02:59<41:43,  2.41it/s]

Error occurred for gene ID CDZ98407: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:   6%|▌         | 359/6380 [03:05<1:28:25,  1.13it/s]

Error occurred for gene ID CED85140: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:   7%|▋         | 440/6380 [03:44<39:50,  2.48it/s]

Error occurred for gene ID CDZ98058: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:   8%|▊         | 491/6380 [04:06<32:26,  3.03it/s]

Error occurred for gene ID CDZ97882: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:   9%|▊         | 545/6380 [04:28<1:10:09,  1.39it/s]

Error occurred for gene ID CED82156: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:   9%|▉         | 576/6380 [04:46<59:23,  1.63it/s]  

Error occurred for gene ID CED84085: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  11%|█         | 686/6380 [05:40<53:10,  1.78it/s]

Error occurred for gene ID CED82247: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  12%|█▏        | 796/6380 [06:21<32:13,  2.89it/s]

Error occurred for gene ID CED82756: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  13%|█▎        | 807/6380 [06:30<1:23:53,  1.11it/s]

Error occurred for gene ID CED83179: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  13%|█▎        | 814/6380 [06:34<42:35,  2.18it/s]

Error occurred for gene ID CDZ98617: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  16%|█▌        | 1012/6380 [07:57<30:26,  2.94it/s]

Error occurred for gene ID CED85339: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  16%|█▋        | 1051/6380 [08:13<24:46,  3.59it/s]

Error occurred for gene ID CED83793: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  20%|██        | 1299/6380 [10:06<36:15,  2.34it/s]

Error occurred for gene ID CDZ97352: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  23%|██▎       | 1480/6380 [11:32<55:06,  1.48it/s]  

Error occurred for gene ID CDZ96270: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  24%|██▍       | 1544/6380 [12:07<32:39,  2.47it/s]

Error occurred for gene ID CED84388: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  29%|██▊       | 1828/6380 [14:15<29:24,  2.58it/s]

Error occurred for gene ID CED83219: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  31%|███       | 1982/6380 [15:16<19:37,  3.73it/s]

Error occurred for gene ID CED82922: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  31%|███▏      | 2000/6380 [15:30<38:16,  1.91it/s]

Error occurred for gene ID CED84482: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  33%|███▎      | 2086/6380 [16:07<21:07,  3.39it/s]

Error occurred for gene ID CDZ98196: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  35%|███▌      | 2243/6380 [17:21<47:32,  1.45it/s]

Error occurred for gene ID CDZ97746: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  37%|███▋      | 2385/6380 [18:30<45:05,  1.48it/s]

Error occurred for gene ID CDZ98660: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  38%|███▊      | 2446/6380 [18:56<52:49,  1.24it/s]

Error occurred for gene ID CDZ97144: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  42%|████▏     | 2695/6380 [20:43<26:46,  2.29it/s]

Error occurred for gene ID CDZ98271: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  45%|████▌     | 2885/6380 [22:13<17:07,  3.40it/s]

Error occurred for gene ID CED83640: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  46%|████▌     | 2924/6380 [22:32<21:46,  2.65it/s]

Error occurred for gene ID CDZ97595: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  46%|████▋     | 2964/6380 [22:49<21:07,  2.70it/s]

Error occurred for gene ID CED84039: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  47%|████▋     | 2969/6380 [22:53<27:38,  2.06it/s]

Error occurred for gene ID CED84033: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  48%|████▊     | 3061/6380 [23:33<26:42,  2.07it/s]

Error occurred for gene ID CDZ96770: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  49%|████▊     | 3095/6380 [23:49<19:33,  2.80it/s]

Error occurred for gene ID CED85285: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  49%|████▉     | 3114/6380 [23:57<23:53,  2.28it/s]

Error occurred for gene ID CDZ97294: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  52%|█████▏    | 3326/6380 [25:28<19:39,  2.59it/s]

Error occurred for gene ID CED83888: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  53%|█████▎    | 3373/6380 [25:48<14:36,  3.43it/s]

Error occurred for gene ID CDZ97481: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  53%|█████▎    | 3374/6380 [25:51<48:41,  1.03it/s]

Error occurred for gene ID CDZ96226: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  55%|█████▍    | 3490/6380 [26:39<15:06,  3.19it/s]

Error occurred for gene ID CED82109: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  56%|█████▌    | 3568/6380 [27:20<41:36,  1.13it/s]

Error occurred for gene ID CDZ96181: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  56%|█████▋    | 3591/6380 [27:38<26:11,  1.77it/s]

Error occurred for gene ID CED84536: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  57%|█████▋    | 3617/6380 [27:51<22:47,  2.02it/s]

Error occurred for gene ID CDZ97753: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  59%|█████▊    | 3747/6380 [28:47<15:13,  2.88it/s]

Error occurred for gene ID CDZ98207: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  60%|██████    | 3858/6380 [29:31<13:49,  3.04it/s]

Error occurred for gene ID CDZ96618: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  62%|██████▏   | 3932/6380 [30:02<14:38,  2.79it/s]

Error occurred for gene ID CDZ96958: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  62%|██████▏   | 3961/6380 [30:12<11:02,  3.65it/s]

Error occurred for gene ID CDZ97237: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  62%|██████▏   | 3962/6380 [30:14<36:23,  1.11it/s]

Error occurred for gene ID CDZ97268: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  63%|██████▎   | 4012/6380 [30:43<14:19,  2.76it/s]

Error occurred for gene ID CDZ97421: HTTP Error 400: Bad Request. Retrying in 1 seconds...
Error occurred for gene ID CDZ97421: HTTP Error 400: Bad Request. Retrying in 2 seconds...


Fetching Gene Information:  66%|██████▌   | 4217/6380 [32:12<10:19,  3.49it/s]

Error occurred for gene ID CED82174: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  67%|██████▋   | 4258/6380 [32:35<09:32,  3.71it/s]

Error occurred for gene ID CED85012: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  67%|██████▋   | 4278/6380 [32:43<10:38,  3.29it/s]

Error occurred for gene ID CED83641: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  67%|██████▋   | 4301/6380 [32:54<12:22,  2.80it/s]

Error occurred for gene ID CDZ97455: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  68%|██████▊   | 4334/6380 [33:08<10:35,  3.22it/s]

Error occurred for gene ID CDZ96935: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  68%|██████▊   | 4339/6380 [33:13<27:53,  1.22it/s]

Error occurred for gene ID CED85621: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  73%|███████▎  | 4688/6380 [35:51<16:11,  1.74it/s]

Error occurred for gene ID CED85544: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  74%|███████▍  | 4715/6380 [36:02<09:17,  2.99it/s]

Error occurred for gene ID CDZ98173: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  74%|███████▍  | 4742/6380 [36:16<23:21,  1.17it/s]

Error occurred for gene ID CDZ97287: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  75%|███████▌  | 4792/6380 [36:37<09:28,  2.79it/s]

Error occurred for gene ID CED82516: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  76%|███████▌  | 4834/6380 [36:56<11:52,  2.17it/s]

Error occurred for gene ID CDZ97542: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  76%|███████▋  | 4869/6380 [37:16<17:13,  1.46it/s]

Error occurred for gene ID CDZ96665: HTTP Error 400: Bad Request. Retrying in 1 seconds...
Error occurred for gene ID CDZ96665: HTTP Error 400: Bad Request. Retrying in 2 seconds...


Fetching Gene Information:  77%|███████▋  | 4885/6380 [37:29<12:54,  1.93it/s]

Error occurred for gene ID CED82032: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  77%|███████▋  | 4903/6380 [37:43<24:37,  1.00s/it]

Error occurred for gene ID CDZ96896: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  77%|███████▋  | 4904/6380 [37:46<34:42,  1.41s/it]

Error occurred for gene ID CED83913: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  77%|███████▋  | 4929/6380 [37:59<08:42,  2.78it/s]

Error occurred for gene ID CED85532: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  78%|███████▊  | 4999/6380 [38:37<09:53,  2.33it/s]

Error occurred for gene ID CDZ98633: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  80%|███████▉  | 5077/6380 [39:15<18:36,  1.17it/s]

Error occurred for gene ID CED82592: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  80%|████████  | 5106/6380 [39:33<15:30,  1.37it/s]

Error occurred for gene ID CED84321: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  81%|████████  | 5181/6380 [40:12<12:35,  1.59it/s]

Error occurred for gene ID CDZ97914: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  82%|████████▏ | 5202/6380 [40:24<07:24,  2.65it/s]

Error occurred for gene ID CED82548: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  83%|████████▎ | 5296/6380 [41:11<10:06,  1.79it/s]

Error occurred for gene ID CDZ98024: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  83%|████████▎ | 5319/6380 [41:21<07:21,  2.40it/s]

Error occurred for gene ID CED84819: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  85%|████████▍ | 5408/6380 [42:05<05:19,  3.04it/s]

Error occurred for gene ID CED84987: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  86%|████████▌ | 5489/6380 [42:47<06:56,  2.14it/s]

Error occurred for gene ID CDZ98744: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  87%|████████▋ | 5533/6380 [43:03<04:27,  3.17it/s]

Error occurred for gene ID CED82282: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  88%|████████▊ | 5597/6380 [43:31<04:21,  3.00it/s]

Error occurred for gene ID CDZ98728: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  91%|█████████ | 5802/6380 [45:11<05:56,  1.62it/s]

Error occurred for gene ID CDZ97943: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  91%|█████████ | 5803/6380 [45:14<10:57,  1.14s/it]

Error occurred for gene ID CDZ96204: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  91%|█████████ | 5805/6380 [45:16<11:08,  1.16s/it]

Error occurred for gene ID CED84525: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  92%|█████████▏| 5868/6380 [45:47<08:13,  1.04it/s]

Error occurred for gene ID CED84456: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  97%|█████████▋| 6197/6380 [48:10<00:54,  3.36it/s]

Error occurred for gene ID CDZ96423: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information:  99%|█████████▉| 6323/6380 [49:05<00:18,  3.09it/s]

Error occurred for gene ID CDZ98076: HTTP Error 400: Bad Request. Retrying in 1 seconds...


Fetching Gene Information: 100%|██████████| 6380/6380 [49:30<00:00,  2.15it/s]

     Gene ID        Status      Gene Name  \
0      97577     secondary  E230019G03Rik   
1      97252     secondary         C81327   
2      96289  discontinued       LOC96289   
3      84545          live         MRPL43   
4      82891  discontinued       LOC82891   
...      ...           ...            ...   
6310   84278          live       MFSD14CP   
6311   96658  discontinued       LOC96658   
6312   96793  discontinued       LOC96793   
6313   97854     secondary         C77215   
6314   98269     secondary       AI132431   

                                            Description  \
0                            RIKEN cDNA E230019G03 gene   
1                             expressed sequence C81327   
2     similar to enabled homolog (Drosophila) (M. mu...   
3                   mitochondrial ribosomal protein L43   
4     similar to RED PROTEIN (RER PROTEIN) (M. muscu...   
...                                                 ...   
6310  major facilitator superfamily domain con




In [None]:
# Save the fetched gene table to an Excel workbook. With the default
# arguments the DataFrame index is written as an extra first column.
df_gene_info.to_excel('df_gene_info.xlsx')

In [None]:
# Display the fetched gene table
df_gene_info

Unnamed: 0,Gene ID,Status,Gene Name,Description,Summary,Synonyms
0,97577,secondary,E230019G03Rik,RIKEN cDNA E230019G03 gene,,
1,97252,secondary,C81327,expressed sequence C81327,,
2,96289,discontinued,LOC96289,similar to enabled homolog (Drosophila) (M. mu...,DISCONTINUED: This record has been withdrawn b...,
3,84545,live,MRPL43,mitochondrial ribosomal protein L43,Mammalian mitochondrial ribosomal proteins are...,\n
4,82891,discontinued,LOC82891,similar to RED PROTEIN (RER PROTEIN) (M. muscu...,DISCONTINUED: This record has been withdrawn b...,
...,...,...,...,...,...,...
6310,84278,live,MFSD14CP,major facilitator superfamily domain containin...,Predicted to enable transmembrane transporter ...,\n
6311,96658,discontinued,LOC96658,similar to ZINC FINGER PROTEIN 23 (ZINC FINGER...,DISCONTINUED: This record has been withdrawn b...,
6312,96793,discontinued,LOC96793,similar to hypothetical protein MGC2615 (H. sa...,DISCONTINUED: This record has been withdrawn b...,
6313,97854,secondary,C77215,expressed sequence C77215,,


In [None]:
# Open the source workbook; the next cell iterates over its sheets.
excel_file_path = 'All genes.xlsx'
excel_sheets = pd.ExcelFile(excel_file_path)

In [None]:
# Build the lookup table once: 'Gene ID' comes out of the XML as text, so it
# is converted to a nullable integer to match the numeric key extracted from
# each sheet. Duplicate IDs are dropped so the left join cannot multiply rows.
gene_info_lookup = (
    df_gene_info
    .assign(**{
        'Gene ID': pd.to_numeric(df_gene_info['Gene ID'], errors='coerce').astype('Int64')
    })
    .dropna(subset=['Gene ID'])
    .drop_duplicates(subset='Gene ID')
)

# Initialize an ExcelWriter to write back to the Excel file
with pd.ExcelWriter('updated.xlsx', engine='openpyxl') as writer:
    for sheet_name in excel_sheets.sheet_names:
        # Read the current sheet into a DataFrame
        df_sheet = pd.read_excel(excel_sheets, sheet_name=sheet_name)

        # Sheets without a 'GeneID' column cannot be enriched; copy them as-is
        if 'GeneID' not in df_sheet.columns:
            df_sheet.to_excel(writer, index=False, sheet_name=sheet_name)
            continue

        # Extract numerical ID from 'GeneID' in the sheet for matching.
        # IDs without digits (or empty cells) become <NA> instead of raising.
        numerical_id = pd.to_numeric(
            df_sheet['GeneID'].astype(str).str.extract(r'(\d+)', expand=False),
            errors='coerce',
        ).astype('Int64')

        # Merge the current sheet with the fetched gene info on the numeric
        # ID, then drop the temporary join key.
        df_merged = (
            df_sheet
            .assign(Numerical_ID=numerical_id)
            .merge(gene_info_lookup, left_on='Numerical_ID', right_on='Gene ID', how='left')
            .drop(columns='Numerical_ID')
        )

        # Write the merged DataFrame to the ExcelWriter object
        df_merged.to_excel(writer, index=False, sheet_name=sheet_name)
