In [52]:
try:
    import google.colab
    # Running on Google Colab, so install Biopython first
    !pip install biopython
except ImportError:
    pass




In [53]:
from Bio import Entrez
from Bio import SeqIO
Entrez.email = "A.N.Other@example.com"
import pandas as pd
import xml.etree.ElementTree as ET

In [54]:
dis = pd.read_table('gene_disease_associations.tsv')
dis

Unnamed: 0,gene_pli,score,diseaseid,gene_dsi,protein_class_name,geneid,disease_class,disease_type,uniprotid,disease_class_name,disease_name,el,year_initial,source,ei,gene_dpi,protein_class,year_final,gene_symbol,disease_semantic_type
0,1.643200e-05,1.00,C0001080,0.391,Kinase,2261,C16;C05,disease,P22607,"Congenital, Hereditary, and Neonatal Diseas...",Achondroplasia,definitive,1994.0,CURATED,0.974,0.846,DTO_03300101,2020.0,FGFR3,Congenital Abnormality
1,1.040000e-05,0.80,C0000744,0.505,Transporter,4547,C16;C18,disease,P55157,"Congenital, Hereditary, and Neonatal Diseas...",Abetalipoproteinemia,,1985.0,CURATED,0.982,0.846,DTO_05007405,2019.0,MTTP,Disease or Syndrome
2,9.973100e-01,0.80,C0001193,0.380,Kinase,2263,C16;C05,disease,P21802,"Congenital, Hereditary, and Neonatal Diseas...",Apert syndrome,,1994.0,CURATED,0.976,0.808,DTO_03300101,2019.0,FGFR2,Congenital Abnormality
3,5.323500e-01,0.70,C0001418,0.236,Transcription factor,7157,C04,group,P04637,Neoplasms,Adenocarcinoma,,1986.0,CURATED,0.975,0.962,DTO_05007542,2020.0,TP53,Neoplastic Process
4,8.460200e-01,0.69,C0001126,0.522,Transporter,6521,C16;C18;C13;C12,phenotype,P02730,"Congenital, Hereditary, and Neonatal Diseas...",Renal tubular acidosis,,1996.0,CURATED,1.000,0.731,DTO_05007405,2013.0,SLC4A1,Disease or Syndrome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453,1.047800e-06,0.30,C0001430,0.578,,6948,C04,group,P20062,Neoplasms,Adenoma,,2007.0,CURATED,1.000,0.615,,2007.0,TCN2,Neoplastic Process
454,2.095700e-32,0.30,C0001430,0.419,Transporter,9429,C04,group,Q9UNQ0,Neoplasms,Adenoma,,2011.0,CURATED,1.000,0.885,DTO_05007405,2011.0,ABCG2,Neoplastic Process
455,1.000000e+00,0.30,C0001546,0.552,Epigenetic regulator,10014,F03,group,Q9UQL6,Mental Disorders,Adjustment Disorders,,2007.0,CURATED,1.000,0.769,DTO_05007378,2007.0,HDAC5,Mental or Behavioral Dysfunction
456,8.919400e-01,0.30,C0001618,0.636,Enzyme modulator,2771,C04;C19,group,P04899,Neoplasms; Endocrine System Diseases,Tumors of Adrenal Cortex,,,CURATED,,0.577,DTO_05007584,,GNAI2,Neoplastic Process


In [55]:
dis.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gene_pli               442 non-null    float64
 1   score                  458 non-null    float64
 2   diseaseid              458 non-null    object 
 3   gene_dsi               458 non-null    float64
 4   protein_class_name     276 non-null    object 
 5   geneid                 458 non-null    int64  
 6   disease_class          458 non-null    object 
 7   disease_type           458 non-null    object 
 8   uniprotid              452 non-null    object 
 9   disease_class_name     458 non-null    object 
 10  disease_name           458 non-null    object 
 11  el                     8 non-null      object 
 12  year_initial           451 non-null    float64
 13  source                 458 non-null    object 
 14  ei                     451 non-null    float64
 15  gene_d

In [56]:
unique_des=dis['disease_name'].unique()

In [57]:
def get_rquest_db(request='Tumors of Adrenal Cortex'):
    request_db = dis.loc[dis['disease_name'] == request]
    display(request_db)
    return request_db

In [58]:
def get_root(id):
  handle = Entrez.efetch(db="gene", id=str(id), rettype = 'xml')
  xml_text = handle.read()   # читаем полностью!
  handle.close()
  # 2. Теперь парсим из текста
  root=ET.fromstring(xml_text)
  return root

In [59]:
def get_chromosome(root, idx):
    # Находим NC номер гена
    locus_list = root.findall(".//Gene-commentary")

    accession = None
    for locus_elem in locus_list:
        accession = locus_elem.findtext("Gene-commentary_accession")
        if accession is not None and accession.startswith("NC_"):
            break

    return accession

In [60]:
# 3. Ищем описание гена
def get_gene_desc(root, idx):
   return root.findtext("Entrezgene/Entrezgene_gene/Gene-ref/Gene-ref_desc")

In [61]:
# Место на ДНК, где находится ген
def get_locus(root, idx):
  return root.findtext("Entrezgene/Entrezgene_gene/Gene-ref/Gene-ref_maploc")


In [62]:
#находим описание гена
def get_chromosome_desc(idx, id):
    handle_nc = Entrez.efetch(db="nucleotide",
                       id=str(id),
                       rettype = 'fasta',
                       retmode = 'text')
    output=next(SeqIO.parse(handle_nc, 'fasta')).description
    handle_nc.close()
    return output


In [85]:
def get_id_start_end_strand(idx, idg):
    handle = Entrez.efetch(db="gene", id=str(idg), rettype = 'xml')
    gene_locus = ET.parse(handle).getroot().find("Entrezgene/Entrezgene_locus")
    region = gene_locus.find("Gene-commentary/Gene-commentary_seqs/Seq-loc/Seq-loc_int/Seq-interval")
    seq_start = int(region.find("Seq-interval_from").text) + 1
    seq_end = int(region.find("Seq-interval_to").text) + 1
    seq_id = region.find("Seq-interval_id/Seq-id/Seq-id_gi").text
    strand = region.find("Seq-interval_strand/Na-strand").get("value")
    handle.close()

    return seq_start, seq_end, seq_id , strand




In [86]:
def get_seq(idx, seq_start, seq_end, id):
        handle_nc = Entrez.efetch(db="nucleotide",
                              id=str(id),
                              rettype = 'fasta',
                              retmode = 'text')

        chr=SeqIO.read(handle_nc, 'fasta')
        start=int(seq_start)
        end=int(seq_end)
        gene_seq=chr.seq[start:end]
        handle_nc.close()
        return str(gene_seq)

In [87]:
def get_synonyms(root, idx):
    synonyms=[]
    for elem in root.findall("Entrezgene/Entrezgene_gene/Gene-ref/Gene-ref_syn/Gene-ref_syn_E"):
      synonyms.append(elem.text)
    return ', '.join(synonyms)

In [89]:
def get_answer_db(request='Tumors of Adrenal Cortex'):
      print('ТАБЛИЦА ГЕНЕТИЧЕСКИХ ЗАБОЛЕВАНИЙ')
      request_db=get_rquest_db(request=request)
      final_db = pd.DataFrame(columns=['geneid', 'gene_symbol', 'gene_desc', 'locus', 'chromosome', 'chromosome_desc', 'seq_id', 'seq_start', 'seq_end', 'strand', 'seq', 'gene_syn'])
      final_db['geneid']=request_db['geneid']
      final_db['gene_symbol']=request_db['gene_symbol']
      print('\n\n\n\n')

      for idx, row in final_db.iterrows():
            root=get_root(row['geneid'])
            final_db.loc[idx,'locus']=get_locus(root, idx)
            final_db.loc[idx, 'chromosome'] = get_chromosome(root, idx)
            final_db.loc[idx,'gene_desc']=get_gene_desc(root, idx)
            final_db.loc[idx,'chromosome_desc'] = get_chromosome_desc(idx, final_db.loc[idx,'chromosome'])
            final_db.loc[idx,'seq_start'], final_db.loc[idx,'seq_end'], final_db.loc[idx,'seq_id'], final_db.loc[idx,'strand']=get_id_start_end_strand(idx, row['geneid'])
            final_db.loc[idx,'seq']=get_seq(idx, final_db.loc[idx,'seq_start'], final_db.loc[idx,'seq_end'], final_db.loc[idx,'chromosome'])
            final_db.loc[idx,'gene_syn']=get_synonyms(root, idx)

      print('ТАБЛИЦА ГЕНОВ')
      return final_db

In [90]:
unique_des

array(['Achondroplasia', 'Abetalipoproteinemia', 'Apert syndrome',
       'Adenocarcinoma', 'Renal tubular acidosis',
       'Tumors of Adrenal Cortex', 'Addison Disease', 'Acidosis, Lactic',
       'Congenital Abnormality', 'Multiple congenital anomalies',
       'Spontaneous abortion', 'Acromegaly', 'ACTH Syndrome, Ectopic',
       'Adenoma', 'Acquired Immunodeficiency Syndrome', 'Acrodermatitis',
       'Abortion, Habitual', 'Abruptio Placentae', 'Acne Vulgaris',
       'Adrenal Cortex Diseases', 'Abdominal Pain',
       'Abnormalities, Drug-Induced', 'Threatened abortion',
       'Abortion, Tubal', 'Acidosis',
       'Herpetic Acute Necrotizing Encephalitis', 'Acute-Phase Reaction',
       'Massive Hepatic Necrosis', 'Adjustment Disorders'], dtype=object)

# ВПИШИТЕ СВОЙ ЗАПРОС ИЗ ЗНАЧЕНИЙ ВЫШЕ

In [91]:

genes_table=get_answer_db(request='Tumors of Adrenal Cortex')
display(genes_table)

ТАБЛИЦА ГЕНЕТИЧЕСКИХ ЗАБОЛЕВАНИЙ


Unnamed: 0,gene_pli,score,diseaseid,gene_dsi,protein_class_name,geneid,disease_class,disease_type,uniprotid,disease_class_name,disease_name,el,year_initial,source,ei,gene_dpi,protein_class,year_final,gene_symbol,disease_semantic_type
7,0.53235,0.5,C0001618,0.236,Transcription factor,7157,C04;C19,group,P04637,Neoplasms; Endocrine System Diseases,Tumors of Adrenal Cortex,,2005.0,CURATED,1.0,0.962,DTO_05007542,2018.0,TP53,Neoplastic Process
456,0.89194,0.3,C0001618,0.636,Enzyme modulator,2771,C04;C19,group,P04899,Neoplasms; Endocrine System Diseases,Tumors of Adrenal Cortex,,,CURATED,,0.577,DTO_05007584,,GNAI2,Neoplastic Process
457,0.99985,0.3,C0001618,0.401,,5573,C04;C19,group,P10644,Neoplasms; Endocrine System Diseases,Tumors of Adrenal Cortex,,,CURATED,,0.846,,,PRKAR1A,Neoplastic Process







ТАБЛИЦА ГЕНОВ


Unnamed: 0,geneid,gene_symbol,gene_desc,locus,chromosome,chromosome_desc,seq_id,seq_start,seq_end,strand,seq,gene_syn
7,7157,TP53,tumor protein p53,17p13.1,NC_000017,"NC_000017.11 Homo sapiens chromosome 17, GRCh3...",568815581,7668421,7687490,minus,GGCAGCAAAGTTTTATTGTAAAATAAGAGATCGATATAAAAATGGG...,"P53, BCC7, LFS1, BMFS5, TRP53"
456,2771,GNAI2,G protein subunit alpha i2,3p21.31,NC_000003,"NC_000003.12 Homo sapiens chromosome 3, GRCh38...",568815595,50227068,50263358,plus,CATCACCGTCTAATCTCTGCTGTGAAGTGGAAGCGCGAGAAGGAGG...,"GIP, HG1C, GNAI2B, H_LUCA15.1, H_LUCA16.1"
457,5573,PRKAR1A,protein kinase cAMP-dependent type I regulator...,17q24.2,NC_000017,"NC_000017.11 Homo sapiens chromosome 17, GRCh3...",568815581,68413623,68551316,plus,AGCTGTGGTGGGCTCCACCCAGTTCGAGCTTCCCGGCTGCTTTGGT...,"CAR, CNC, CNC1, PKR1, TSE1, ADOHR, PPNAD1, PRK..."
