# Biopyhton Tests

## Example from Biopython documentation

In [None]:
from Bio import Entrez

Entrez.email = "fabien.allemand@etu.unistra.fr"
handle = Entrez.egquery(term="Opuntia AND rpl16")
record = Entrez.read(handle)
for row in record["eGQueryResult"]:
    if row["DbName"] == "nuccore":
        print(row["Count"])

handle = Entrez.esearch(db="nuccore", term="Opuntia AND rpl16")
record = Entrez.read(handle)
gi_list = record["IdList"]
print(gi_list)

handle = Entrez.efetch(db="nuccore", id=gi_list, rettype="gb", retmode="text")

text = handle.read()
print(text)

print("END")

## Example from: https://www.youtube.com/watch?v=XiWcXUS15fI&ab_channel=BioinformaticsCoach

In [None]:
from Bio import Entrez
from Bio import SeqIO

Entrez.email="fabien.allemand@etu.unistra.fr"

### Download and explore files

In [4]:
# Identifier: GI number
# Data base: nuccore
handle = Entrez.efetch(db="nuccore", id="34577062")

print(handle.read())

Seq-entry ::= set {
  level 1,
  class nuc-prot,
  descr {
    source {
      genome genomic,
      org {
        taxname "Homo sapiens",
        common "human",
        db {
          {
            db "taxon",
            tag id 9606
          }
        },
        orgname {
          name binomial {
            genus "Homo",
            species "sapiens"
          },
          lineage "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata;
 Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
 Catarrhini; Hominidae; Homo",
          gcode 1,
          mgcode 2,
          div "PRI"
        }
      },
      subtype {
        {
          subtype chromosome,
          name "1"
        },
        {
          subtype map,
          name "1cen-q12"
        }
      }
    },
    pub {
      pub {
        pmid 19115993,
        article {
          title {
            name "Association analyses of the interaction between the ADSS and
 ATM genes with schizophrenia in a Chinese 

In [5]:
# Identifier: GI number
# Data base: nuccore
handle = Entrez.efetch(db="nuccore", id="34577062", rettype="fasta")
print(handle.read())

>NM_001126.2 Homo sapiens adenylosuccinate synthase (ADSS), mRNA
GGAAGGGGCGTGGCCTCGGTCCGGGGTGGCGGCCGTTGCCGCCACCAGGGCCTCTTCCTGCGGGCGGTGC
TGCCGAGGCCGGCCTGCGCGGGGCAGTCATGGTACCCCCTTGAGCGGGCTGTGGCGGAGAGCGGGGCGGG
GACTGGCTGGAGGGTGGCGGCCCGGCGGGGCGGGGGCGGGGCCGGCCTCTGGCTCCTTCTTCCTCTGCAT
GTGGCTGGCGGCCGCAGAGCAGTTCAGTTCGCTCACTCCTCGCCGGCCGCCTCTCCTTCGGGCTCTCCTC
GCGTCACTGGAGCCATGGCGTTCGCCGAGACCTACCCGGCGGCATCCTCCCTGCCCAACGGCGATTGCGG
CCGCCCCAGGGCGCGGCCCGGAGGAAACCGGGTGACGGTGGTGCTCGGTGCGCAGTGGGGCGACGAAGGC
AAAGGGAAGGTGGTGGACCTGCTGGCGCAGGACGCCGACATCGTGTGCCGCTGCCAGGGAGGAAATAATG
CTGGCCATACAGTTGTTGTGGATTCTGTGGAATATGATTTTCATCTCTTACCCAGTGGAATAATTAATCC
AAATGTCACTGCATTCATTGGAAATGGTGTGGTAATTCATCTACCTGGATTGTTTGAAGAAGCAGAGAAA
AATGTTCAAAAAGGAAAAGGACTAGAAGGCTGGGAAAAAAGGCTTATTATATCTGACAGAGCTCATATTG
TATTTGATTTTCATCAAGCAGCTGATGGTATCCAGGAACAACAGAGACAAGAACAAGCAGGAAAAAATTT
GGGTACAACAAAAAAGGGCATTGGCCCAGTTTATTCGTCCAAAGCTGCTCGGAGTGGACTCAGGATGTGC
GACCTTGTTTCTGACTTTGATGGCTTCTCTGAGAGGTTTAAAGTTCTAGCTAACCAATACAAATCTATAT
ACCCCACTTTGG

In [6]:
# Data base: nucleotide
# Identifier: accession member
# Output format: genbank
# Display format: text
handle = Entrez.efetch(db="nucleotide", id="NM_001126.2", rettype="gb", retmode="text")
print(handle.read())

LOCUS       NM_001126               2775 bp    mRNA    linear   PRI 10-OCT-2010
DEFINITION  Homo sapiens adenylosuccinate synthase (ADSS), mRNA.
ACCESSION   NM_001126
VERSION     NM_001126.2
KEYWORDS    RefSeq.
SOURCE      Homo sapiens (human)
  ORGANISM  Homo sapiens
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
            Catarrhini; Hominidae; Homo.
REFERENCE   1  (bases 1 to 2775)
  AUTHORS   Sivendran,S. and Colman,R.F.
  TITLE     Effect of a new non-cleavable substrate analog on wild-type and
            serine mutants in the signature sequence of adenylosuccinate lyase
            of Bacillus subtilis and Homo sapiens
  JOURNAL   Protein Sci. 17 (7), 1162-1174 (2008)
   PUBMED   18469177
  REMARK    GeneRIF: Kinetic data reveal that human Ser(289) and B. subtilis
            Ser(262) and Ser(263) are essential for catalysis, while the
            ability of these Ser mutant

### Download and explore sequences

In [11]:
# Data base: nucleotide
# Identifier: accession number
# Output format: fasta
# Display mode: text
handle = Entrez.efetch(db="nucleotide", id="NM_001126.2", rettype="fasta", retmode="text")
record = SeqIO.read(handle, "fasta")

print("record id:", record.id)
print("record name:", record.name)
print("record description:", record.description)

seq = record.seq
print("sequence length:", len(seq))
print("sequence head:", seq[1:20])


record id: NM_001126.2
record name: NM_001126.2
record description: NM_001126.2 Homo sapiens adenylosuccinate synthase (ADSS), mRNA
sequence length: 2775
sequence head: GAAGGGGCGTGGCCTCGGT


In [16]:
# Identifier: accession number
# Output format: genbank
# Display mode: text
# Data base: nucleotide
handle = Entrez.efetch(db="nucleotide", id="NM_001126.2", rettype="gb", retmode="text")
record = SeqIO.read(handle, "gb")

print("record id:", record.id)
print("record name:", record.name)
print("record description:", record.description)

features = record.features
print("length sequence features:", len(features))
print("sequence features:", features)

seq = record.seq
print("sequence length:", len(seq))
print("sequence head:", seq[1:20])

# print(seq.startswith("GAA")) # To explore


record id: NM_001126.2
record name: NM_001126
record description: Homo sapiens adenylosuccinate synthase (ADSS), mRNA
length sequence features: 5
sequence features: [SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(2775), strand=1), type='source', qualifiers=...), SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(2775), strand=1), type='gene', qualifiers=...), SeqFeature(SimpleLocation(ExactPosition(294), ExactPosition(1665), strand=1), type='CDS', qualifiers=...), SeqFeature(SimpleLocation(ExactPosition(2731), ExactPosition(2737), strand=1), type='regulatory', qualifiers=...), SeqFeature(SimpleLocation(ExactPosition(2756), ExactPosition(2757), strand=1), type='polyA_site', qualifiers=...)]
sequence length: 2775
sequence head: GAAGGGGCGTGGCCTCGGT
False


### Download multiple genomes

In [19]:
# Multiple identifiers
handle = Entrez.efetch(db="nuccore", id="34577062,186972394", rettype="fasta")
print(handle.read())

>NM_001126.2 Homo sapiens adenylosuccinate synthase (ADSS), mRNA
GGAAGGGGCGTGGCCTCGGTCCGGGGTGGCGGCCGTTGCCGCCACCAGGGCCTCTTCCTGCGGGCGGTGC
TGCCGAGGCCGGCCTGCGCGGGGCAGTCATGGTACCCCCTTGAGCGGGCTGTGGCGGAGAGCGGGGCGGG
GACTGGCTGGAGGGTGGCGGCCCGGCGGGGCGGGGGCGGGGCCGGCCTCTGGCTCCTTCTTCCTCTGCAT
GTGGCTGGCGGCCGCAGAGCAGTTCAGTTCGCTCACTCCTCGCCGGCCGCCTCTCCTTCGGGCTCTCCTC
GCGTCACTGGAGCCATGGCGTTCGCCGAGACCTACCCGGCGGCATCCTCCCTGCCCAACGGCGATTGCGG
CCGCCCCAGGGCGCGGCCCGGAGGAAACCGGGTGACGGTGGTGCTCGGTGCGCAGTGGGGCGACGAAGGC
AAAGGGAAGGTGGTGGACCTGCTGGCGCAGGACGCCGACATCGTGTGCCGCTGCCAGGGAGGAAATAATG
CTGGCCATACAGTTGTTGTGGATTCTGTGGAATATGATTTTCATCTCTTACCCAGTGGAATAATTAATCC
AAATGTCACTGCATTCATTGGAAATGGTGTGGTAATTCATCTACCTGGATTGTTTGAAGAAGCAGAGAAA
AATGTTCAAAAAGGAAAAGGACTAGAAGGCTGGGAAAAAAGGCTTATTATATCTGACAGAGCTCATATTG
TATTTGATTTTCATCAAGCAGCTGATGGTATCCAGGAACAACAGAGACAAGAACAAGCAGGAAAAAATTT
GGGTACAACAAAAAAGGGCATTGGCCCAGTTTATTCGTCCAAAGCTGCTCGGAGTGGACTCAGGATGTGC
GACCTTGTTTCTGACTTTGATGGCTTCTCTGAGAGGTTTAAAGTTCTAGCTAACCAATACAAATCTATAT
ACCCCACTTTGG

In [23]:
handle = Entrez.efetch(db="nuccore", id="34577062,186972394", rettype="fasta")

records = SeqIO.parse(handle, "fasta")
records = [i for i in records]
print("number of genomes:", len(records))

first_record = records[0]
print("first record id:", first_record.id)

number of genomes: 2
first record id: NM_001126.2


### Save genomes

In [24]:
# Output: genbank
handle = Entrez.efetch(db="nucleotide", id="34577062", rettype="gb")
record = SeqIO.read(handle, "gb")
output_name = "test.gb"
SeqIO.write(record, output_name, "gb") # return the number of sequences successfully written

1

In [25]:
# Output: fasta
# Output: genbank
handle = Entrez.efetch(db="nucleotide", id="34577062", rettype="fasta")
record = SeqIO.read(handle, "fasta")
output_name = "test.fasta"
SeqIO.write(record, output_name, "fasta") # return the number of sequences successfully written

1

In [26]:
# Output: multi fasta
# Output: fasta
# Output: genbank
handle = Entrez.efetch(db="nucleotide", id="34577062,186972394", rettype="fasta")
record = SeqIO.parse(handle, "fasta")
output_name = "multi_test.fasta"
SeqIO.write(record, output_name, "fasta") # return the number of sequences successfully written

2

## Example from: https://www.youtube.com/watch?v=pmYyEsRNt98&ab_channel=BioinformaticsCoach


In [None]:
from Bio import SeqIO

file_path = ""
genbank_object = SeqIO.read(file_path, "bg")

record_id = genbank_object.id
print(record_id)

record_name = genbank_object.name
print(record_name)

record_seq = genbank_object.seq
sequence_len = len(record_seq)
print(sequence_len)

description = genbank_object.description
print(description)

annotations = genbank_object.annotations
print(annotations)

features = genbank_object.features
feature_types = [feature.type for feature in features]
feature_types = set(feature_types)
feature_types = list(feature_types) # Remove duplicates
print(feature_types)

selected_feature_types = feature_types[1:2]
print(selected_feature_types)
for feature in selected_feature_types:
    allfeatures = [i for i in features if i.type==feature]
    number_of_features = len(allfeatures)
    print("%s:%d"%(feature, number_of_features))