# Analysing DNA Sequences and Finding Protein-Coding Regions

Gene: Human Insulin

## Downloading Sequence Data from Entrez Database

In [None]:
# Library
from Bio import Entrez, SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import gc_fraction

In [None]:
# Always provide your email when using NCBI services.
# Replace this placeholder with your own address before running the notebook;
# it must be a plain, well-formed address (no surrounding angle brackets).
Entrez.email = "your_email@example.com"

In [None]:
# Fetch the human insulin gene sequence (accession NM_000207) in FASTA format
handle = Entrez.efetch(db="nucleotide", id="NM_000207", rettype="fasta", retmode="text")
try:
    # SeqIO.read expects exactly one record in the handle; if parsing fails
    # the exception propagates, but the handle is still released below.
    record: SeqRecord = SeqIO.read(handle, "fasta")
finally:
    # Always release the network handle, even when parsing raised.
    handle.close()

# Quick summary of what was downloaded.
print(f"Sequence ID: {record.id}")
print(f"Description: {record.description}")
print(f"Length: {len(record.seq)} bp")

Sequence ID: NM_000207.3
Description: NM_000207.3 Homo sapiens insulin (INS), transcript variant 1, mRNA
Length: 465 bp


## Basic Sequence Analysis

In [4]:
# Bare last expression: the notebook renders the Seq repr, which abbreviates
# long sequences with "...".
record.seq

Seq('AGCCCTCCAGGACAGGCTGCATCAGAAGAGGCCATCAAGCAGATCACTGTCCTT...AGC')

In [None]:
# print() emits the full nucleotide string rather than the abbreviated repr.
print(record.seq)

AGCCCTCCAGGACAGGCTGCATCAGAAGAGGCCATCAAGCAGATCACTGTCCTTCTGCCATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTGACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCTCTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCAGAGGACCTGCAGGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGCAGCCCTTGGCCCTGGAGGGGTCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTACCAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAGACGCAGCCCGCAGGCAGCCCCACACCCGCCGCCTCCTGCACCGAGAGAGATGGAATAAAGCCCTTGAACCAGC


### Complement

In [6]:
# Position-by-position complement (A<->T, C<->G); base order is not reversed.
record.seq.complement()

Seq('TCGGGAGGTCCTGTCCGACGTAGTCTTCTCCGGTAGTTCGTCTAGTGACAGGAA...TCG')

### Reverse Complement

In [7]:
# Complement of the sequence, returned in reverse order.
record.seq.reverse_complement()

Seq('GCTGGTTCAAGGGCTTTATTCCATCTCTCTCGGTGCAGGAGGCGGCGGGTGTGG...GCT')

### Calculate GC Content

In [None]:
# gc_fraction is imported in the notebook's top import cell.
# Per the Biopython documentation it returns the GC share as a fraction
# between 0 and 1, not as a percentage.
gc_content = gc_fraction(record.seq)

# Guard against a percent-vs-fraction mix-up before formatting: a value above
# 1 here would be rendered as thousands of percent by the format spec below.
assert 0.0 <= gc_content <= 1.0, "gc_content must be a fraction, not a percentage"

# The `%` format type multiplies by 100 and appends the percent sign.
print(f"GC Content: {gc_content:.2%}")

GC Content: 63.87%
