# Basic Concepts of Python for Bioinformatics:

Variables and Data Types

In [76]:
name = "Bioinformatics"
count = 42
percentage = 75.5
gene_list = ["GeneA", "GeneB", "GeneC"]

Control Flow

In [77]:
for gene in gene_list:
 if len(gene) > 5:
  print(gene)
else:
 print("Short gene:", gene)

Short gene: GeneC


Functions

In [78]:
def calculate_gc_content(sequence):
    gc_count = sequence.count('G') + sequence.count('C')
    total_bases = len(sequence)
    gc_content = (gc_count / total_bases) * 100
    return gc_content

File Handling

In [79]:
#with open('sequences.txt', 'r') as file:
    #sequences = file.readlines()

Bioinformatics Libraries

In [80]:
from Bio import SeqIO

#sequence_record = SeqIO.read("sequence.fasta", "fasta")

# Importance of Clean Code and Documentation in Bioinformatics Projects:

Readability

In [81]:
# Bad
#x = dna.count('G') + dna.count('C')

# Good
#gc_count = dna.count('G') + dna.count('C')

Modularity

In [82]:
# Bad
#def analyze_sequence(data):
        # ...

# Good
#def calculate_gc_content(sequence):
    # ...

#def analyze_sequence(data):
    #gc_content = calculate_gc_content(data)
    # ...

Documentation

In [83]:
def calculate_gc_content(sequence):
    """
Calculate the GC content of a DNA sequence.

Parameters:
- sequence (str): DNA sequence.

Returns:
- float: GC content percentage.
"""
# ...

# Basic Input and Output in Python for Bioinformatics:

Input Methods for Biological Data

a. Reading from Files

In [84]:
from Bio import SeqIO

# Reading a FASTA file
fasta_file = "sequence.fasta"
#sequence_record = SeqIO.read(fasta_file, "fasta")

b. User Input

In [85]:
#user_sequence = input("Enter a DNA sequence: ")

Formatted Output for Interpretation

a. Print Statements

In [86]:
gc_content = 45.2
print("GC Content:", gc_content, "%")

GC Content: 45.2 %


b. Formatted Strings

In [87]:
gene_name = "GeneA"
expression_level = 2.5

output_message = f"The expression level of {gene_name} is {expression_level} units."
print(output_message)

The expression level of GeneA is 2.5 units.


c. Tabular Data

In [88]:
genes = ["GeneA", "GeneB", "GeneC"]
expression_levels = [2.5, 3.1, 1.8]

print("Gene\tExpression Level")
for gene, level in zip(genes, expression_levels):
    print(f"{gene}\t{level}")

Gene	Expression Level
GeneA	2.5
GeneB	3.1
GeneC	1.8


In [89]:
output_file = "results.txt"
with open(output_file, 'w') as file:
    file.write(f"Gene\tExpression Level\n")
    for gene, level in zip(genes, expression_levels):
        file.write(f"{gene}\t{level}\n")

In [90]:
organism = "Homo sapiens"
num_genes = 1500

summary_message = f"The genome of {organism} contains {num_genes} genes."
print(summary_message)



The genome of Homo sapiens contains 1500 genes.


# Mathematical Operations in Bioinformatics:

GC Content Calculation

In [91]:
def calculate_gc_content(sequence):
    gc_count = sequence.count('G') + sequence.count('C')
    total_bases = len(sequence)
    gc_content = (gc_count / total_bases) * 100
    return gc_content

# Example
dna_sequence = "ATGCGATCGATCGTACG"
gc_percentage = calculate_gc_content(dna_sequence)
print(f"GC Content: {gc_percentage:.2f}%")

GC Content: 52.94%


Transcription Factor Binding Site Analysis

In [92]:
def find_motif_occurrences(sequence, motif):
    occurrences = []
    i = sequence.find(motif)
    while i != -1:
        occurrences.append(i)
        i = sequence.find(motif, i + 1)
    return occurrences

# Example
gene_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
motif = "CTAG"
motif_occurrences = find_motif_occurrences(gene_sequence, motif)
print(f"Motif '{motif}' found at positions: {motif_occurrences}")

Motif 'CTAG' found at positions: [9, 13, 17]


In [93]:
%pip install --upgrade numpy scipy

Defaulting to user installation because normal site-packages is not writeable
    pyparsing>=2.4*
             ~~~~~^[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


Statistical Analysis

In [94]:
import numpy as np
import scipy
from scipy.stats import ttest_ind

# Example: Gene expression levels in two conditions
condition_1 = np.array([2.5, 3.1, 2.8, 3.0, 2.7])
condition_2 = np.array([1.8, 2.2, 2.0, 2.5, 2.1])

# Mean and standard deviation
mean_condition_1 = np.mean(condition_1)
mean_condition_2 = np.mean(condition_2)

std_dev_condition_1 = np.std(condition_1)
std_dev_condition_2 = np.std(condition_2)

# Independent t-test
t_stat, p_value = ttest_ind(condition_1, condition_2)

print(f"Mean Condition 1: {mean_condition_1:.2f}")
print(f"Mean Condition 2: {mean_condition_2:.2f}")
print(f"Standard Deviation Condition 1: {std_dev_condition_1:.2f}")
print(f"Standard Deviation Condition 2: {std_dev_condition_2:.2f}")
print(f"T-statistic: {t_stat:.2f}")
print(f"P-value: {p_value:.4f}")

Mean Condition 1: 2.82
Mean Condition 2: 2.12
Standard Deviation Condition 1: 0.21
Standard Deviation Condition 2: 0.23
T-statistic: 4.45
P-value: 0.0022


Protein Folding

In [95]:
import math

def calculate_protein_folding_energy(temperature, entropy_change):
    gas_constant = 8.314 # J/(mol*K)
    delta_g = -temperature * entropy_change
    folding_energy = math.exp(-delta_g / (gas_constant * temperature))
    return folding_energy

# Example
temperature = 298 # in Kelvin
entropy_change = -50 # in J/(mol*K)
folding_energy = calculate_protein_folding_energy(temperature, entropy_change)
print(f"Protein Folding Energy: {folding_energy:.4f}")

Protein Folding Energy: 0.0024


# String Manipulation Techniques in Bioinformatics:

Sequence Alignment

In [96]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

sequence_1 = "ATGCGTACGCTAGCTAGCTAGCG"
sequence_2 = "ATGCGATCGATCGTACG"

alignments = pairwise2.align.globalxx(sequence_1, sequence_2, one_alignment_only=True)
best_alignment = alignments[0]

print("Sequence Alignment:")
print(format_alignment(*best_alignment))

Sequence Alignment:
ATGCGTA-CGCTAGCTAGC-TAGCG
||||| | ||  |  |  | || ||
ATGCG-ATCG--A--T--CGTA-CG
  Score=15





Reverse Complement

In [97]:
def reverse_complement(sequence):
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(complement[base] for base in reversed(sequence))

# Example
dna_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
rev_comp_sequence = reverse_complement(dna_sequence)
print(f"Original Sequence: {dna_sequence}")
print(f"Reverse Complement: {rev_comp_sequence}")

Original Sequence: ATGCGTACGCTAGCTAGCTAGCG
Reverse Complement: CGCTAGCTAGCTAGCGTACGCAT


Motif Extraction

In [98]:
def extract_motifs(sequence, motif_length):
    motifs = [sequence[i:i+motif_length] for i in range(len(sequence)-motif_length+1)]
    return motifs

# Example
gene_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
motif_length = 3
gene_motifs = extract_motifs(gene_sequence, motif_length)
print(f"Gene Motifs: {gene_motifs}")

Gene Motifs: ['ATG', 'TGC', 'GCG', 'CGT', 'GTA', 'TAC', 'ACG', 'CGC', 'GCT', 'CTA', 'TAG', 'AGC', 'GCT', 'CTA', 'TAG', 'AGC', 'GCT', 'CTA', 'TAG', 'AGC', 'GCG']


Counting Nucleotides

In [99]:
def count_nucleotides(sequence):
    nucleotide_counts = {base: sequence.count(base) for base in 'ATCG'}
    return nucleotide_counts

# Example
dna_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
nucleotide_counts = count_nucleotides(dna_sequence)
print("Nucleotide Counts:")
for base, count in nucleotide_counts.items():
    print(f"{base}: {count}")

Nucleotide Counts:
A: 5
T: 5
C: 6
G: 7


Searching for Motifs

In [100]:
def find_motif_positions(sequence, motif):
    positions = [i for i in range(len(sequence)-len(motif)+1) if sequence[i:i+len(motif)] == motif]
    return positions

# Example
gene_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
motif = "CTAG"
motif_positions = find_motif_positions(gene_sequence, motif)
print(f"Motif '{motif}' found at positions: {motif_positions}")

Motif 'CTAG' found at positions: [9, 13, 17]


Concatenation and Slicing

In [101]:
header = ">geneA|description"
gene_name = header.split('|')[0][1:]
description = header.split('|')[1]

print(f"Gene Name: {gene_name}")
print(f"Description: {description}")

Gene Name: geneA
Description: description


# Iterable Objects in Python (Dictionaries, Lists, Tuples, Sets)

Representation of Genetic Information

In [102]:
gene_sequences = {
"GeneA": "ATGCGTACGCTAGCTAGCTAGCG",
"GeneB": "ATCGATCGATCGTACGTAGCTAGC",
"GeneC": "ATCGATCGATCGATCGATCG"
}

# Accessing sequence for GeneA
gene_a_sequence = gene_sequences["GeneA"]
print(f"Sequence for GeneA: {gene_a_sequence}")

Sequence for GeneA: ATGCGTACGCTAGCTAGCTAGCG


Gene Annotations

In [103]:
gene_annotations = {
"GeneA": {"length": 21, "chromosome": "X", "start_position": 1000},
"GeneB": {"length": 24, "chromosome": "Y", "start_position": 1500},
"GeneC": {"length": 18, "chromosome": "2", "start_position": 2000}
}

# Accessing annotation for GeneB
gene_b_annotation = gene_annotations["GeneB"]
print("Annotation for GeneB:")
for key, value in gene_b_annotation.items():
    print(f"{key}: {value}")

Annotation for GeneB:
length: 24
chromosome: Y
start_position: 1500


Codon Usage Table

In [104]:
codon_usage = {
"ATG": 0.85,
"TAA": 0.05,
"TAG": 0.03,
"TGA": 0.02,
"GCT": 0.10,
"GCC": 0.15,
"GCA": 0.30,
"GCG": 0.45
}

# Accessing frequency for GCT codon
gct_frequency = codon_usage.get("GCT", 0)
print(f"Frequency of GCT codon: {gct_frequency}")

Frequency of GCT codon: 0.1


Protein Structures

In [105]:
protein_structures = {
"ProteinA": {"length": 300, "secondary_structure": "alpha-helix", "domains": ["A", "B", "C"]},
"ProteinB": {"length": 200, "secondary_structure": "beta-sheet", "domains": ["X", "Y"]},
"ProteinC": {"length": 400, "secondary_structure": "random-coil", "domains": ["M", "N", "O"]}
}

# Accessing information for ProteinC
protein_c_info = protein_structures.get("ProteinC", {})
print("Information for ProteinC:")
for key, value in protein_c_info.items():
    print(f"{key}: {value}")

Information for ProteinC:
length: 400
secondary_structure: random-coil
domains: ['M', 'N', 'O']


Pathways and Interactions

In [106]:
pathway_interactions = {
"PathwayA": {"genes": ["GeneA", "GeneB", "GeneC"], "reactions": ["Reaction1", "Reaction2"]},
"PathwayB": {"genes": ["GeneD", "GeneE"], "reactions": ["Reaction3", "Reaction4"]},
"PathwayC": {"genes": ["GeneF"], "reactions": ["Reaction5"]}
}

# Accessing genes for PathwayA
pathway_a_genes = pathway_interactions.get("PathwayA", {}).get("genes", [])
print(f"Genes in PathwayA: {pathway_a_genes}")

Genes in PathwayA: ['GeneA', 'GeneB', 'GeneC']


# Usage of Lists in Bioinformatics:

List of Genes

In [107]:
gene_list = ["GeneA", "GeneB", "GeneC", "GeneD", "GeneE"]

Length Distribution

In [108]:
gene_lengths = [1200, 800, 1500, 1000, 900]

# Calculate average length
average_length = sum(gene_lengths) / len(gene_lengths)

# Filter genes longer than the average length
long_genes = [gene for gene, length in zip(gene_list, gene_lengths) if length > average_length]

print(f"Average Gene Length: {average_length:.2f} bp")
print(f"Genes Longer than Average: {long_genes}")

Average Gene Length: 1080.00 bp
Genes Longer than Average: ['GeneA', 'GeneC']


Gene Expression Levels

In [109]:
expression_levels = [2.5, 3.1, 2.8, 3.0, 2.7]

# Normalize expression levels
normalized_levels = [level / max(expression_levels) for level in expression_levels]

print(f"Original Expression Levels: {expression_levels}")
print(f"Normalized Expression Levels: {normalized_levels}")

Original Expression Levels: [2.5, 3.1, 2.8, 3.0, 2.7]
Normalized Expression Levels: [0.8064516129032258, 1.0, 0.9032258064516128, 0.9677419354838709, 0.8709677419354839]


Filtering Genes by Expression

In [110]:
high_expression_genes = [gene for gene, level in zip(gene_list, expression_levels) if level > 2.8]

print(f"Genes with High Expression: {high_expression_genes}")

Genes with High Expression: ['GeneB', 'GeneD']


# Genomic Coordinates

In [111]:
gene_coordinates = [(1000, 1500), (2000, 2500), (3000, 3500)]

# Calculate the total genomic span
total_genomic_span = sum(end - start for start, end in gene_coordinates)

print(f"Total Genomic Span: {total_genomic_span} bp")

Total Genomic Span: 1500 bp


Sequences in FASTA Format

In [112]:
fasta_sequences = [">GeneA", "ATGCGTACGCTAGCTAGCTAGCG", ">GeneB", "ATCGATCGATCGTACGTAGCTAGC"]

# Extract gene names and sequences
gene_names = [line[1:] for line in fasta_sequences if line.startswith(">")]
sequences = [line for line in fasta_sequences if not line.startswith(">")]

print(f"Gene Names: {gene_names}")
print(f"Sequences: {sequences}")

Gene Names: ['GeneA', 'GeneB']
Sequences: ['ATGCGTACGCTAGCTAGCTAGCG', 'ATCGATCGATCGTACGTAGCTAGC']


Pathways and Reactions

In [113]:
pathway_data = [("PathwayA", ["GeneA", "GeneB", "GeneC"], ["Reaction1", "Reaction2"]),
("PathwayB", ["GeneD", "GeneE"], ["Reaction3", "Reaction4"]),
("PathwayC", ["GeneF"], ["Reaction5"])]

# Extract genes in PathwayA
pathway_a_genes = [gene for pathway, genes, reactions in pathway_data if pathway == "PathwayA"]

print(f"Genes in PathwayA: {pathway_a_genes}")

Genes in PathwayA: ['GeneC']


# Immutable Nature of Tuples in Bioinformatics:

Storing Coordinates

In [114]:
gene_coordinates = ((1000, 1500), (2000, 2500), (3000, 3500))

# Pairing Data

In [115]:
gene_data = [("GeneA", 2.5), ("GeneB", 3.1), ("GeneC", 2.8)]

Positional Information

In [116]:
dna_sequence = tuple("ATGCGTACGCTAGCTAGCTAGCG")

 Biological Relationships

gene_relationships = (("GeneA", "GeneB"), ("GeneB", "GeneC"), ("GeneD", "GeneE"))

Multiple Data Types

In [117]:
gene_relationships = (("GeneA", "GeneB"), ("GeneB", "GeneC"), ("GeneD", "GeneE"))

Dictionaries with Tuples

In [118]:
gene_annotations = {("GeneA", "X"): {"length": 1200, "start_position": 1000},
("GeneB", "Y"): {"length": 1500, "start_position": 2000}}

Frequency Distribution

In [119]:
nucleotide_counts = tuple((base, dna_sequence.count(base)) for base in 'ATCG')


Biological Sequences

In [120]:
protein_sequence = tuple("MHTGKVY")

# Set Operations for Unique Identifiers in Bioinformatics

Gene IDs

In [121]:
gene_ids_set = {"GeneA", "GeneB", "GeneC", "GeneD", "GeneE"}

Pathway Members

In [122]:
pathway_a_genes = {"GeneA", "GeneB", "GeneC"}
pathway_b_genes = {"GeneD", "GeneE"}

Intersection of Sets

In [123]:
common_genes = pathway_a_genes.intersection(pathway_b_genes)
print(f"Common Genes: {common_genes}")

Common Genes: set()


Union of Sets

In [124]:
all_genes = pathway_a_genes.union(pathway_b_genes)
print(f"All Genes: {all_genes}")

All Genes: {'GeneA', 'GeneD', 'GeneB', 'GeneE', 'GeneC'}


Difference of Sets

In [125]:
unique_to_pathway_a = pathway_a_genes.difference(pathway_b_genes)
print(f"Genes Unique to PathwayA: {unique_to_pathway_a}")

Genes Unique to PathwayA: {'GeneA', 'GeneC', 'GeneB'}


Symmetric Difference

In [126]:
unique_to_pathway_a_or_b = pathway_a_genes.symmetric_difference(pathway_b_genes)
print(f"Genes Unique to PathwayA or PathwayB: {unique_to_pathway_a_or_b}")

Genes Unique to PathwayA or PathwayB: {'GeneA', 'GeneC', 'GeneD', 'GeneB', 'GeneE'}


Checking Membership

In [127]:
is_gene_in_pathway = "GeneA" in pathway_a_genes
print(f"Is GeneA in PathwayA? {is_gene_in_pathway}")

Is GeneA in PathwayA? True


Subset Check

In [128]:
is_pathway_a_subset = pathway_a_genes.issubset(all_genes)
print(f"Is PathwayA a Subset of All Genes? {is_pathway_a_subset}")

Is PathwayA a Subset of All Genes? True


# Conditional Statements for Decision-Making in Bioinformatics

In [129]:
sequence_quality = 25

if sequence_quality >= 30:
    print("High-quality data. Proceed with analysis.")
else:
    print("Low-quality data. Consider re-sequencing or trimming.")

Low-quality data. Consider re-sequencing or trimming.


Filtering Genes by Expression Levels

In [130]:
expression_level = 2.7

if expression_level > 2.5:
    print("Gene expression is higher than threshold. Include in analysis.")
else:
    print("Low gene expression. Exclude from analysis.")

Gene expression is higher than threshold. Include in analysis.


Variant Calling in Genomic Data

In [131]:
variant_present = True

if variant_present:
    print("Variants detected. Further analysis required.")
else:
    print("No variants found. Continue with downstream analysis.")

Variants detected. Further analysis required.


Pathway Analysis

In [132]:
pathway_genes = ["GeneA", "GeneB", "GeneC"]

if "GeneA" in pathway_genes and "GeneB" in pathway_genes:
    print("Pathway enriched with key genes. Conduct pathway analysis.")
else:
    print("Insufficient key genes for pathway analysis.")

Pathway enriched with key genes. Conduct pathway analysis.


Selecting Analysis Method

In [133]:
data_type = "RNA-seq"

if data_type == "RNA-seq":
    print("Perform differential gene expression analysis.")
elif data_type == "ChIP-seq":
    print("Analyze protein-DNA interactions.")
else:
    print("Unsupported data type. Check compatibility.")

Perform differential gene expression analysis.


Identifying Coding vs. Non-coding Regions

In [134]:
sequence_type = "coding"

if sequence_type == "coding":
    print("Sequence represents a coding region.")
else:
    print("Sequence is non-coding.")

Sequence represents a coding region.


Handling Missing Data

In [135]:
data_missing = False

if data_missing:
    print("Missing data detected. Impute or address missing values.")
else:
    print("No missing data. Proceed with analysis.")

No missing data. Proceed with analysis.


Variant Annotation

In [136]:
functional_impact = "high"

if functional_impact == "high":
    print("High-impact variants. Prioritize for further investigation.")
else:
    print("Low-impact variants. Consider in the context of other analyses.")

High-impact variants. Prioritize for further investigation.


# Iterative Processes in Bioinformatics with For Loop

Processing Multiple Protein Sequences

In [137]:
from Bio.SeqUtils import molecular_weight

protein_sequences = ["MHTGKVY", "PLKQRTV", "SWSEVFRG"]

for sequence in protein_sequences:
    weight = molecular_weight(sequence, seq_type="protein")
    print(f"Molecular weight of {sequence}: {weight:.2f} Da")

Molecular weight of MHTGKVY: 834.98 Da
Molecular weight of PLKQRTV: 841.01 Da
Molecular weight of SWSEVFRG: 967.04 Da


Analyzing Codon Usage in Genomic Data

In [138]:
pip install --upgrade biopython

Defaulting to user installation because normal site-packages is not writeable
    pyparsing>=2.4*
             ~~~~~^[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [139]:
from Bio import SeqUtils

In [140]:
%pip show biopython

Name: biopython
Version: 1.84
Summary: Freely available tools for computational molecular biology.
Home-page: https://biopython.org/
Author: The Biopython Contributors
Author-email: biopython@biopython.org
License: 
Location: /home/tbkhori_/.local/lib/python3.9/site-packages
Requires: numpy
Required-by: #N/A
Note: you may need to restart the kernel to use updated packages.


In [141]:
from Bio.SeqUtils import *
print(dir())
gene_list = ["GeneA", "GeneB", "GeneC"]

#for gene in gene_list:
#    codon_table = CodonUsage.CodonsDict[gene]
#print(f"Codon usage for {gene}: {codon_table}")

['CodonAdaptationIndex', 'CodonTable', 'GC123', 'GC_skew', 'IUPACData', 'In', 'Out', 'Seq', 'SeqIO', 'SeqUtils', '_', '__', '___', '__builtin__', '__builtins__', '__doc__', '__loader__', '__name__', '__package__', '__spec__', '__vsc_ipynb_file__', '_dh', '_exit_code', '_i', '_i1', '_i10', '_i100', '_i101', '_i102', '_i103', '_i104', '_i105', '_i106', '_i107', '_i108', '_i109', '_i11', '_i110', '_i111', '_i112', '_i113', '_i114', '_i115', '_i116', '_i117', '_i118', '_i119', '_i12', '_i120', '_i121', '_i122', '_i123', '_i124', '_i125', '_i126', '_i127', '_i128', '_i129', '_i13', '_i130', '_i131', '_i132', '_i133', '_i134', '_i135', '_i136', '_i137', '_i138', '_i139', '_i14', '_i140', '_i141', '_i15', '_i16', '_i17', '_i18', '_i19', '_i2', '_i20', '_i21', '_i22', '_i23', '_i24', '_i25', '_i26', '_i27', '_i28', '_i29', '_i3', '_i30', '_i31', '_i32', '_i33', '_i34', '_i35', '_i36', '_i37', '_i38', '_i39', '_i4', '_i40', '_i41', '_i42', '_i43', '_i44', '_i45', '_i46', '_i47', '_i48', '_i49',

In [142]:
from Bio.Seq import Seq
from Bio.Data import CodonTable
import collections

def calculate_codon_usage(sequence):
    # Ensure the sequence length is divisible by 3
    if len(sequence) % 3 != 0:
        raise ValueError("Sequence length must be divisible by 3")
    
    # Split the sequence into codons
    codons = [sequence[i:i+3] for i in range(0, len(sequence), 3)]
    
    # Count the occurrences of each codon
    codon_count = collections.Counter(codons)
    
    # Calculate the frequency of each codon
    total_codons = sum(codon_count.values())
    codon_frequency = {codon: count / total_codons for codon, count in codon_count.items()}
    
    return codon_frequency

# Example usage
gene_list = ["GeneA", "GeneB", "GeneC"]

# Example sequences (replace these with your actual sequences)
sequences = {
    "GeneA": "ATGGCGTGA",
    "GeneB": "ATGTCATGA",
    "GeneC": "ATGCCCTGA"
}

for gene in gene_list:
    sequence = sequences[gene]
    codon_usage = calculate_codon_usage(sequence)
    print(f"Codon usage for {gene}:")
    for codon, frequency in codon_usage.items():
        print(f"  {codon}: {frequency:.4f}")
    print()

Codon usage for GeneA:
  ATG: 0.3333
  GCG: 0.3333
  TGA: 0.3333

Codon usage for GeneB:
  ATG: 0.3333
  TCA: 0.3333
  TGA: 0.3333

Codon usage for GeneC:
  ATG: 0.3333
  CCC: 0.3333
  TGA: 0.3333



Batch Processing of Sequences

In [143]:
from Bio.SeqUtils import gc_fraction

dna_sequences = ["ATGCGTACGCTAGCTAGCTAGCG", "ATCGATCGATCGTACGTAGCTAGC", "AGCTAGCTAGCTAGCGATCGATCGA"]

for sequence in dna_sequences:
    gc_content = gc_fraction(sequence) * 100
    print(f"GC content of {sequence}: {gc_content:.2f}%")

GC content of ATGCGTACGCTAGCTAGCTAGCG: 56.52%
GC content of ATCGATCGATCGTACGTAGCTAGC: 50.00%
GC content of AGCTAGCTAGCTAGCGATCGATCGA: 52.00%


Comparing Multiple Protein Structures

In [144]:
gene_exon_structures = {"GeneA": (2, 4), "GeneB": (3, 5), "GeneC": (4, 6)}

for gene, structure in gene_exon_structures.items():
    exons, introns = structure
print(f"{gene} has {exons} exons and {introns} introns.")

GeneC has 4 exons and 6 introns.


Analyzing Exon-Intron Structures

In [145]:
protein_structures = {"ProteinA": "MHTGKVY", "ProteinB": "PLKQRTV", "ProteinC": "SWSEVFRG"}

for protein, sequence in protein_structures.items():
    length = len(sequence)
print(f"Length of {protein}: {length} amino acids")

Length of ProteinC: 8 amino acids


Filtering Variants in Genomic Data

In [146]:
variant_data = [("SNP1", 30), ("SNP2", 25), ("SNP3", 35)]

high_quality_variants = []

for variant, quality in variant_data:
    if quality >= 30:
        high_quality_variants.append(variant)

print(f"High-quality variants: {high_quality_variants}")

High-quality variants: ['SNP1', 'SNP3']


Automating File Processing

In [None]:
import os
from Bio import SeqIO

fasta_directory = "/path/to/fasta/files"

for filename in os.listdir(fasta_directory):
    if filename.endswith(".fasta"):
        filepath = os.path.join(fasta_directory, filename)
        sequence_record = SeqIO.read(filepath, "fasta")
        gc_content = GC(sequence_record.seq)
print(f"GC content of {filename}: {gc_content:.2f}%")

# Utilizing While Loops for Continuous Data Analysis in Bioinformatics