# Basic Concepts of Python for Bioinformatics:

Variables and Data Types

In [13]:
name = "Bioinformatics"
count = 42
percentage = 75.5
gene_list = ["GeneA", "GeneB", "GeneC"]

Control Flow

In [14]:
for gene in gene_list:
 if len(gene) > 5:
  print(gene)
else:
 print("Short gene:", gene)

Short gene: GeneC


Functions

In [15]:
def calculate_gc_content(sequence):
    gc_count = sequence.count('G') + sequence.count('C')
    total_bases = len(sequence)
    gc_content = (gc_count / total_bases) * 100
    return gc_content

File Handling

In [16]:
#with open('sequences.txt', 'r') as file:
    #sequences = file.readlines()

Bioinformatics Libraries

In [17]:
from Bio import SeqIO

#sequence_record = SeqIO.read("sequence.fasta", "fasta")

# Importance of Clean Code and Documentation in Bioinformatics Projects:

Readability

In [18]:
# Bad
#x = dna.count('G') + dna.count('C')

# Good
#gc_count = dna.count('G') + dna.count('C')

Modularity

In [19]:
# Bad
#def analyze_sequence(data):
        # ...

# Good
#def calculate_gc_content(sequence):
    # ...

#def analyze_sequence(data):
    #gc_content = calculate_gc_content(data)
    # ...

Documentation

In [20]:
def calculate_gc_content(sequence):
    """
Calculate the GC content of a DNA sequence.

Parameters:
- sequence (str): DNA sequence.

Returns:
- float: GC content percentage.
"""
# ...

# Basic Input and Output in Python for Bioinformatics:

Input Methods for Biological Data

a. Reading from Files

In [21]:
from Bio import SeqIO

# Reading a FASTA file
fasta_file = "sequence.fasta"
#sequence_record = SeqIO.read(fasta_file, "fasta")

b. User Input

In [22]:
#user_sequence = input("Enter a DNA sequence: ")

Formatted Output for Interpretation

a. Print Statements

In [23]:
gc_content = 45.2
print("GC Content:", gc_content, "%")

GC Content: 45.2 %


b. Formatted Strings

In [24]:
gene_name = "GeneA"
expression_level = 2.5

output_message = f"The expression level of {gene_name} is {expression_level} units."
print(output_message)

The expression level of GeneA is 2.5 units.


c. Tabular Data

In [25]:
genes = ["GeneA", "GeneB", "GeneC"]
expression_levels = [2.5, 3.1, 1.8]

print("Gene\tExpression Level")
for gene, level in zip(genes, expression_levels):
    print(f"{gene}\t{level}")

Gene	Expression Level
GeneA	2.5
GeneB	3.1
GeneC	1.8


In [26]:
output_file = "results.txt"
with open(output_file, 'w') as file:
    file.write(f"Gene\tExpression Level\n")
    for gene, level in zip(genes, expression_levels):
        file.write(f"{gene}\t{level}\n")

In [27]:
organism = "Homo sapiens"
num_genes = 1500

summary_message = f"The genome of {organism} contains {num_genes} genes."
print(summary_message)



The genome of Homo sapiens contains 1500 genes.


# Mathematical Operations in Bioinformatics:

GC Content Calculation

In [28]:
def calculate_gc_content(sequence):
    gc_count = sequence.count('G') + sequence.count('C')
    total_bases = len(sequence)
    gc_content = (gc_count / total_bases) * 100
    return gc_content

# Example
dna_sequence = "ATGCGATCGATCGTACG"
gc_percentage = calculate_gc_content(dna_sequence)
print(f"GC Content: {gc_percentage:.2f}%")

GC Content: 52.94%


Transcription Factor Binding Site Analysis

In [29]:
def find_motif_occurrences(sequence, motif):
    occurrences = []
    i = sequence.find(motif)
    while i != -1:
        occurrences.append(i)
        i = sequence.find(motif, i + 1)
    return occurrences

# Example
gene_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
motif = "CTAG"
motif_occurrences = find_motif_occurrences(gene_sequence, motif)
print(f"Motif '{motif}' found at positions: {motif_occurrences}")

Motif 'CTAG' found at positions: [9, 13, 17]


In [30]:
%pip install --upgrade numpy scipy

Defaulting to user installation because normal site-packages is not writeable
    pyparsing>=2.4*
             ~~~~~^[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Statistical Analysis

In [31]:
import numpy as np
import scipy
from scipy.stats import ttest_ind

# Example: Gene expression levels in two conditions
condition_1 = np.array([2.5, 3.1, 2.8, 3.0, 2.7])
condition_2 = np.array([1.8, 2.2, 2.0, 2.5, 2.1])

# Mean and standard deviation
mean_condition_1 = np.mean(condition_1)
mean_condition_2 = np.mean(condition_2)

std_dev_condition_1 = np.std(condition_1)
std_dev_condition_2 = np.std(condition_2)

# Independent t-test
t_stat, p_value = ttest_ind(condition_1, condition_2)

print(f"Mean Condition 1: {mean_condition_1:.2f}")
print(f"Mean Condition 2: {mean_condition_2:.2f}")
print(f"Standard Deviation Condition 1: {std_dev_condition_1:.2f}")
print(f"Standard Deviation Condition 2: {std_dev_condition_2:.2f}")
print(f"T-statistic: {t_stat:.2f}")
print(f"P-value: {p_value:.4f}")

Mean Condition 1: 2.82
Mean Condition 2: 2.12
Standard Deviation Condition 1: 0.21
Standard Deviation Condition 2: 0.23
T-statistic: 4.45
P-value: 0.0022


Protein Folding

In [32]:
import math

def calculate_protein_folding_energy(temperature, entropy_change):
    gas_constant = 8.314 # J/(mol*K)
    delta_g = -temperature * entropy_change
    folding_energy = math.exp(-delta_g / (gas_constant * temperature))
    return folding_energy

# Example
temperature = 298 # in Kelvin
entropy_change = -50 # in J/(mol*K)
folding_energy = calculate_protein_folding_energy(temperature, entropy_change)
print(f"Protein Folding Energy: {folding_energy:.4f}")

Protein Folding Energy: 0.0024


# String Manipulation Techniques in Bioinformatics:

Sequence Alignment

In [33]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

sequence_1 = "ATGCGTACGCTAGCTAGCTAGCG"
sequence_2 = "ATGCGATCGATCGTACG"

alignments = pairwise2.align.globalxx(sequence_1, sequence_2, one_alignment_only=True)
best_alignment = alignments[0]

print("Sequence Alignment:")
print(format_alignment(*best_alignment))

Sequence Alignment:
ATGCGTA-CGCTAGCTAGC-TAGCG
||||| | ||  |  |  | || ||
ATGCG-ATCG--A--T--CGTA-CG
  Score=15





Reverse Complement

In [34]:
def reverse_complement(sequence):
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(complement[base] for base in reversed(sequence))

# Example
dna_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
rev_comp_sequence = reverse_complement(dna_sequence)
print(f"Original Sequence: {dna_sequence}")
print(f"Reverse Complement: {rev_comp_sequence}")

Original Sequence: ATGCGTACGCTAGCTAGCTAGCG
Reverse Complement: CGCTAGCTAGCTAGCGTACGCAT


Motif Extraction

In [35]:
def extract_motifs(sequence, motif_length):
    motifs = [sequence[i:i+motif_length] for i in range(len(sequence)-motif_length+1)]
    return motifs

# Example
gene_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
motif_length = 3
gene_motifs = extract_motifs(gene_sequence, motif_length)
print(f"Gene Motifs: {gene_motifs}")

Gene Motifs: ['ATG', 'TGC', 'GCG', 'CGT', 'GTA', 'TAC', 'ACG', 'CGC', 'GCT', 'CTA', 'TAG', 'AGC', 'GCT', 'CTA', 'TAG', 'AGC', 'GCT', 'CTA', 'TAG', 'AGC', 'GCG']


Counting Nucleotides

In [36]:
def count_nucleotides(sequence):
    nucleotide_counts = {base: sequence.count(base) for base in 'ATCG'}
    return nucleotide_counts

# Example
dna_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
nucleotide_counts = count_nucleotides(dna_sequence)
print("Nucleotide Counts:")
for base, count in nucleotide_counts.items():
    print(f"{base}: {count}")

Nucleotide Counts:
A: 5
T: 5
C: 6
G: 7


Searching for Motifs

In [37]:
def find_motif_positions(sequence, motif):
    positions = [i for i in range(len(sequence)-len(motif)+1) if sequence[i:i+len(motif)] == motif]
    return positions

# Example
gene_sequence = "ATGCGTACGCTAGCTAGCTAGCG"
motif = "CTAG"
motif_positions = find_motif_positions(gene_sequence, motif)
print(f"Motif '{motif}' found at positions: {motif_positions}")

Motif 'CTAG' found at positions: [9, 13, 17]


Concatenation and Slicing

In [38]:
header = ">geneA|description"
gene_name = header.split('|')[0][1:]
description = header.split('|')[1]

print(f"Gene Name: {gene_name}")
print(f"Description: {description}")

Gene Name: geneA
Description: description


# Iterable Objects in Python (Dictionaries, Lists, Tuples, Sets)

Representation of Genetic Information

In [39]:
gene_sequences = {
"GeneA": "ATGCGTACGCTAGCTAGCTAGCG",
"GeneB": "ATCGATCGATCGTACGTAGCTAGC",
"GeneC": "ATCGATCGATCGATCGATCG"
}

# Accessing sequence for GeneA
gene_a_sequence = gene_sequences["GeneA"]
print(f"Sequence for GeneA: {gene_a_sequence}")

Sequence for GeneA: ATGCGTACGCTAGCTAGCTAGCG


Gene Annotations

In [40]:
gene_annotations = {
"GeneA": {"length": 21, "chromosome": "X", "start_position": 1000},
"GeneB": {"length": 24, "chromosome": "Y", "start_position": 1500},
"GeneC": {"length": 18, "chromosome": "2", "start_position": 2000}
}

# Accessing annotation for GeneB
gene_b_annotation = gene_annotations["GeneB"]
print("Annotation for GeneB:")
for key, value in gene_b_annotation.items():
    print(f"{key}: {value}")

Annotation for GeneB:
length: 24
chromosome: Y
start_position: 1500


Codon Usage Table

In [41]:
codon_usage = {
"ATG": 0.85,
"TAA": 0.05,
"TAG": 0.03,
"TGA": 0.02,
"GCT": 0.10,
"GCC": 0.15,
"GCA": 0.30,
"GCG": 0.45
}

# Accessing frequency for GCT codon
gct_frequency = codon_usage.get("GCT", 0)
print(f"Frequency of GCT codon: {gct_frequency}")

Frequency of GCT codon: 0.1


Protein Structures

In [42]:
protein_structures = {
"ProteinA": {"length": 300, "secondary_structure": "alpha-helix", "domains": ["A", "B", "C"]},
"ProteinB": {"length": 200, "secondary_structure": "beta-sheet", "domains": ["X", "Y"]},
"ProteinC": {"length": 400, "secondary_structure": "random-coil", "domains": ["M", "N", "O"]}
}

# Accessing information for ProteinC
protein_c_info = protein_structures.get("ProteinC", {})
print("Information for ProteinC:")
for key, value in protein_c_info.items():
    print(f"{key}: {value}")

Information for ProteinC:
length: 400
secondary_structure: random-coil
domains: ['M', 'N', 'O']


Pathways and Interactions

In [43]:
pathway_interactions = {
"PathwayA": {"genes": ["GeneA", "GeneB", "GeneC"], "reactions": ["Reaction1", "Reaction2"]},
"PathwayB": {"genes": ["GeneD", "GeneE"], "reactions": ["Reaction3", "Reaction4"]},
"PathwayC": {"genes": ["GeneF"], "reactions": ["Reaction5"]}
}

# Accessing genes for PathwayA
pathway_a_genes = pathway_interactions.get("PathwayA", {}).get("genes", [])
print(f"Genes in PathwayA: {pathway_a_genes}")

Genes in PathwayA: ['GeneA', 'GeneB', 'GeneC']


# Usage of Lists in Bioinformatics:

List of Genes

In [44]:
gene_list = ["GeneA", "GeneB", "GeneC", "GeneD", "GeneE"]

Length Distribution

In [45]:
gene_lengths = [1200, 800, 1500, 1000, 900]

# Calculate average length
average_length = sum(gene_lengths) / len(gene_lengths)

# Filter genes longer than the average length
long_genes = [gene for gene, length in zip(gene_list, gene_lengths) if length > average_length]

print(f"Average Gene Length: {average_length:.2f} bp")
print(f"Genes Longer than Average: {long_genes}")

Average Gene Length: 1080.00 bp
Genes Longer than Average: ['GeneA', 'GeneC']


Gene Expression Levels

In [46]:
expression_levels = [2.5, 3.1, 2.8, 3.0, 2.7]

# Normalize expression levels
normalized_levels = [level / max(expression_levels) for level in expression_levels]

print(f"Original Expression Levels: {expression_levels}")
print(f"Normalized Expression Levels: {normalized_levels}")

Original Expression Levels: [2.5, 3.1, 2.8, 3.0, 2.7]
Normalized Expression Levels: [0.8064516129032258, 1.0, 0.9032258064516128, 0.9677419354838709, 0.8709677419354839]


Filtering Genes by Expression

In [47]:
high_expression_genes = [gene for gene, level in zip(gene_list, expression_levels) if level > 2.8]

print(f"Genes with High Expression: {high_expression_genes}")

Genes with High Expression: ['GeneB', 'GeneD']


# Genomic Coordinates

In [48]:
gene_coordinates = [(1000, 1500), (2000, 2500), (3000, 3500)]

# Calculate the total genomic span
total_genomic_span = sum(end - start for start, end in gene_coordinates)

print(f"Total Genomic Span: {total_genomic_span} bp")

Total Genomic Span: 1500 bp


Sequences in FASTA Format

In [49]:
fasta_sequences = [">GeneA", "ATGCGTACGCTAGCTAGCTAGCG", ">GeneB", "ATCGATCGATCGTACGTAGCTAGC"]

# Extract gene names and sequences
gene_names = [line[1:] for line in fasta_sequences if line.startswith(">")]
sequences = [line for line in fasta_sequences if not line.startswith(">")]

print(f"Gene Names: {gene_names}")
print(f"Sequences: {sequences}")

Gene Names: ['GeneA', 'GeneB']
Sequences: ['ATGCGTACGCTAGCTAGCTAGCG', 'ATCGATCGATCGTACGTAGCTAGC']


Pathways and Reactions

In [50]:
pathway_data = [("PathwayA", ["GeneA", "GeneB", "GeneC"], ["Reaction1", "Reaction2"]),
("PathwayB", ["GeneD", "GeneE"], ["Reaction3", "Reaction4"]),
("PathwayC", ["GeneF"], ["Reaction5"])]

# Extract genes in PathwayA
pathway_a_genes = [gene for pathway, genes, reactions in pathway_data if pathway == "PathwayA"]

print(f"Genes in PathwayA: {pathway_a_genes}")

Genes in PathwayA: ['GeneC']


# Immutable Nature of Tuples in Bioinformatics:

Storing Coordinates

In [51]:
gene_coordinates = ((1000, 1500), (2000, 2500), (3000, 3500))

# Pairing Data

In [52]:
gene_data = [("GeneA", 2.5), ("GeneB", 3.1), ("GeneC", 2.8)]

Positional Information

In [53]:
dna_sequence = tuple("ATGCGTACGCTAGCTAGCTAGCG")

 Biological Relationships

gene_relationships = (("GeneA", "GeneB"), ("GeneB", "GeneC"), ("GeneD", "GeneE"))

Multiple Data Types

In [54]:
gene_relationships = (("GeneA", "GeneB"), ("GeneB", "GeneC"), ("GeneD", "GeneE"))

Dictionaries with Tuples

In [55]:
gene_annotations = {("GeneA", "X"): {"length": 1200, "start_position": 1000},
("GeneB", "Y"): {"length": 1500, "start_position": 2000}}

Frequency Distribution

In [56]:
nucleotide_counts = tuple((base, dna_sequence.count(base)) for base in 'ATCG')


Biological Sequences

In [57]:
protein_sequence = tuple("MHTGKVY")

# Set Operations for Unique Identifiers in Bioinformatics

Gene IDs

In [58]:
gene_ids_set = {"GeneA", "GeneB", "GeneC", "GeneD", "GeneE"}

Pathway Members

In [59]:
pathway_a_genes = {"GeneA", "GeneB", "GeneC"}
pathway_b_genes = {"GeneD", "GeneE"}

Intersection of Sets

In [60]:
common_genes = pathway_a_genes.intersection(pathway_b_genes)
print(f"Common Genes: {common_genes}")

Common Genes: set()


Union of Sets

In [61]:
all_genes = pathway_a_genes.union(pathway_b_genes)
print(f"All Genes: {all_genes}")

All Genes: {'GeneE', 'GeneD', 'GeneA', 'GeneB', 'GeneC'}


Difference of Sets

In [62]:
unique_to_pathway_a = pathway_a_genes.difference(pathway_b_genes)
print(f"Genes Unique to PathwayA: {unique_to_pathway_a}")

Genes Unique to PathwayA: {'GeneA', 'GeneB', 'GeneC'}


Symmetric Difference

In [63]:
unique_to_pathway_a_or_b = pathway_a_genes.symmetric_difference(pathway_b_genes)
print(f"Genes Unique to PathwayA or PathwayB: {unique_to_pathway_a_or_b}")

Genes Unique to PathwayA or PathwayB: {'GeneE', 'GeneA', 'GeneB', 'GeneD', 'GeneC'}


Checking Membership

In [64]:
is_gene_in_pathway = "GeneA" in pathway_a_genes
print(f"Is GeneA in PathwayA? {is_gene_in_pathway}")

Is GeneA in PathwayA? True


Subset Check

In [65]:
is_pathway_a_subset = pathway_a_genes.issubset(all_genes)
print(f"Is PathwayA a Subset of All Genes? {is_pathway_a_subset}")

Is PathwayA a Subset of All Genes? True


# Conditional Statements for Decision-Making in Bioinformatics

In [66]:
sequence_quality = 25

if sequence_quality >= 30:
    print("High-quality data. Proceed with analysis.")
else:
    print("Low-quality data. Consider re-sequencing or trimming.")

Low-quality data. Consider re-sequencing or trimming.


Filtering Genes by Expression Levels

In [67]:
expression_level = 2.7

if expression_level > 2.5:
    print("Gene expression is higher than threshold. Include in analysis.")
else:
    print("Low gene expression. Exclude from analysis.")

Gene expression is higher than threshold. Include in analysis.


Variant Calling in Genomic Data

In [68]:
variant_present = True

if variant_present:
    print("Variants detected. Further analysis required.")
else:
    print("No variants found. Continue with downstream analysis.")

Variants detected. Further analysis required.


Pathway Analysis

In [69]:
pathway_genes = ["GeneA", "GeneB", "GeneC"]

if "GeneA" in pathway_genes and "GeneB" in pathway_genes:
    print("Pathway enriched with key genes. Conduct pathway analysis.")
else:
    print("Insufficient key genes for pathway analysis.")

Pathway enriched with key genes. Conduct pathway analysis.


Selecting Analysis Method

In [70]:
data_type = "RNA-seq"

if data_type == "RNA-seq":
    print("Perform differential gene expression analysis.")
elif data_type == "ChIP-seq":
    print("Analyze protein-DNA interactions.")
else:
    print("Unsupported data type. Check compatibility.")

Perform differential gene expression analysis.


Identifying Coding vs. Non-coding Regions

In [71]:
sequence_type = "coding"

if sequence_type == "coding":
    print("Sequence represents a coding region.")
else:
    print("Sequence is non-coding.")

Sequence represents a coding region.


Handling Missing Data

In [72]:
data_missing = False

if data_missing:
    print("Missing data detected. Impute or address missing values.")
else:
    print("No missing data. Proceed with analysis.")

No missing data. Proceed with analysis.


Variant Annotation

In [73]:
functional_impact = "high"

if functional_impact == "high":
    print("High-impact variants. Prioritize for further investigation.")
else:
    print("Low-impact variants. Consider in the context of other analyses.")

High-impact variants. Prioritize for further investigation.


# Iterative Processes in Bioinformatics with For Loop

Processing Multiple Protein Sequences

In [74]:
from Bio.SeqUtils import molecular_weight

protein_sequences = ["MHTGKVY", "PLKQRTV", "SWSEVFRG"]

for sequence in protein_sequences:
    weight = molecular_weight(sequence, seq_type="protein")
    print(f"Molecular weight of {sequence}: {weight:.2f} Da")

Molecular weight of MHTGKVY: 834.98 Da
Molecular weight of PLKQRTV: 841.01 Da
Molecular weight of SWSEVFRG: 967.04 Da


Analyzing Codon Usage in Genomic Data

In [75]:
pip install --upgrade biopython

Defaulting to user installation because normal site-packages is not writeable
    pyparsing>=2.4*
             ~~~~~^[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [76]:
from Bio import SeqUtils

In [77]:
%pip show biopython

Name: biopython
Version: 1.84
Summary: Freely available tools for computational molecular biology.
Home-page: https://biopython.org/
Author: The Biopython Contributors
Author-email: biopython@biopython.org
License: 
Location: /home/tbkhori_/.local/lib/python3.9/site-packages
Requires: numpy
Required-by: #N/A
Note: you may need to restart the kernel to use updated packages.


In [78]:
from Bio.SeqUtils import *
print(dir())
gene_list = ["GeneA", "GeneB", "GeneC"]

#for gene in gene_list:
#    codon_table = CodonUsage.CodonsDict[gene]
#print(f"Codon usage for {gene}: {codon_table}")

['CodonAdaptationIndex', 'GC123', 'GC_skew', 'IUPACData', 'In', 'Out', 'Phylo', 'Seq', 'SeqIO', 'SeqUtils', '_', '__', '___', '__builtin__', '__builtins__', '__doc__', '__loader__', '__name__', '__package__', '__spec__', '__vsc_ipynb_file__', '_dh', '_exit_code', '_i', '_i1', '_i10', '_i11', '_i12', '_i13', '_i14', '_i15', '_i16', '_i17', '_i18', '_i19', '_i2', '_i20', '_i21', '_i22', '_i23', '_i24', '_i25', '_i26', '_i27', '_i28', '_i29', '_i3', '_i30', '_i31', '_i32', '_i33', '_i34', '_i35', '_i36', '_i37', '_i38', '_i39', '_i4', '_i40', '_i41', '_i42', '_i43', '_i44', '_i45', '_i46', '_i47', '_i48', '_i49', '_i5', '_i50', '_i51', '_i52', '_i53', '_i54', '_i55', '_i56', '_i57', '_i58', '_i59', '_i6', '_i60', '_i61', '_i62', '_i63', '_i64', '_i65', '_i66', '_i67', '_i68', '_i69', '_i7', '_i70', '_i71', '_i72', '_i73', '_i74', '_i75', '_i76', '_i77', '_i78', '_i8', '_i9', '_ih', '_ii', '_iii', '_oh', 'alignments', 'all_genes', 'average_length', 'base', 'bed_file_path', 'best_alignment'

In [79]:
from Bio.Seq import Seq
from Bio.Data import CodonTable
import collections

def calculate_codon_usage(sequence):
    # Ensure the sequence length is divisible by 3
    if len(sequence) % 3 != 0:
        raise ValueError("Sequence length must be divisible by 3")
    
    # Split the sequence into codons
    codons = [sequence[i:i+3] for i in range(0, len(sequence), 3)]
    
    # Count the occurrences of each codon
    codon_count = collections.Counter(codons)
    
    # Calculate the frequency of each codon
    total_codons = sum(codon_count.values())
    codon_frequency = {codon: count / total_codons for codon, count in codon_count.items()}
    
    return codon_frequency

# Example usage
gene_list = ["GeneA", "GeneB", "GeneC"]

# Example sequences (replace these with your actual sequences)
sequences = {
    "GeneA": "ATGGCGTGA",
    "GeneB": "ATGTCATGA",
    "GeneC": "ATGCCCTGA"
}

for gene in gene_list:
    sequence = sequences[gene]
    codon_usage = calculate_codon_usage(sequence)
    print(f"Codon usage for {gene}:")
    for codon, frequency in codon_usage.items():
        print(f"  {codon}: {frequency:.4f}")
    print()

Codon usage for GeneA:
  ATG: 0.3333
  GCG: 0.3333
  TGA: 0.3333

Codon usage for GeneB:
  ATG: 0.3333
  TCA: 0.3333
  TGA: 0.3333

Codon usage for GeneC:
  ATG: 0.3333
  CCC: 0.3333
  TGA: 0.3333



Batch Processing of Sequences

In [80]:
from Bio.SeqUtils import gc_fraction

dna_sequences = ["ATGCGTACGCTAGCTAGCTAGCG", "ATCGATCGATCGTACGTAGCTAGC", "AGCTAGCTAGCTAGCGATCGATCGA"]

for sequence in dna_sequences:
    gc_content = gc_fraction(sequence) * 100
    print(f"GC content of {sequence}: {gc_content:.2f}%")

GC content of ATGCGTACGCTAGCTAGCTAGCG: 56.52%
GC content of ATCGATCGATCGTACGTAGCTAGC: 50.00%
GC content of AGCTAGCTAGCTAGCGATCGATCGA: 52.00%


Comparing Multiple Protein Structures

In [81]:
gene_exon_structures = {"GeneA": (2, 4), "GeneB": (3, 5), "GeneC": (4, 6)}

for gene, structure in gene_exon_structures.items():
    exons, introns = structure
print(f"{gene} has {exons} exons and {introns} introns.")

GeneC has 4 exons and 6 introns.


Analyzing Exon-Intron Structures

In [82]:
protein_structures = {"ProteinA": "MHTGKVY", "ProteinB": "PLKQRTV", "ProteinC": "SWSEVFRG"}

for protein, sequence in protein_structures.items():
    length = len(sequence)
print(f"Length of {protein}: {length} amino acids")

Length of ProteinC: 8 amino acids


Filtering Variants in Genomic Data

In [83]:
variant_data = [("SNP1", 30), ("SNP2", 25), ("SNP3", 35)]

high_quality_variants = []

for variant, quality in variant_data:
    if quality >= 30:
        high_quality_variants.append(variant)

print(f"High-quality variants: {high_quality_variants}")

High-quality variants: ['SNP1', 'SNP3']


Automating File Processing

In [None]:
import os
from Bio import SeqIO

fasta_directory = "/path/to/fasta/files"

for filename in os.listdir(fasta_directory):
    if filename.endswith(".fasta"):
        filepath = os.path.join(fasta_directory, filename)
        sequence_record = SeqIO.read(filepath, "fasta")
        gc_content = GC(sequence_record.seq)
print(f"GC content of {filename}: {gc_content:.2f}%")

# Utilizing While Loops for Continuous Data Analysis in Bioinformatics

Continuous Monitoring of Gene Expression

In [85]:
import random

target_expression = 3.0
current_expression = 2.0

while current_expression < target_expression:
# Simulating gene expression changes
    current_expression += random.uniform(0.1, 0.5)
print(f"Current gene expression: {current_expression:.2f}")

print("Target gene expression reached. Initiating downstream analysis.")

Current gene expression: 3.28
Target gene expression reached. Initiating downstream analysis.


 Iterative Parameter Optimization

In [86]:
import numpy as np

target_accuracy = 0.95
current_accuracy = 0.80
learning_rate = 0.02

while current_accuracy < target_accuracy:
# Simulating parameter optimization
    current_accuracy += np.random.normal(loc=learning_rate, scale=0.01)
    learning_rate *= 0.95 # Decay the learning rate
print(f"Current accuracy: {current_accuracy:.3f}")

print("Optimal parameters achieved. Proceeding with analysis.")

Current accuracy: 0.956
Optimal parameters achieved. Proceeding with analysis.


Continuous Data Streaming Analysis

In [87]:
import time

threshold_value = 10
current_value = 0

while current_value < threshold_value:
# Simulating continuous data stream
    current_value += 1
    time.sleep(1) # Simulating a pause between data points
print(f"Current value in data stream: {current_value}")

print("Stopping criterion met. Finalizing data analysis.")

Current value in data stream: 10
Stopping criterion met. Finalizing data analysis.


Dynamic Threshold Adjustment

In [88]:
import pandas as pd

data = pd.Series([5, 8, 7, 11, 9, 12, 10])
dynamic_threshold = 10

while any(data > dynamic_threshold):
# Adjusting the threshold dynamically based on data
    dynamic_threshold += 1
print(f"Dynamic threshold adjusted to: {dynamic_threshold}")

print("Dynamic threshold met. Initiating further analysis.")

Dynamic threshold adjusted to: 12
Dynamic threshold met. Initiating further analysis.


Adaptive Filtering in Sequencing Data

In [89]:
from Bio.SeqUtils import molecular_weight

target_molecular_weight = 30000
current_molecular_weight = 25000

while current_molecular_weight < target_molecular_weight:
# Simulating adaptive filtering in sequence data
    current_molecular_weight += random.uniform(500, 1000)
print(f"Current molecular weight: {current_molecular_weight:.2f}")

print("Target molecular weight reached. Proceeding with sequence analysis.")

Current molecular weight: 30120.62
Target molecular weight reached. Proceeding with sequence analysis.


# File Handling in Python
Techniques for Reading Various Bioinformatics File Formats:

FASTA Files (Genomic Sequences)

In [None]:
from Bio import SeqIO

fasta_file_path = "genome.fasta"

sequences = []
for record in SeqIO.parse(fasta_file_path, "fasta"):
    sequences.append(record.seq)

print("Genomic Sequences:")
for sequence in sequences:
    print(sequence)

FASTQ Files (Sequencing Reads with Quality Scores)

In [None]:
from Bio import SeqIO

fastq_file_path = "sequencing_reads.fastq"

reads = []
qualities = []
for record in SeqIO.parse(fastq_file_path, "fastq"):
    reads.append(str(record.seq))
    qualities.append(record.letter_annotations["phred_quality"])

print("Sequencing Reads:")
print(reads)
print("Quality Scores:")
print(qualities)

GFF/GTF Files (Genomic Feature Annotations)

In [None]:
import pandas as pd

gff_file_path = "annotations.gff"

# Assuming GFF file columns: ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
annotations_df = pd.read_csv(gff_file_path, sep='\t', header=None, comment='#', names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])

print("Genomic Feature Annotations:")
print(annotations_df)

VCF Files (Variant Call Format)

In [None]:
import pandas as pd

vcf_file_path = "variants.vcf"

# Assuming VCF file columns: ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'Sample1', 'Sample2']
variants_df = pd.read_csv(vcf_file_path, sep='\t', header=None, comment='#', names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'Sample1', 'Sample2'])

print("Variant Data:")
print(variants_df)

BED Files (Genomic Coordinates)

In [None]:
import pandas as pd

bed_file_path = "genomic_coordinates.bed"

# Assuming BED file columns: ['chrom', 'start', 'end', 'name', 'score', 'strand']
coordinates_df = pd.read_csv(bed_file_path, sep='\t', header=None, names=['chrom', 'start', 'end', 'name', 'score', 'strand'])

print("Genomic Coordinates:")
print(coordinates_df)

SAM/BAM Files (Sequence Alignment)

In [None]:
from pysam import AlignmentFile

sam_file_path = "sequence_alignment.sam"

with AlignmentFile(sam_file_path, 'r') as sam_file:
for alignment in sam_file.fetch():
print("Sequence Alignment:")
print(alignment)

Phylogenetic Tree Files (Newick Format)

In [None]:
from Bio import Phylo

tree_file_path = "phylogenetic_tree.newick"

tree = Phylo.read(tree_file_path, 'newick')

print("Phylogenetic Tree:")
Phylo.draw_ascii(tree)

CSV Files (Tabular Data)

In [None]:
import pandas as pd

csv_file_path = "experimental_data.csv"

experimental_data_df = pd.read_csv(csv_file_path)

print("Experimental Data:")
print(experimental_data_df)

# Creating Output Files for Bioinformatics Results:

 Writing Genomic Sequences to a FASTA File

In [None]:
from Bio import SeqIO

sequences = [Seq("ATGCGTACGCTAGCTAGCTAGCG"), Seq("ATCGATCGATCGTACGTAGCTAGC"), Seq("AGCTAGCTAGCTAGCGATCGATCGA")]

output_fasta_path = "output_genomic_sequences.fasta"

with open(output_fasta_path, "w") as output_fasta:
    SeqIO.write(sequences, output_fasta, "fasta")

print(f"Genomic sequences saved to {output_fasta_path}.")

Saving Sequencing Reads and Quality Scores to a FASTQ File

In [102]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Assuming 'sequences' is your list of Seq objects
output_fasta_path = "output_genomic_sequences.fasta"

# Convert Seq objects to SeqRecord objects
seq_records = []
for i, seq in enumerate(sequences, start=1):
    record = SeqRecord(seq, id=f"sequence_{i}", description="")
    seq_records.append(record)

# Write SeqRecord objects to FASTA file
with open(output_fasta_path, "w") as output_fasta:
    SeqIO.write(seq_records, output_fasta, "fasta")

print(f"Genomic sequences saved to {output_fasta_path}.")

Genomic sequences saved to output_genomic_sequences.fasta.


In [None]:
from Bio import SeqIO

reads = [Seq("ATCGATCGATCG"), Seq("GCTAGCTAGCTA"), Seq("ATGCGTACGCTA")]
qualities = [[30, 29, 28, 30], [25, 28, 30, 32], [29, 30, 28, 31]]

output_fastq_path = "output_sequencing_reads.fastq"

records = [SeqIO.SeqRecord(read, id=f"Read{i+1}", letter_annotations={"phred_quality": qual}) for i, (read, qual) in enumerate(zip(reads, qualities))]

with open(output_fastq_path, "w") as output_fastq:
    SeqIO.write(records, output_fastq, "fastq")

print(f"Sequencing reads and quality scores saved to {output_fastq_path}.")

In [105]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Your input data
reads = ["ATGCGTACGCTA", "CGTACGTACGTA", "GTACGTACGTAC"]
qualities = [[30, 29, 28, 30], [25, 28, 30, 32], [29, 30, 28, 31]]

output_fastq_path = "output_sequencing_reads.fastq"

# Create SeqRecord objects, ensuring quality scores match sequence length
records = []
for i, (read, qual) in enumerate(zip(reads, qualities)):
    # Convert the read string to a Seq object
    seq = Seq(read)
    
    # Extend or truncate quality scores to match sequence length
    adjusted_qual = qual * (len(read) // len(qual)) + qual[:len(read) % len(qual)]
    
    # Create SeqRecord
    record = SeqRecord(seq, id=f"Read{i+1}", description="",
                       letter_annotations={"phred_quality": adjusted_qual})
    records.append(record)

# Write to FASTQ file
with open(output_fastq_path, "w") as output_fastq:
    SeqIO.write(records, output_fastq, "fastq")

print(f"Sequencing reads saved to {output_fastq_path}.")

Sequencing reads saved to output_sequencing_reads.fastq.


Exporting Genomic Feature Annotations to GFF/GTF File

In [106]:
import pandas as pd

annotations_df = pd.DataFrame({
'seqid': ['chr1', 'chr2', 'chr1'],
'source': ['gene_prediction', 'gene_prediction', 'gene_prediction'],
'type': ['gene', 'exon', 'exon'],
'start': [100, 200, 250],
'end': [300, 220, 270],
'score': [None, None, None],
'strand': ['+', '+', '-'],
'phase': [None, None, None],
'attributes': ['ID=GeneA', 'ID=GeneA;Parent=GeneA', 'ID=GeneA;Parent=GeneA']
})

output_gff_path = "output_genomic_annotations.gff"

annotations_df.to_csv(output_gff_path, sep='\t', index=False, header=False)

print(f"Genomic feature annotations saved to {output_gff_path}.")

Genomic feature annotations saved to output_genomic_annotations.gff.


Exporting Variant Data to a VCF File

In [107]:
import pandas as pd

variants_df = pd.DataFrame({
'CHROM': ['chr1', 'chr2', 'chr1'],
'POS': [100, 200, 300],
'ID': ['SNP1', 'SNP2', 'SNP3'],
'REF': ['A', 'C', 'G'],
'ALT': ['T', 'G', 'A'],
'QUAL': [30, 25, 35],
'FILTER': ['PASS', 'PASS', 'PASS'],
'INFO': ['.', '.', '.'],
'FORMAT': ['GT', 'GT', 'GT'],
'Sample1': ['0/1', '1/1', '0/0'],
'Sample2': ['1/1', '0/1', '1/0']
})

output_vcf_path = "output_variant_data.vcf"

variants_df.to_csv(output_vcf_path, sep='\t', index=False, header=False)

print(f"Variant data saved to {output_vcf_path}.")

Variant data saved to output_variant_data.vcf.


In [108]:
import pandas as pd

coordinates_df = pd.DataFrame({
'chrom': ['chr1', 'chr2', 'chr1'],
'start': [100, 200, 300],
'end': [150, 250, 350],
'name': ['Feature1', 'Feature2', 'Feature3'],
'score': [None, None, None],
'strand': ['+', '-', '+']
})

output_bed_path = "output_genomic_coordinates.bed"

coordinates_df.to_csv(output_bed_path, sep='\t', index=False, header=False)

print(f"Genomic coordinates saved to {output_bed_path}.")

Genomic coordinates saved to output_genomic_coordinates.bed.


In [112]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# DNA Sequences
dna_sequences = [
Seq("ATGCGTACGCTAGCTAGCTAGCG"),
Seq("ATCGATCGATCGTACGTAGCTAGC"),
Seq("AGCTAGCTAGCTAGCGATCGATCGA")
]

# Protein Sequences
protein_sequences = [
Seq("MHTGKVY"),
Seq("PLKQRTV"),
Seq("SWSEVFRG")
]

# Consolidate sequences
consolidated_sequences = []
for i, dna_seq in enumerate(dna_sequences):
    record_name = f"DNA_Sequence_{i + 1}"
record_description = f"Description for DNA Sequence {i + 1}"
consolidated_sequences.append(SeqRecord(dna_seq, id=record_name, description=record_description))

for i, protein_seq in enumerate(protein_sequences):
    record_name = f"Protein_Sequence_{i + 1}"
record_description = f"Description for Protein Sequence {i + 1}"
consolidated_sequences.append(SeqRecord(protein_seq, id=record_name, description=record_description))

# Write to a single FASTA file
output_fasta_path = "consolidated_sequences.fasta"
with open(output_fasta_path, "w") as output_fasta:
    SeqIO.write(consolidated_sequences, output_fasta, "fasta")

print(f"Consolidated sequences saved to {output_fasta_path}.")

Consolidated sequences saved to consolidated_sequences.fasta.


# Interfacing with the operating system

Creating Directories for Data Organization

In [113]:
import os

# Define directory names
data_dir = "bioinformatics_data"
fasta_dir = os.path.join(data_dir, "fasta_files")
results_dir = os.path.join(data_dir, "analysis_results")

# Create directories
os.makedirs(fasta_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

print(f"Directories created: {fasta_dir}, {results_dir}")

Directories created: bioinformatics_data/fasta_files, bioinformatics_data/analysis_results


Moving Files to Respective Directories

In [115]:
import shutil

# Assuming you have existing FASTA files in the current directory
fasta_files = [file for file in os.listdir() if file.endswith(".fasta")]

for file in fasta_files:
# Move files to the fasta directory
    shutil.move(file, os.path.join(fasta_dir, file))

print("FASTA files moved to the designated directory.")

FASTA files moved to the designated directory.


Listing Files in a Directory

In [117]:
result_files = os.listdir(results_dir)

print("Files in the results directory:")
for file in result_files:
    print(file)

Files in the results directory:


Renaming Files

In [None]:
# Assume you have result files with inconsistent names
# Define old_name first
for old_name in result_files:
    new_name = old_name.replace("_", "_") # Replace spaces with underscores
os.rename(os.path.join(results_dir, old_name), os.path.join(results_dir, new_name))

print("Files renamed for consistency.")

Deleting Unnecessary Files

In [122]:
unwanted_files = ["temp.txt", "unnecessary_data.csv"]

for file in unwanted_files:
    file_path = os.path.join(data_dir, file)
if os.path.exists(file_path):
    os.remove(file_path)

print("Unnecessary files removed.")

Unnecessary files removed.


Checking File Existence

In [None]:
# add file "important_data.txt"
target_file = "important_data.txt"

if os.path.exists(os.path.join(data_dir, target_file)):
# Perform an operation using the file
    print(f"Processing {target_file}")
else:
    print(f"{target_file} not found.")

Navigating Through Directories

In [128]:
for root, dirs, files in os.walk(data_dir):
    print(f"Current directory: {root}")
print("Files:", files)
print("Directories:", dirs)
print()

Current directory: bioinformatics_data
Current directory: bioinformatics_data/fasta_files
Current directory: bioinformatics_data/analysis_results
Files: []
Directories: []



# Handling CSV (Comma-Separated Values) files

Reading Genomic Data from CSV

In [None]:
import pandas as pd

# Assuming CSV file columns: ['GeneID', 'GeneName', 'ExpressionLevel', 'VariantCount']
csv_file_path = "genomic_data.csv"

genomic_data_df = pd.read_csv(csv_file_path)

print("Genomic Data:")
print(genomic_data_df)

Filtering Genes Based on Expression Level

In [None]:
# define genomic_data_df
expression_threshold = 2.5
high_expression_genes = genomic_data_df[genomic_data_df['ExpressionLevel'] > expression_threshold]

print("Genes with High Expression:")
print(high_expression_genes)

Sorting Genomic Data by Variant Count

In [None]:
sorted_genomic_data = genomic_data_df.sort_values(by='VariantCount', ascending=False)

print("Sorted Genomic Data by Variant Count:")
print(sorted_genomic_data)

Adding a New Column

In [None]:
genomic_data_df['VariantExpressionRatio'] = genomic_data_df['VariantCount'] / genomic_data_df['ExpressionLevel']

print("Genomic Data with Variant Expression Ratio:")
print(genomic_data_df)

Grouping and Aggregating Data

In [None]:
gene_expression_means = genomic_data_df.groupby('GeneName')['ExpressionLevel'].mean().reset_index()

print("Mean Expression Level by Gene:")
print(gene_expression_means)

Exporting Modified Data to CSV

In [None]:
output_csv_path = "modified_genomic_data.csv"
genomic_data_df.to_csv(output_csv_path, index=False)

print(f"Modified genomic data saved to {output_csv_path}.")

Handling Missing Data

In [None]:
genomic_data_df.fillna({'VariantCount': 0}, inplace=True)

print("Genomic Data with NaN values filled:")
print(genomic_data_df)

Merging Genomic Data with Metadata

In [None]:
# Include a file named 'gene_metadata.csv
metadata_csv_path = "gene_metadata.csv"

metadata_df = pd.read_csv(metadata_csv_path)
merged_data = pd.merge(genomic_data_df, metadata_df, on='GeneID', how='inner')

print("Merged Genomic Data with Metadata:")
print(merged_data)

Functions & Modules in Python

Calculate GC Content

In [144]:
def calculate_gc_content(dna_sequence):
    """
Calculate GC content of a DNA sequence.
Args:
dna_sequence (str): Input DNA sequence.
Returns:
float: GC content as a percentage.
    """
    gc_count = dna_sequence.count('G') + dna_sequence.count('C')
    total_bases = len(dna_sequence)
    gc_content = (gc_count / total_bases) * 100
    return gc_content

Translate DNA Sequence to Protein

In [145]:
from Bio.Seq import Seq

def translate_dna_to_protein(dna_sequence):
    """
Translate a DNA sequence to a protein sequence.
Args:
dna_sequence (str): Input DNA sequence.
Returns:
str: Translated protein sequence.
    """
    translated_sequence = Seq(dna_sequence).translate()
    return str(translated_sequence)

Perform Quality Control Check

In [148]:
def quality_control_check(sequence_quality):
    """
Perform quality control check on sequencing data.
Args:
sequence_quality (int): Quality score of the sequencing data.
Returns:
str: Result of the quality control check.
    """
    if sequence_quality >= 30:
        return "High-quality data. Proceed with analysis."
    else:
        return "Low-quality data. Consider re-sequencing or trimming."

Filter Genes Based on Expression Level

In [150]:
def filter_genes_by_expression(gene_data, expression_threshold):
    """
Filter genes based on expression level.
Args:
gene_data (DataFrame): Genomic data with expression levels.
expression_threshold (float): Threshold for expression level.
Returns:
DataFrame: Filtered genes.
    """
    high_expression_genes = gene_data[gene_data['ExpressionLevel'] > expression_threshold]
    return high_expression_genes

 Retrieve Sequences from FASTA File

In [None]:
# create 'genome.fasta in fasta_file_path'
from Bio import SeqIO

def retrieve_sequences_from_fasta(fasta_file_path):
    """
Retrieve sequences from a FASTA file.
Args:
fasta_file_path (str): Path to the FASTA file.
Returns:
list: List of SeqRecord objects.
    """
sequences = []
for record in SeqIO.parse(fasta_file_path, "fasta"):
    sequences.append(record)
return sequences

Perform Statistical Analysis

In [164]:
import numpy as np

def perform_statistical_analysis(data):
    """
    Perform statistical analysis on a dataset.
    
    Args:
    data (list): Numeric dataset for analysis.
    
    Returns:
    dict: Dictionary containing mean, median, and standard deviation.
    """
    result = {
        'mean': np.mean(data),
        'median': np.median(data),
        'std_dev': np.std(data)
    }
    return result

# Example usage:
sample_data = [1, 2, 3, 4, 5]
analysis_result = perform_statistical_analysis(sample_data)
print(analysis_result)

{'mean': np.float64(3.0), 'median': np.float64(3.0), 'std_dev': np.float64(1.4142135623730951)}


with statement in Python

Reading from a FASTA File

In [None]:
from Bio import SeqIO

fasta_file_path = "genomic_sequences.fasta"

with open(fasta_file_path, "r") as fasta_file:
    for record in SeqIO.parse(fasta_file, "fasta"):
        print("Header:", record.id)
        print("Sequence:", record.seq)

Writing to a New FASTA File

In [171]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

output_fasta_path = "output_sequences.fasta"

sequences = [SeqRecord(Seq("ATGCGTACGCTAGCTAGCTAGCG"), id="Seq1"), SeqRecord(Seq("ATCGATCGATCGTACGTAGCTAGC"), id="Seq2")]

with open(output_fasta_path, "w") as output_fasta:
    SeqIO.write(sequences, output_fasta, "fasta")

print(f"Sequences saved to {output_fasta_path}.")

Sequences saved to output_sequences.fasta.


Performing File Operations with os Module

In [181]:
import os

data_directory = "bioinformatics_data"

with os.scandir(data_directory) as directory_contents:
    for entry in directory_contents:
        if entry.is_file():
            print("File:", entry.name)
        elif entry.is_dir():
            print("Directory:", entry.name)

Directory: fasta_files
Directory: analysis_results


Managing Database Connections

In [None]:
import sqlite3

database_path = "bioinformatics_db.sqlite"

with sqlite3.connect(database_path) as conn:
    cursor = conn.cursor()
# Perform database operations using cursor
cursor.execute("SELECT * FROM genomic_data")
results = cursor.fetchall()
for row in results:
    print(row)

Handling Transactions in Database Operations

In [None]:
import sqlite3

database_path = "bioinformatics_db.sqlite"

with sqlite3.connect(database_path) as conn:
    with conn.cursor() as cursor:
# Begin transaction
        cursor.execute("UPDATE genomic_data SET expression_level = expression_level * 1.1 WHERE gene_id = 'GeneA'")
        cursor.execute("UPDATE genomic_data SET expression_level = expression_level * 1.2 WHERE gene_id = 'GeneB'")
# Commit transaction
conn.commit()

In [195]:
import sqlite3

database_path = "bioinformatics_db.sqlite"

try:
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()

    # Begin transaction
    conn.execute("BEGIN TRANSACTION")

    # Update expressions
    cursor.execute("UPDATE genomic_data SET expression_level = expression_level * 1.1 WHERE gene_id = 'GeneA'")
    cursor.execute("UPDATE genomic_data SET expression_level = expression_level * 1.2 WHERE gene_id = 'GeneB'")

    # Commit the transaction
    conn.commit()

    print("Transaction completed successfully.")

except sqlite3.Error as e:
    # If there's an error, roll back the changes
    conn.rollback()
    print(f"An error occurred: {e}")

finally:
    # Always close the cursor and connection
    if cursor:
        cursor.close()
    if conn:
        conn.close()

An error occurred: no such table: genomic_data


Custom Context Manager for File Locking

In [None]:
import filelock

file_path = "shared_file.txt"

def process_shared_file():
    with filelock.FileLock(file_path + ".lock"):
        with open(file_path, "a") as shared_file:
            shared_file.write("New data added by process.\n")

# Use the context manager
process_shared_file()

Try-Except Blocks for Specific Exceptions

In [None]:
# create 'nonexistent_file.txt'
try:
# Code that may raise an IOError
    with open("nonexistent_file.txt", "r") as file:
        content = file.read()
except IOError as e:
    print(f"An IOError occurred: {e}")

Handling Multiple Exceptions

In [None]:
# create 'nonexistent_file.txt'
try:
# Code that may raise either FileNotFoundError or ValueError
    file_path = "nonexistent_file.txt"
    with open(file_path, "r") as file:
        content = file.read()
        value = int("invalid_value")
except FileNotFoundError as file_error:
    print(f"FileNotFoundError: {file_error}")
except ValueError as value_error:
    print(f"ValueError: {value_error}")
except Exception as general_error:
    print(f"An unexpected error occurred: {general_error}")

Using Finally Blocks

In [222]:
try:
# Code that may raise an exception
    result = 10 / 0
except ZeroDivisionError as zero_error:
    print(f"ZeroDivisionError: {zero_error}")
finally:
    print("Cleanup operations, if any, are performed here.")

ZeroDivisionError: division by zero
Cleanup operations, if any, are performed here.


Custom Exception Classes

In [224]:
class GenomicDataError(Exception):
    pass

try:
# Code that may raise a GenomicDataError
    raise GenomicDataError("Invalid genomic data format.")
except GenomicDataError as gen_error:
    print(f"GenomicDataError: {gen_error}")

GenomicDataError: Invalid genomic data format.
