 Function to validate DNA sequences


In [None]:
def validate_base_sequence(dna_seq):
    """Return True when every symbol of ``dna_seq`` is one of A, T, G or C.

    The check is case-sensitive. An empty sequence is reported as valid
    because it contains no offending symbol.
    """
    allowed = {'A', 'T', 'G', 'C'}

    # The sequence is valid exactly when its distinct symbols all belong to
    # the allowed alphabet.
    return set(dna_seq).issubset(allowed)


# Example inputs: seq1 uses only A/T/G/C, seq2 also contains X and M.
seq1 = "ATGCTGTAGGTAAGTAAGCG"
seq2 = "ATGXGMGGGTCTA"

# Report the verdict for each example sequence.
seq1_is_valid = validate_base_sequence(seq1)
print("seq1 is a valid DNA sequence." if seq1_is_valid else "seq1 contains invalid bases.")

seq2_is_valid = validate_base_sequence(seq2)
print("seq2 is a valid DNA sequence." if seq2_is_valid else "seq2 contains invalid bases.")



seq1 is a valid DNA sequence.
seq2 contains invalid bases.


 Restriction sites in a DNA sequence

In [None]:
def find_recognition_sites(dna_seq, recognition_site):
    """Return every 0-based index at which ``recognition_site`` starts in ``dna_seq``.

    Overlapping occurrences are all reported, in ascending order. The result
    is an empty list when the site is longer than the sequence.
    """
    width = len(recognition_site)
    last_start = len(dna_seq) - width

    # Slide a window of the site's length across the sequence and keep the
    # offsets whose window equals the site.
    return [
        offset
        for offset in range(last_start + 1)
        if dna_seq[offset:offset + width] == recognition_site
    ]


# Example sequence and the recognition site to search for.
dna_seq = "AATGCTAGCTAGCTGCTAGCTGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC"
recognition_site = "GCTAGC"

site_positions = find_recognition_sites(dna_seq, recognition_site)

# An empty result means the site never occurs in the sequence.
if not site_positions:
    print(f"Recognition site '{recognition_site}' not found in the sequence.")
else:
    print(f"Recognition site '{recognition_site}' found at positions: {', '.join(str(position) for position in site_positions)}")


Recognition site 'GCTAGC' found at positions: 3, 7, 14, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61


Comparing two DNA sequences of equal length and calculating their percentage identity


In [None]:
def compare_dna_seq(seq1, seq2):
    """Return the percentage of positions at which ``seq1`` and ``seq2`` agree.

    Returns None when the two sequences differ in length, and 0 when both
    sequences are empty.
    """
    length = len(seq1)
    if length != len(seq2):
        return None
    if length == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0

    # Walk both sequences in lockstep and count the identical positions.
    identical = sum(1 for left, right in zip(seq1, seq2) if left == right)
    return (identical / length) * 100


# Two example sequences of equal length.
seq1 = "ATGCTGCATAGCGATAA"
seq2 = "ATGCTCGTATAGCGTAA"
identity = compare_dna_seq(seq1, seq2)

# compare_dna_seq signals a length mismatch by returning None.
if identity is None:
    print("Sequences have different lengths and cannot be compared.")
else:
    print("Percentage Identity:", f"{identity:.2f}%")


Percentage Identity: 47.06%


Find the start codon and stop codon

In [None]:
# Functions to find the start codon and the stop codon

def find_start_codon(rna_seq):
    """Return the index of the first "AUG" in ``rna_seq``, or -1 if it is absent."""
    return rna_seq.find("AUG")

def find_stop_codon(rna_seq):
    """Return the index of the earliest stop codon in ``rna_seq``.

    The stop codons searched for are "UAA", "UAG" and "UGA". This is a plain
    substring search; the reading frame is not taken into account.

    Returns -1 when none of the stop codons occurs in the sequence.
    """
    stop_codons = ("UAA", "UAG", "UGA")

    # Locate every codon type and keep the left-most hit. Returning as soon as
    # the first codon *type* matches would skip an earlier codon of another
    # type (e.g. a "UAG" that precedes the first "UAA").
    hits = (rna_seq.find(stop_codon) for stop_codon in stop_codons)
    return min((index for index in hits if index != -1), default=-1)

# Example mRNA sequence
rna_seq = "AUGGUACCUUAAAGGGCUAGAGTCGAAAUAGCGUAGUCG"
start_index = find_start_codon(rna_seq)
stop_index = find_stop_codon(rna_seq)

# Both finders return -1 when nothing was found.
if start_index == -1:
    print("Start codon not found in the sequence")
else:
    print(f"Start codon found at index {start_index}")

if stop_index == -1:
    print("Stop codon not found in the sequence")
else:
    print(f"Stop codon found at index {stop_index}")


Start codon found at index 0
Stop codon found at index 9


Find the overlapping region between two DNA sequences

In [None]:
def find_overlap(seq1, seq2):
    """Return the length of the longest suffix of ``seq1`` that is also a prefix of ``seq2``.

    Returns 0 when no non-empty suffix of ``seq1`` matches a prefix of ``seq2``.
    """
    longest_candidate = min(len(seq1), len(seq2))

    # Try candidate lengths from longest to shortest; the first match found
    # is therefore the maximum overlap.
    for size in range(longest_candidate, 0, -1):
        if seq1[-size:] == seq2[:size]:
            return size
    return 0

def extract_overlap(seq1, seq2, overlap):
    """Return ``seq1`` followed by ``seq2`` minus its first ``overlap`` items."""
    remainder = seq2[overlap:]
    return seq1 + remainder

def find_and_extract_overlap(seq1, seq2):
    """Return ``(overlap, merged, shared)`` for the two sequences.

    ``overlap`` is the value reported by ``find_overlap``, ``merged`` is the
    result of ``extract_overlap`` for that overlap, and ``shared`` is the
    first ``overlap`` items of ``seq2``. When the overlap is not positive the
    result is ``(0, None, None)``.
    """
    overlap = find_overlap(seq1, seq2)
    if overlap <= 0:
        return 0, None, None

    merged = extract_overlap(seq1, seq2, overlap)
    shared = seq2[:overlap]
    return overlap, merged, shared


# Example pair: the end of seq1 is repeated at the start of seq2.
seq1 = "TCGGTCACTTACGAGCGTTAGG"
seq2 = "CGTTAGGCGGTAACCCCTTAAAGTCGG"

overlap, overlapping_seq, overlapped_portion = find_and_extract_overlap(seq1, seq2)

# An overlap of 0 means no suffix of seq1 matched a prefix of seq2.
if overlap <= 0:
    print("No overlap found between the two sequences.")
else:
    print(f"No of bases that overlap between the two sequences: {overlap} bases")
    print(f"The new overlapping sequence: {overlapping_seq}")
    print(f"Overlapped portion: {overlapped_portion}")


No of bases that overlap between the two sequences: 7 bases
The new overlapping sequence: TCGGTCACTTACGAGCGTTAGGCGGTAACCCCTTAAAGTCGG
Overlapped portion: CGTTAGG


Collect GI IDs from a FASTA file of Homo sapiens insulin (INS) mRNA, partial CDS

In [None]:
def collect_gi_ids(fasta_file):
    """Collect the identifier of every record in a FASTA file.

    The identifier is taken to be the first whitespace-delimited token that
    follows the ``>`` on each header line. Header lines that carry nothing
    after the ``>`` are skipped.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of identifier strings, in the order the headers appear.
    """
    gi_ids = []

    with open(fasta_file, "r") as file:
        for line in file:
            line = line.strip()
            if not line.startswith(">"):
                continue  # sequence data or blank line

            fields = line[1:].split()
            # A bare ">" yields no fields; indexing [0] would raise IndexError.
            if fields:
                gi_ids.append(fields[0])

    return gi_ids

# Usage: read the record identifiers from a FASTA file in the working directory.
fasta_file = "Insulin.fasta"  # Provide the path to your FASTA file

gi_ids = collect_gi_ids(fasta_file)  # One identifier per '>' header line

# Print each collected identifier on its own line
for gi_id in gi_ids:
    print("GI ID:", gi_id)


GI ID: JF909299.1


Transcription of DNA to RNA

In [None]:
def transcribe_dna_to_rna(dna_seq):
    """Return ``dna_seq`` with every "T" replaced by "U".

    Only the upper-case "T" is substituted; every other symbol is copied
    through unchanged.
    """
    # Build the result with a single join instead of repeated string "+=",
    # which may copy the partial result on every iteration.
    return "".join("U" if base == "T" else base for base in dna_seq)

# Example DNA sequence to transcribe
dna_seq = "ATTCAGTTCTGTATTG"

# Transcribe DNA to RNA (each "T" becomes "U")
rna_seq = transcribe_dna_to_rna(dna_seq)

# Print the RNA sequence
print("RNA Sequence:", rna_seq)


RNA Sequence: AUUCAGUUCUGUAUUG


Point mutations between two DNA sequences of equal length

In [None]:
def count_point_mutations(seq1, seq2):
    """Count the positions at which two equal-length sequences differ.

    Raises:
        ValueError: If ``seq1`` and ``seq2`` do not have the same length.
    """
    if len(seq1) != len(seq2):
        raise ValueError("Input sequences must be of the same length")

    # Pair the sequences position by position and add 1 for each mismatch.
    return sum(1 for left, right in zip(seq1, seq2) if left != right)

# Example usage: two sequences of equal length
seq1 = "GAGCCTACTAACGGGATA"
seq2 = "CATCGTAATGACGGCCTG"

# Count the positions at which the two sequences differ
mutations = count_point_mutations(seq1, seq2)

# Print the count of point mutations
print(f"No of point mutations found: {mutations}")


No of point mutations found: 8


 Hamming distance between two DNA sequences of equal length

In [None]:
def hamming_distance(seq1, seq2):
    """Return the Hamming distance between two equal-length sequences.

    The distance is the number of positions at which the corresponding
    items of ``seq1`` and ``seq2`` differ.

    Raises:
        ValueError: If ``seq1`` and ``seq2`` do not have the same length.
    """
    if len(seq1) != len(seq2):
        raise ValueError("Input sequences must have the same length")

    # zip pairs the items position by position; keep only the differing pairs.
    mismatches = [pair for pair in zip(seq1, seq2) if pair[0] != pair[1]]
    return len(mismatches)

# Example usage: two sequences of equal length
seq1 = "GAGCCTACTAACTGGGATA"
seq2 = "CATCGTAATGACAGGCCTA"

# Count the positions at which the two sequences differ
distance = hamming_distance(seq1, seq2)

# Print the Hamming distance
print(f"Hamming Distance: {distance}")



Hamming Distance: 8


Find GC-rich regions within a DNA sequence

In [None]:
def find_gc_rich_regions(sequence, window_size, gc_threshold):
    """Find every window of ``window_size`` bases whose GC fraction meets ``gc_threshold``.

    Args:
        sequence: DNA string; only upper-case "G" and "C" are counted.
        window_size: Window length in bases; must be positive.
        gc_threshold: Minimum GC fraction (between 0 and 1) for a window to
            be reported.

    Returns:
        A list of ``(start, end, gc_content)`` tuples, one per qualifying
        window, where ``start`` is the 0-based window start, ``end`` is
        ``start + window_size`` and ``gc_content`` is the window's GC
        fraction. The list is empty when the sequence is shorter than the
        window.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    # A zero window would divide by zero below and a negative one would
    # produce meaningless slices, so reject both up front.
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    gc_rich_regions = []

    # Slide the window one base at a time across the sequence.
    for start in range(len(sequence) - window_size + 1):
        end = start + window_size
        region = sequence[start:end]
        gc_content = (region.count('G') + region.count('C')) / window_size

        if gc_content >= gc_threshold:
            gc_rich_regions.append((start, end, gc_content))

    return gc_rich_regions

# Example DNA sequence and search parameters
dna_sequence = "ATGCGCATGCAGTACGCTAGCGCAGTGCTAGCGCATGCGCGC"
window_size = 15  # number of bases in each sliding window
gc_threshold = 0.6  # minimum GC fraction for a window to be reported

# Call the find_gc_rich_regions function to identify GC-rich regions
regions = find_gc_rich_regions(dna_sequence, window_size, gc_threshold)

# Print one line per GC-rich window, with the GC fraction to two decimals
for start, end, gc_content in regions:
    print(f"GC-rich Region: Start={start}, End={end}, GC Content={gc_content:.2f}")


GC-rich Region: Start=1, End=16, GC Content=0.60
GC-rich Region: Start=2, End=17, GC Content=0.67
GC-rich Region: Start=3, End=18, GC Content=0.60
GC-rich Region: Start=7, End=22, GC Content=0.60
GC-rich Region: Start=8, End=23, GC Content=0.67
GC-rich Region: Start=9, End=24, GC Content=0.60
GC-rich Region: Start=10, End=25, GC Content=0.60
GC-rich Region: Start=11, End=26, GC Content=0.60
GC-rich Region: Start=12, End=27, GC Content=0.60
GC-rich Region: Start=13, End=28, GC Content=0.67
GC-rich Region: Start=14, End=29, GC Content=0.67
GC-rich Region: Start=15, End=30, GC Content=0.60
GC-rich Region: Start=16, End=31, GC Content=0.60
GC-rich Region: Start=17, End=32, GC Content=0.60
GC-rich Region: Start=18, End=33, GC Content=0.67
GC-rich Region: Start=19, End=34, GC Content=0.73
GC-rich Region: Start=20, End=35, GC Content=0.67
GC-rich Region: Start=21, End=36, GC Content=0.60
GC-rich Region: Start=22, End=37, GC Content=0.60
GC-rich Region: Start=23, End=38, GC Content=0.60
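GC-rich Region: Start=24, End=39, GC Content=0.67
GC-rich Region: Start=25, End=40, GC Content=0.67
GC-rich Region: Start=26, End=41, GC Content=0.73
GC-rich Region: Start=27, End=42, GC Content=0.73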

 Function to calculate GC content of a DNA sequence

In [None]:
def calculate_gc_content(dna_seq):
    """Return the percentage of bases in ``dna_seq`` that are "G" or "C".

    Only upper-case "G" and "C" are counted. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    total_bases = len(dna_seq)
    if total_bases == 0:
        # No bases at all: report 0% instead of dividing by zero.
        return 0.0

    gc_count = sum(1 for base in dna_seq if base in ('G', 'C'))
    return (gc_count / total_bases) * 100

def analyze_base_composition(dna_seq):
    """Print ``dna_seq`` and, on the next line, its GC content as a percentage.

    The GC content comes from ``calculate_gc_content`` and is shown with two
    decimal places.
    """
    gc_content = calculate_gc_content(dna_seq)

    # Emit both report lines with a single call, one line each.
    print(f"DNA sequence: {dna_seq}", f"GC Content: {gc_content:.2f}%", sep="\n")


# Example DNA sequence; print it together with its GC content
dna_seq = "ATGCTATGATTGCCGTGCTA"
analyze_base_composition(dna_seq)

DNA sequence: ATGCTATGATTGCCGTGCTA
GC Content: 45.00%
