<a href="https://colab.research.google.com/github/TummalaSharmila/HT/blob/HT-quiz/HT_quiz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import Counter

# Defining valid RNA nucleotides
VALID_NUCLEOTIDES = {"A", "U", "G", "C"}

# Function to read a FASTA file and return the sequence as a string
def read_fasta(file_path):
    """Read a FASTA file and return its sequence data as one uppercase string.

    Header lines (those starting with ">") and blank lines are skipped. Every
    other line is treated as sequence data and the lines are joined in file
    order, so a file with several records comes back as a single concatenated
    sequence.

    Whitespace inside a sequence line (for example bases written in
    space-separated blocks) is removed, so the returned string holds only the
    sequence characters themselves.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        str: The concatenated, uppercased sequence, or "" when the file
        contains no sequence lines.

    Raises:
        OSError: If the file cannot be opened.
    """
    chunks = []
    with open(file_path, "r") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            # Skip blank lines and headers; stripping first means a header
            # with stray leading whitespace is still recognised as a header.
            if not line or line.startswith(">"):
                continue
            # split() with no argument breaks on any run of whitespace, which
            # drops spaces/tabs placed between blocks of bases.
            chunks.append("".join(line.split()).upper())
    return "".join(chunks)

# Function to reverse transcribe RNA to DNA
def reverse_transcribe(rna_seq):
    """Return the DNA spelling of an RNA sequence.

    Each uppercase 'U' (uracil) is rewritten as 'T' (thymine). Every other
    character, including a lowercase 'u', is passed through unchanged.
    """
    uracil_to_thymine = str.maketrans("U", "T")
    return rna_seq.translate(uracil_to_thymine)

# Function to compute n-nucleotide frequencies (absolute count and percentage)
def compute_frequencies(seq, n):
    """Count overlapping n-nucleotides in a sequence and express them as percentages.

    Characters that are not members of VALID_NUCLEOTIDES are removed before
    counting (the membership test is case-sensitive). The remaining bases are
    then treated as one contiguous string, so the bases on either side of a
    removed character are counted as neighbours.

    Args:
        seq: Sequence string to analyse.
        n: Window length (2 for dinucleotides, 3 for trinucleotides, ...).
            Must be at least 1.

    Returns:
        tuple: ``(counts, percentages)``. ``counts`` is a Counter mapping each
        n-nucleotide to its number of occurrences; ``percentages`` is a dict
        with the same keys giving each count as a percentage of all windows.
        Both are empty when fewer than ``n`` valid bases remain.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    # Without this guard n == 0 would count the empty string once per
    # position, and negative n would produce slices of inconsistent length.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    bases = "".join(nt for nt in seq if nt in VALID_NUCLEOTIDES)

    # One window starts at every index that still leaves n bases to read;
    # range() of a non-positive number is empty, so short input yields no windows.
    window_count = len(bases) - n + 1
    ngram_counts = Counter(bases[i:i + n] for i in range(window_count))

    total_ngrams = sum(ngram_counts.values())
    # The comprehension never runs when there are no n-grams, so a zero total
    # cannot cause a division by zero here.
    ngram_percentages = {k: (v / total_ngrams) * 100 for k, v in ngram_counts.items()}
    return ngram_counts, ngram_percentages


# Function to compare two frequency distributions and find those with a 3X difference
def compare_frequencies(freq1, freq2, min_fold=3):
    """Find n-nucleotides whose percentage differs at least ``min_fold``-fold.

    Every key that appears in either mapping is examined; a key missing from
    one mapping is treated as having a value of 0 there.

    An n-nucleotide that is present in one mapping but absent (or zero) in the
    other is reported as well: its fold difference is unbounded, so it exceeds
    any threshold. Keys that are zero on both sides are skipped.

    Args:
        freq1: Mapping of n-nucleotide -> percentage for the first sequence.
        freq2: Mapping of n-nucleotide -> percentage for the second sequence.
        min_fold: Minimum ratio between the larger and the smaller percentage
            for a key to be reported. Defaults to 3.

    Returns:
        dict: Maps each reported n-nucleotide to ``(value_in_freq1,
        value_in_freq2)``, with keys inserted in sorted order.
    """
    threefold_diff = {}
    # sorted() gives the result a stable key order; iterating the raw set
    # union would vary between interpreter runs (string hash randomisation).
    for k in sorted(set(freq1) | set(freq2)):
        val1 = freq1.get(k, 0)  # 0 when the n-nucleotide never occurs
        val2 = freq2.get(k, 0)
        low, high = min(val1, val2), max(val1, val2)
        if high <= 0:
            continue  # nothing on either side to compare
        # low <= 0 is checked first so the division is never attempted with a
        # zero denominator; such one-sided entries always qualify.
        if low <= 0 or high / low >= min_fold:
            threefold_diff[k] = (val1, val2)
    return threefold_diff

# Input FASTA files holding the two RNA sequences
file1_path, file2_path = "RNA-sequence1.fna", "RNA-sequence2.fna"

# Load both RNA sequences (file 1 first, then file 2)
rna_seq1, rna_seq2 = [read_fasta(path) for path in (file1_path, file2_path)]

# DNA equivalents of the two RNA sequences
dna_seq1, dna_seq2 = [reverse_transcribe(rna) for rna in (rna_seq1, rna_seq2)]

# Dinucleotide profiles; each call yields (absolute counts, percentages)
(di_counts1, di_perc1), (di_counts2, di_perc2) = [
    compute_frequencies(rna, 2) for rna in (rna_seq1, rna_seq2)
]

# Trinucleotide profiles, same layout as above
(tri_counts1, tri_perc1), (tri_counts2, tri_perc2) = [
    compute_frequencies(rna, 3) for rna in (rna_seq1, rna_seq2)
]

# n-nucleotides whose percentage differs at least 3X between the two files:
# dinucleotides first, then trinucleotides
threefold_diff_di, threefold_diff_tri = [
    compare_frequencies(perc1, perc2)
    for perc1, perc2 in ((di_perc1, di_perc2), (tri_perc1, tri_perc2))
]



In [None]:
# Show the DNA produced by reverse transcribing each RNA sequence
dna_report = (
    ("DNA Sequence 1:", dna_seq1),
    ("DNA Sequence 2:", dna_seq2),
)
print("Reverse Transcribed DNA Sequences:")
for label, dna in dna_report:
    print(label, dna)


Reverse Transcribed DNA Sequences:
DNA Sequence 1: CTACCCTAAC CCCAAAAGGG GAGGGTACAC GAGTTCTGAC CGCGATTTTC AAAACTCGAAGAGTTTTCAG ATCTCGGTGG CAGGTCCCTC GTCCATCGAC GACCCGAGGC CCCTGTGAAACGCAAGCCCG ACCCTCGCAC GAAAGGTGCT GCCACTGTGC GAAGGGACCT AACCCATTCGAGGACTGACT TGAACTACTC AGGAGAGACT CAGTGCCCGA GAGCCGAGGC ACATAAAAGTCGAGCCCTTT TAGCGACCCC GACCCCCACC CCGTCACCCC TGAATCGCTC AAACCCCCACTCACCCTACC TTCGAACCGA TCTCCCTAGT AGTATCCTCA ACGTAACAAC CCTCTGGACCCACATCTACT ACCCCTACAA TCCTGGTAGG CTTGAGTTTC AACTTGCGGA TCCGTCTCCTCACCTCGAAA CCCCTTGGAA CTCGGCCGGA TTTCGCATGA AGAAACGTGT AGGTGGGCCACGACCCGCAT CCCTTAGGGA CTTTATTTTC TACGTGTTTC GTAACTCCAG ACTCTGAAAACCTAGAGCTT TGTAACTCTT GAGTATCGAC ATATAAAATC TCGGGTACCG TAGGATCACTTTTGACCCCG AGGTAAGGCT TTACTAGTAA ACCCCCACTA GGCCCCTCGG GTTCGACGATTCCAGGGTGT TGAAGGCCTG GAAACAGGAA GGACCTCGCT AGAAAGGTCC GTCGGGGGCCGAGGCGATCT ACCTCTTTTA GGTTAACTTC CGACAGTCAG CACCTTCACT CTTCACGATTTGGTCCCCAA ACGGGCGGTC CGGCTCCTCC TGGCAGCGTT AGACTCTCCG GGCCGTCGGGACAATAACAA ACCGAGGTGT AAATGTAAAG ACGGAG

In [None]:
# Dinucleotide table for file 1: one row per dinucleotide with its
# absolute count and its share of all dinucleotides
print("\nDinucleotide Frequencies (File 1):")
print("Base\tAbsolute Frequency\tPercentage")
for dinuc, count in di_counts1.items():
    share = di_perc1[dinuc]
    print(f"{dinuc}\t{count}\t{share:.2f}%")




Dinucleotide Frequencies (File 1):
Base	Absolute Frequency	Percentage
CU	1183	6.18%
UA	957	5.00%
AC	1408	7.35%
CC	1478	7.72%
AA	1357	7.09%
CA	966	5.04%
AG	1107	5.78%
GG	1398	7.30%
GA	1433	7.48%
GU	1398	7.30%
CG	1176	6.14%
UU	1556	8.13%
UC	1492	7.79%
UG	973	5.08%
GC	425	2.22%
AU	841	4.39%


In [None]:
# Dinucleotide table for file 2: one row per dinucleotide with its
# absolute count and its share of all dinucleotides
print("\nDinucleotide Frequencies (File 2):")
print("Base\tAbsolute Frequency\tPercentage")
for dinuc, count in di_counts2.items():
    share = di_perc2[dinuc]
    print(f"{dinuc}\t{count}\t{share:.2f}%")




Dinucleotide Frequencies (File 2):
Base	Absolute Frequency	Percentage
CA	4695	4.79%
AC	6190	6.32%
CC	3767	3.85%
AA	10917	11.15%
AG	5623	5.74%
GG	4109	4.20%
GC	787	0.80%
CG	3270	3.34%
GA	6489	6.63%
CU	5218	5.33%
UG	5281	5.39%
UC	6205	6.34%
UU	11596	11.84%
GU	6898	7.04%
AU	8121	8.29%
UA	8750	8.94%


In [None]:
# Trinucleotide table for file 1: one row per trinucleotide with its
# absolute count and its share of all trinucleotides
print("\nTrinucleotide Frequencies (File 1):")
print("Base\tAbsolute Frequency\tPercentage")
for trinuc, count in tri_counts1.items():
    share = tri_perc1[trinuc]
    print(f"{trinuc}\t{count}\t{share:.2f}%")




Trinucleotide Frequencies (File 1):
Base	Absolute Frequency	Percentage
CUA	209	1.09%
UAC	224	1.17%
ACC	447	2.33%
CCC	412	2.15%
CCU	363	1.90%
UAA	277	1.45%
AAC	371	1.94%
CCA	290	1.51%
CAA	238	1.24%
AAA	521	2.72%
AAG	244	1.27%
AGG	343	1.79%
GGG	386	2.02%
GGA	434	2.27%
GAG	401	2.09%
GGU	438	2.29%
GUA	278	1.45%
ACA	297	1.55%
CAC	335	1.75%
ACG	307	1.60%
CGA	336	1.75%
AGU	318	1.66%
GUU	283	1.48%
UUC	301	1.57%
UCU	369	1.93%
CUG	186	0.97%
UGA	306	1.60%
GAC	478	2.50%
CCG	413	2.16%
CGC	123	0.64%
GCG	95	0.50%
GAU	233	1.22%
AUU	231	1.21%
UUU	733	3.83%
UCA	282	1.47%
ACU	356	1.86%
CUC	479	2.50%
UCG	361	1.89%
GAA	321	1.68%
AGA	357	1.86%
CAG	209	1.09%
AUC	210	1.10%
CGG	383	2.00%
GUG	335	1.75%
UGG	286	1.49%
GGC	140	0.73%
GCA	97	0.51%
GUC	502	2.62%
UCC	480	2.51%
CGU	334	1.74%
CAU	184	0.96%
GCC	139	0.73%
UGU	308	1.61%
AGC	89	0.46%
UGC	73	0.38%
GCU	94	0.49%
CUU	309	1.61%
UUG	237	1.24%
AUA	185	0.97%
UUA	285	1.49%
UAG	253	1.32%
AAU	221	1.15%
UAU	203	1.06%
AUG	215	1.12%


In [None]:
# Trinucleotide table for file 2: one row per trinucleotide with its
# absolute count and its share of all trinucleotides
print("\nTrinucleotide Frequencies (File 2):")
print("Base\tAbsolute Frequency\tPercentage")
for trinuc, count in tri_counts2.items():
    share = tri_perc2[trinuc]
    print(f"{trinuc}\t{count}\t{share:.2f}%")




Trinucleotide Frequencies (File 2):
Base	Absolute Frequency	Percentage
CAC	1048	1.07%
ACC	1384	1.41%
CCA	953	0.97%
CAA	1527	1.56%
AAG	2064	2.11%
AGG	1360	1.39%
GGC	218	0.22%
GCC	214	0.22%
CCC	843	0.86%
CCG	807	0.82%
CGA	1081	1.10%
GAC	1613	1.65%
CCU	1164	1.19%
CUG	804	0.82%
UGC	229	0.23%
CUC	1151	1.18%
UCA	1703	1.74%
ACU	1861	1.90%
CUU	1972	2.01%
UUC	2037	2.08%
CAG	789	0.81%
GGG	914	0.93%
GGU	1516	1.55%
GUG	1285	1.31%
GCA	200	0.20%
CAU	1331	1.36%
AUA	2511	2.56%
UAG	1458	1.49%
AGA	2135	2.18%
GAG	1312	1.34%
AGU	1959	2.00%
GUU	2061	2.10%
UUA	3007	3.07%
UAA	2972	3.04%
AAU	2722	2.78%
AUU	2711	2.77%
UCC	1326	1.35%
ACG	1107	1.13%
AAA	4388	4.48%
UAU	2534	2.59%
UAC	1786	1.82%
UCU	2014	2.06%
UUG	1700	1.74%
UGU	2237	2.28%
GUA	1941	1.98%
AUC	1407	1.44%
UGA	1812	1.85%
GAU	1534	1.57%
GCG	194	0.20%
CGU	1186	1.21%
UCG	1162	1.19%
AUG	1492	1.52%
UGG	1003	1.02%
GCU	179	0.18%
UUU	4852	4.96%
GGA	1461	1.49%
CGC	171	0.17%
CGG	832	0.85%
GAA	2030	2.07%
AAC	1743	1.78%
GUC	1610	1.64%
CUA	1291	1.32%
ACA	1838	1.8

In [None]:
# Dinucleotides flagged as differing at least 3X between the files,
# shown with their percentage in each file
print("\nThreefold Difference Dinucleotides:")
print("Base\tFile 1 Percentage\tFile 2 Percentage")
for dinuc, (pct_file1, pct_file2) in threefold_diff_di.items():
    print(f"{dinuc}\t{pct_file1:.2f}%\t{pct_file2:.2f}%")




Threefold Difference Dinucleotides:
Base	File 1 Percentage	File 2 Percentage


In [None]:
# Trinucleotides flagged as differing at least 3X between the files,
# shown with their percentage in each file
print("\nThreefold Difference Trinucleotides:")
print("Base\tFile 1 Percentage\tFile 2 Percentage")
for trinuc, (pct_file1, pct_file2) in threefold_diff_tri.items():
    print(f"{trinuc}\t{pct_file1:.2f}%\t{pct_file2:.2f}%")


Threefold Difference Trinucleotides:
Base	File 1 Percentage	File 2 Percentage
CGC	0.64%	0.17%
GCC	0.73%	0.22%
GGC	0.73%	0.22%


In [None]:
# Cross-check: print every dinucleotide's percentage in both files together
# with the raw fold ratio, so the threefold-difference results can be
# verified by eye.
print("\nComparison of Dinucleotide Percentages:")
print("Base\tFile 1 Percentage\tFile 2 Percentage\tRatio")
# Cover dinucleotides from BOTH files: file 1's keys in their existing order,
# followed by any that occur only in file 2 (these would otherwise be missed).
di_keys = list(di_perc1) + [key for key in di_perc2 if key not in di_perc1]
for k in di_keys:
    p1 = di_perc1.get(k, 0)  # 0 when the dinucleotide is absent from file 1
    p2 = di_perc2.get(k, 0)  # 0 when the dinucleotide is absent from file 2
    # "Inf" marks a dinucleotide missing from one file (no finite ratio)
    ratio = max(p1, p2) / min(p1, p2) if min(p1, p2) > 0 else "Inf"
    print(f"{k}\t{p1:.2f}%\t{p2:.2f}%\t{ratio}")



Comparison of Dinucleotide Percentages:
Base	File 1 Percentage	File 2 Percentage	Ratio
CU	6.18%	5.33%	1.1593402519584193
UA	5.00%	8.94%	1.7879932313979232
AC	7.35%	6.32%	1.1631674773459215
CC	7.72%	3.85%	2.0063609138745107
AA	7.09%	11.15%	1.5732336167228353
CA	5.04%	4.79%	1.052135743036752
AG	5.78%	5.74%	1.006722516495989
GG	7.30%	4.20%	1.7398078057088886
GA	7.48%	6.63%	1.1292722626837475
GU	7.30%	7.04%	1.0363685522844046
CG	6.14%	3.34%	1.8390342345381123
UU	8.13%	11.84%	1.457365113647068
UC	7.79%	6.34%	1.2295813921231404
UG	5.08%	5.39%	1.0613853340827364
GC	2.22%	0.80%	2.761496152885475
AU	4.39%	8.29%	1.8883533795547724


In [None]:
# Cross-check: print every trinucleotide's percentage in both files together
# with the raw fold ratio.
print("\nComparison of Trinucleotide Percentages:")
print("Base\tFile 1 Percentage\tFile 2 Percentage\tRatio")
# Build the union of keys as an ordered list (file 1's keys first, then those
# found only in file 2). Iterating a set of strings instead would print the
# rows in a different order on every kernel restart.
tri_keys = list(tri_perc1) + [key for key in tri_perc2 if key not in tri_perc1]
for k in tri_keys:
    p1 = tri_perc1.get(k, 0)  # 0 when the trinucleotide is absent from file 1
    p2 = tri_perc2.get(k, 0)  # 0 when the trinucleotide is absent from file 2
    # "Inf" marks a trinucleotide missing from one file (no finite ratio)
    ratio = max(p1, p2) / min(p1, p2) if min(p1, p2) > 0 else "Inf"
    print(f"{k}\t{p1:.2f}%\t{p2:.2f}%\t{ratio}")



Comparison of Trinucleotide Percentages:
Base	File 1 Percentage	File 2 Percentage	Ratio
AAC	1.94%	1.78%	1.0884914285019118
AUG	1.12%	1.52%	1.3570063260849168
CGG	2.00%	0.85%	2.354094749227635
UAU	1.06%	2.59%	2.4409679753903366
GAA	1.68%	2.07%	1.2366377920451381
ACC	2.33%	1.41%	1.6516572345205076
AUU	1.21%	2.77%	2.2949279048242435
CCA	1.51%	0.97%	1.5561576363048772
GCA	0.51%	0.20%	2.480220138925158
GUU	1.48%	2.10%	1.4241084563682822
GAC	2.50%	1.65%	1.5154514241967336
AGA	1.86%	2.18%	1.1694486914921205
UUG	1.24%	1.74%	1.402658941030184
UAG	1.32%	1.49%	1.1269081293587908
CUG	0.97%	0.82%	1.1830562287533453
GAG	2.09%	1.34%	1.5630001818408796
CAG	1.09%	0.81%	1.3546209061067982
CUC	2.50%	1.18%	2.1281815840016316
UCU	1.93%	2.06%	1.0672953298495864
ACU	1.86%	1.90%	1.022228262646976
UAA	1.45%	3.04%	2.0980727592690265
CAU	0.96%	1.36%	1.4145286284243876
ACG	1.60%	1.13%	1.4182057621136785
AAA	2.72%	4.48%	1.6469499623777641
GGG	2.02%	0.93%	2.1596809619551784
UCG	1.89%	1.19%	1.58872805534713
CGA	1.7