In [4]:
pip install biopython

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   -------------- ------------------------- 1.0/2.8 MB 12.7 MB/s eta 0:00:01
   -------------- ------------------------- 1.0/2.8 MB 12.7 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 4.4 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85


In [1]:
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
import pandas as pd

motif = "ATG"
stop_codons = ["TAA", "TAG", "TGA"]

try:
    with open("human.txt", "r") as file:
        sequences = [line.strip() for line in file if line.strip()]
except FileNotFoundError:
    print("Error: The file 'human.txt' was not found.")
    sequences = []

if not sequences:
    print("No sequences to analyze. Please check your file.")
else:
    df = pd.DataFrame(sequences, columns=['sequence'])
    print("Original DataFrame (first 5 rows):")
    print(df.head())
    # print("-" * 30)

    def analyze_sequence(seq_str):
        """Analyzes a single DNA sequence to find motifs, GC content, and coding regions."""
        sequence = Seq(seq_str)

        positions = [i + 1 for i in range(len(sequence) - len(motif) + 1) if sequence[i:i + len(motif)] == motif]
        gc_content = gc_fraction(sequence) * 100

        coding_region = ""
        start_pos = seq_str.find("ATG")
        
        if start_pos != -1:
            for i in range(start_pos + 3, len(seq_str), 3):
                codon = seq_str[i:i + 3]
                
                if codon in stop_codons:
                    coding_region = seq_str[start_pos:i + 3]
                    break
        
        return pd.Series([positions, gc_content, coding_region], index=['Motif_Positions', 'GC_Content', 'Coding_Region'])

    df[['Motif_Positions', 'GC_Content', 'Coding_Region']] = df['sequence'].apply(analyze_sequence)
    print("Analyzed DataFrame (first 5 rows):")
    print(df.head())

Original DataFrame (first 5 rows):
                                            sequence
0                                    sequence\tclass
1  ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...
2  ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...
3  ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...
4  ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...
Analyzed DataFrame (first 5 rows):
                                            sequence  \
0                                    sequence\tclass   
1  ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...   
2  ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...   
3  ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...   
4  ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...   

                                     Motif_Positions  GC_Content  \
0                                                 []   71.428571   
1                                       [1, 24, 162]   39.613527   
2                  [1, 141, 169, 310, 324, 368, 511]   44.199706   
3  