In [None]:
# Ch07-1-genomes

In [6]:
# Download T2T reference genome
! mkdir -p data
! wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz
! gunzip GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz
! mv GCA_009914755.4_T2T-CHM13v2.0_genomic.fna data/T2T_genome.fasta

--2025-01-06 23:18:39--  https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::10, 2607:f220:41e:250::31, 2607:f220:41e:250::11, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932696125 (889M) [application/x-gzip]
Saving to: ‘GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz’


2025-01-06 23:19:13 (26.3 MB/s) - ‘GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz’ saved [932696125/932696125]



In [None]:
# Install pyfastx
! pip install pyfastx

In [12]:
import pyfastx

# Input genome file
genome_fasta = "data/T2T_genome.fasta"

def compute_genome_size(fasta_file):
    """
    Compute the total genome size from a FASTA file.
    """
    genome_size = 0
    genome = pyfastx.Fasta(fasta_file, build_index=False)
    for _, seq in genome:  # Use the sequence directly from the tuple
        genome_size += len(seq)
    return genome_size

def compute_gc_content(fasta_file):
    """
    Compute the overall GC content of the genome.
    """
    total_bases = 0
    gc_count = 0
    genome = pyfastx.Fasta(fasta_file, build_index=False)
    for _, seq in genome:  # Use the sequence directly from the tuple
        total_bases += len(seq)
        gc_count += seq.count('G') + seq.count('C')
    return (gc_count / total_bases) * 100 if total_bases > 0 else 0

def compute_n50(fasta_file):
    """
    Compute the N50 value for the genome.
    """
    lengths = []
    genome = pyfastx.Fasta(fasta_file, build_index=False)
    lengths = sorted([len(seq) for _, seq in genome], reverse=True)
    
    cumulative_length = 0
    total_length = sum(lengths)
    for length in lengths:
        cumulative_length += length
        if cumulative_length >= total_length / 2:
            return length
    return 0

def assess_quality(fasta_file):
    """
    Assess the quality of a T2T genome by calculating key metrics.
    """
    genome_size = compute_genome_size(fasta_file)
    gc_content = compute_gc_content(fasta_file)
    n50 = compute_n50(fasta_file)

    print(f"Genome Quality Metrics for {fasta_file}:")
    print(f"Total Genome Size: {genome_size:,} bp")
    print(f"GC Content: {gc_content:.2f}%")
    print(f"N50: {n50:,} bp")

# Run the quality assessment
if __name__ == "__main__":
    assess_quality(genome_fasta)


Genome Quality Metrics for data/T2T_genome.fasta:
Total Genome Size: 3,117,292,070 bp
GC Content: 25.32%
N50: 150,617,247 bp
