The sequence.csv file was downloaded from NCBI and contains metadata for all the complete SARS-CoV-2 genomes without any ambiguous characters.
First, we load it and remove the genomes that do not have a proper collection date.

In [1]:
import pandas as pd

def remove_genomes_with_no_collection_date(metadata_file):
    """Load the genome metadata and keep only rows with a usable collection date.

    A collection date counts as usable when it is present and at least
    7 characters long (e.g. "2019-12" or "2020-01-19"); missing values and
    shorter values such as a bare year are dropped. The row counts before
    and after filtering are printed.

    Parameters
    ----------
    metadata_file : str, path-like or file-like
        CSV source accepted by ``pandas.read_csv``. It must contain a
        ``Collection_Date`` column.

    Returns
    -------
    pandas.DataFrame
        The rows that pass the filter, in their original order and with
        their original index labels.
    """
    # Read the file in a single pass (low_memory=False) so every column gets
    # one consistent dtype instead of chunk-by-chunk guesses, which is what
    # produces pandas' mixed-type DtypeWarning on large files.
    # Collection_Date is forced to text so the .str accessor below works even
    # if every value in the column happens to look numeric (e.g. bare years).
    all_genomes_metadata_df = pd.read_csv(
        metadata_file,
        dtype={'Collection_Date': str},
        low_memory=False,
    )
    print("Initial number of genomes:",len(all_genomes_metadata_df))

    # Filter out data points that lack a proper collection date
    collection_dates = all_genomes_metadata_df['Collection_Date']
    has_usable_date = collection_dates.notna() & (collection_dates.str.len() >= 7)
    all_genomes_metadata_df = all_genomes_metadata_df[has_usable_date]

    print("Number of genomes after filtering:",len(all_genomes_metadata_df))

    return all_genomes_metadata_df


# Location of the metadata table that describes every downloaded genome
metadata_file = "../SARS-COV-2_Genomic_data/SARS-CoV-2_1063493_metadata.csv"
# Load the table and discard the entries without a usable collection date
all_genomes_metadata_df = remove_genomes_with_no_collection_date(metadata_file=metadata_file)


  all_genomes_metadata_df = pd.read_csv(metadata_file)


Initial number of genomes: 1063493
Number of genomes after filtering: 1059141


We now sort the dataframe based on the collection dates of the genomes and remove the reference genome from the sampling population, as we are going to add it back later and use it as the outgroup for the tree construction.

In [2]:
# Order the genomes by collection date, then leave out the row whose
# 'Accession' is 'NC_045512.2'
all_genomes_metadata_df = (
    all_genomes_metadata_df
    .sort_values(by='Collection_Date')
    .loc[lambda frame: frame['Accession'] != 'NC_045512.2']
)

We sort the genomes based on their collection dates and shortlist time-wise evenly distributed genomes to be used in our study.

In [3]:
def sample_genomes_timewise(all_genomes_metadata_df,n):
    """Return every n-th genome after ordering the table by collection date.

    Parameters
    ----------
    all_genomes_metadata_df : pandas.DataFrame
        Metadata table with a 'Collection_Date' column.
    n : int
        Step between consecutive picks; the first row of the date-ordered
        table is always included.

    Returns
    -------
    pandas.DataFrame
        The picked rows, in date order, keeping their original index labels.
        The input frame itself is left unsorted.
    """
    chronological = all_genomes_metadata_df.sort_values(by='Collection_Date')
    every_nth = slice(None, None, n)
    return chronological.iloc[every_nth]

# Sampling step: keep one genome out of every n rows of the date-sorted table
n = 50
sampled_genomes_df = sample_genomes_timewise(all_genomes_metadata_df, n=n)
    


In [4]:
# Show the sampled metadata table as this cell's output
sampled_genomes_df

Unnamed: 0,Accession,Organism_Name,Assembly,Submitters,Organization,Org_location,Release_Date,Pangolin,PangoVersions,Surveillance_Sampling,Isolate,Species,Length,Nuc_Completeness,Geo_Location,Country,USA,Host,Tissue_Specimen_Source,Collection_Date
1063492,MN908947.3,Severe acute respiratory syndrome coronavirus 2,set:MN908947,"Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Son...","Fudan University, Shanghai Public Health Clini...",China,2020-01-12,B,4.3.1/1.29/v0.1.12/0.3.19/0.6.2,,Wuhan-Hu-1,Severe acute respiratory syndrome-related coro...,29903,complete,China,China,,Homo sapiens,,2019-12
1063393,MT246667.1,Severe acute respiratory syndrome coronavirus 2,set:MT246667,"Thornburg,N., Bradford,R., Rashid,S., Flores,B...","US Food and Drug Administration, Center for De...",USA,2020-03-26,A,4.3.1/1.29/v0.1.12/0.3.19/0.6.2,,USA-WA1,Severe acute respiratory syndrome-related coro...,29867,complete,USA: WA,USA,WA,Homo sapiens,oronasopharynx,2020-01-19
731870,OM403303.1,Severe acute respiratory syndrome coronavirus 2,,"Pui,K., Cheng,S., Gu,H., Poon,L., Peiris,M., C...","The University of Hong Kong, School of Public ...",Hong Kong,2022-01-27,A,4.3.1/1.29/v0.1.12/0.3.19/0.6.2,,Original,Severe acute respiratory syndrome-related coro...,29851,complete,Hong Kong,Hong Kong,,Homo sapiens,,2020-01-23
1063448,MT123292.2,Severe acute respiratory syndrome coronavirus 2,set:MT123292,"Huang,J., Shi,Y., Sun,J., Zheng,K., Zhu,A., Su...","Guangzhou Customs, Technology Centre, Guangzho...",China,2020-02-28,B,4.3.1/1.29/v0.1.12/0.3.19/0.6.2,,IQTC04,Severe acute respiratory syndrome-related coro...,29923,complete,China: Guangzhou,China,,Homo sapiens,"lung, oronasopharynx",2020-01-27
928040,MT509662.1,Severe acute respiratory syndrome coronavirus 2,,"Gruber,C.E., Rueca,M., Bartolini,B., Messina,F...","INMI Lazzaro Spallanzani IRCCS, Laboratory of ...",Italy,2021-06-30,B,4.3.1/1.29/v0.1.12/0.3.19/0.6.2,,INMI2-N,Severe acute respiratory syndrome-related coro...,29834,complete,Italy: Lazio,Italy,,Homo sapiens,,2020-01-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,PQ237423.1,Severe acute respiratory syndrome coronavirus 2,,"Howard,D., Batra,D., Cook,P.W., Caravas,J., Ra...","Centers for Disease Control and Prevention, Re...",USA,2024-08-27,KP.3.1.1,4.3.1/1.29/v0.1.12/0.3.19/0.6.2,True,AZ-CDC-LC1115136,Severe acute respiratory syndrome-related coro...,29688,complete,USA: Arizona,USA,AZ,Homo sapiens,oronasopharynx,2024-08-13
494,PQ238425.1,Severe acute respiratory syndrome coronavirus 2,,"Reeb,V., Twait,E., Yakkos,A., Eveland,K., Benf...","University of Iowa, State Hygienic Laboratory",USA,2024-08-27,,4.3.1/1.29/v0.1.12/0.3.19/0.6.2,True,2481404,Severe acute respiratory syndrome-related coro...,29756,complete,USA: Iowa,USA,IA,Homo sapiens,,2024-08-13
299,PQ237484.1,Severe acute respiratory syndrome coronavirus 2,,"Howard,D., Batra,D., Cook,P.W., Caravas,J., Ra...","Centers for Disease Control and Prevention, Re...",USA,2024-08-27,KP.2.9,4.3.1/1.29/v0.1.12/0.3.19/0.6.2,True,LA-CDC-LC1115430,Severe acute respiratory syndrome-related coro...,29691,complete,USA: Louisiana,USA,LA,Homo sapiens,oronasopharynx,2024-08-14
376,PQ237580.1,Severe acute respiratory syndrome coronavirus 2,,"Howard,D., Batra,D., Cook,P.W., Caravas,J., Ra...","Centers for Disease Control and Prevention, Re...",USA,2024-08-27,JN.1.11.1,4.3.1/1.29/v0.1.12/0.3.19/0.6.2,True,NC-CDC-LC1115487,Severe acute respiratory syndrome-related coro...,29598,complete,USA: North Carolina,USA,NC,Homo sapiens,oronasopharynx,2024-08-16


We take these shortlisted genomes and subset the original FASTA file containing all the SARS-CoV-2 genome sequences so that it only contains these sequences.

In [5]:
# Accession of the reference genome; it is placed first in the list,
# followed by the accessions of the sampled genomes
reference_id = "NC_045512.2"
accessions = [reference_id, *sampled_genomes_df['Accession'].tolist()]

from Bio import SeqIO
#Function to create the new multifasta file containing the sampled genome sequences
def filter_fasta(input_file, output_file, sequence_ids):
    """Copy the FASTA records whose ID is in ``sequence_ids`` to a new file.

    The input is streamed record by record, so the whole multi-FASTA file is
    never held in memory. Records are written in the order in which they
    appear in ``input_file``; the order of ``sequence_ids`` does not affect
    the order of the output.

    Parameters
    ----------
    input_file : str or path-like
        Path of the multi-FASTA file to read.
    output_file : str or path-like
        Path of the FASTA file to create (overwritten if it already exists).
    sequence_ids : iterable of str
        Record IDs to keep; each record's ``id`` is compared against them.

    Returns
    -------
    None
    """
    # Build the lookup once: set membership is constant time, whereas testing
    # against a list rescans the whole list for every record in the file.
    wanted_ids = set(sequence_ids)

    # The input is opened first, so a missing input file fails before the
    # output file is created or truncated.
    with open(input_file, "r") as input_handle, open(output_file, "w") as output_handle:
        matching_records = (
            record
            for record in SeqIO.parse(input_handle, "fasta")
            if record.id in wanted_ids
        )
        # SeqIO.write consumes the generator lazily, one record at a time
        SeqIO.write(matching_records, output_handle, "fasta")

# Multi-FASTA file to read the genome sequences from
input_file = "../SARS-COV-2_Genomic_data/SARS-CoV-2_1063493_genomes.fasta"
# The output name carries the number of sampled genomes; the "+1" is literal text
output_file = f"SARS-CoV-2_{len(accessions)-1}+1.fasta"

# Write the subset of sequences whose IDs are listed in `accessions`
filter_fasta(input_file=input_file, output_file=output_file, sequence_ids=accessions)