The sequences.csv file was downloaded from NCBI and contains information regarding all the complete SARS-CoV-2 genomes downloaded from NCBI.
We load it and remove those genomes that do not have a proper collection date (i.e. a date that is missing or shorter than 7 characters, the length of `YYYY-MM`).

In [None]:
import pandas as pd

# Load the genome metadata table downloaded from NCBI
all_genomes_metadata_df = pd.read_csv('sequences.csv')
print("Initial number of genomes:",len(all_genomes_metadata_df))

# Keep only the records whose collection date is present and at least
# 7 characters long (the length of a year-month string such as 'YYYY-MM')
collection_dates = all_genomes_metadata_df['Collection_Date']
has_proper_date = collection_dates.notna() & (collection_dates.str.len() >= 7)
all_genomes_metadata_df = all_genomes_metadata_df[has_proper_date]

print("Number of genomes after filtering:",len(all_genomes_metadata_df))

We now sort the dataframe by the collection dates of the genomes and remove the reference genome from the sampling population, as we are going to add it back later and use it as the outgroup for the tree construction.

In [None]:
# Order the records by collection date, then exclude the reference genome
# (accession 'NC_045512.2') from the sampling population.
all_genomes_metadata_df = (
    all_genomes_metadata_df
    .sort_values(by='Collection_Date')
    .loc[lambda sorted_df: sorted_df['Accession'] != 'NC_045512.2']
)


Next, we divide the time span from the start of the pandemic into intervals of 6 months, and then we assign the matching interval to each of our records.

In [None]:
import pandas as pd

# Divide the time span from the start of the pandemic up to the cut-off date
# into consecutive 6-month windows, and label each record with its window.

# First day of the first window.
_TIMEFRAME_START = pd.Timestamp('2019-12-01')
# Hard-coded cut-off that is treated as "present"; the last window is trimmed to it.
_TIMEFRAME_END = pd.Timestamp("2024-05-26")


def _build_6month_timeframes(start_date, end_date):
    """Split the span start_date..end_date into consecutive 6-month windows.

    Parameters
    ----------
    start_date : pd.Timestamp
        First day of the first window.
    end_date : pd.Timestamp
        Last day that any window may cover.

    Returns
    -------
    list
        Two-item lists ``[first_day, last_day]`` (both inclusive), in
        chronological order. The list is empty if ``start_date > end_date``.
    """
    windows = []
    window_start = start_date
    while window_start <= end_date:
        next_start = window_start + pd.DateOffset(months=6)
        # A window ends the day before the next one starts. Only the final
        # window can run past end_date, so min() trims exactly that one.
        window_end = min(next_start - pd.DateOffset(days=1), end_date)
        windows.append([window_start, window_end])
        window_start = next_start
    return windows


# Built once when this cell runs, so the function below needs no
# "already initialised" flag and cannot be called before the windows exist.
timeframes = _build_6month_timeframes(_TIMEFRAME_START, _TIMEFRAME_END)


def assign_6month_timeframe(collection_date):
    """Return the label of the 6-month window that contains a collection date.

    Parameters
    ----------
    collection_date : str or datetime-like
        Any value accepted by ``pd.Timestamp`` (for example ``'2020-03-15'``
        or the partial date ``'2020-03'``).

    Returns
    -------
    str or None
        ``'<first_day>--<last_day>'`` of the matching window, or ``None``
        when the date lies outside every window.
    """
    collection_date = pd.Timestamp(collection_date)
    for window_start, window_end in timeframes:
        if window_start <= collection_date <= window_end:
            return f"{window_start.date()}--{window_end.date()}"
    # Dates before the first window or after the cut-off get no label.
    return None

# Label every record with the 6-month window its collection date falls into
period_labels = all_genomes_metadata_df["Collection_Date"].apply(assign_6month_timeframe)
all_genomes_metadata_df["6_Month_Period"] = period_labels


We now visualize the number of genomes that were sequenced in each of these timeframes

In [None]:
# Number of genomes that fall into each 6-month window
genome_counts_per_timeframe = all_genomes_metadata_df.groupby(by='6_Month_Period').size()
total_genomes = genome_counts_per_timeframe.sum()

print(genome_counts_per_timeframe)
print("Total Genomes:", total_genomes)

These distributions are unequal across the different timeframes. Hence, to ensure proper representation and to capture the mutation dynamics of the virus across each of these timeframes, we sample 2000 genome sequences for each time period, chosen at equidistant positions in the collection-date ranking.

In [None]:
import pandas as pd
import numpy as np

# Function to generate a DataFrame of up to n equidistant genomes for each 6-month period
def equidistant_genomes(df, n):
    """Sample genomes at evenly spaced ranks within every 6-month period.

    For each value of the '6_Month_Period' column, the rows are sorted by
    'Collection_Date' and rows are picked at evenly spaced positions of
    that ordering, from the first row to the last.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns '6_Month_Period' and 'Collection_Date'.
        Rows whose '6_Month_Period' is missing belong to no group and are
        therefore never sampled.
    n : int
        Number of genomes to take per period. A period holding fewer than
        ``n`` genomes contributes all of them exactly once.

    Returns
    -------
    pd.DataFrame
        The sampled rows with the same columns as ``df`` and a fresh
        0..N-1 index; empty (columns preserved) when there are no groups.
    """
    sampled_sets = []
    # We access the genome sets for each time period
    for _timeframe, genome_set in df.groupby('6_Month_Period'):
        # We sort them based on their collection dates
        genome_set = genome_set.sort_values(by='Collection_Date')
        # Cap the sample size at the group size: asking linspace for more
        # points than there are rows would repeat indices and therefore
        # duplicate genomes in the sample.
        sample_size = min(n, len(genome_set))
        indices = np.linspace(0, len(genome_set) - 1, sample_size).astype(int)
        sampled_sets.append(genome_set.iloc[indices])

    if not sampled_sets:
        return pd.DataFrame(columns=df.columns)
    # Concatenate once, rather than growing a DataFrame inside the loop.
    return pd.concat(sampled_sets, ignore_index=True)

# Draw the per-period sample (n=2000) from the full metadata table
sampled_genomes_df = equidistant_genomes(all_genomes_metadata_df, n=2000)

# Tally how many genomes each 6-month window contributed to the sample
genome_counts_per_timeframe = sampled_genomes_df.groupby(by='6_Month_Period').size()
total_sampled = genome_counts_per_timeframe.sum()
print("Total Genomes sampled:", total_sampled)
genome_counts_per_timeframe

We now access the "sequences.fasta" file, which contains the genome sequences of all the genomes in the initial population. We then create a new multi-FASTA file containing the genome sequences of the genomes that we sampled in the previous step. We also add the reference genome of the Wuhan-Hu-1 strain to this file, as it will be used later in the tree construction process.

In [None]:
# Accession of the reference genome; it goes first in the list
reference_id = "NC_045512.2"

# Reference genome followed by the accessions of all sampled genomes
accessions = [reference_id, *sampled_genomes_df['Accession'].tolist()]

from Bio import SeqIO
# Function to create the new multifasta file containing the sampled genome sequences
def filter_fasta(input_file, output_file, sequence_ids):
    """Copy the FASTA records whose ID is in ``sequence_ids`` to a new file.

    Records are streamed from ``input_file`` and written to ``output_file``
    in the order in which they occur in the input, not in the order of
    ``sequence_ids``. IDs that never occur in the input are silently skipped.

    Parameters
    ----------
    input_file : str
        Path of the FASTA file to read.
    output_file : str
        Path of the FASTA file to create (overwritten if it exists).
    sequence_ids : iterable of str
        Record IDs (``record.id``) to keep.
    """
    # A set gives constant-time membership tests; scanning a list for every
    # record is needlessly slow when both the file and the ID list are large.
    wanted_ids = set(sequence_ids)

    with open(input_file, "r") as input_handle, open(output_file, "w") as output_handle:
        # Lazily select the matching records so the whole input is never
        # held in memory, and hand them to a single write call.
        selected_records = (
            record
            for record in SeqIO.parse(input_handle, "fasta")
            if record.id in wanted_ids
        )
        SeqIO.write(selected_records, output_handle, "fasta")

# Source FASTA file with every genome of the initial population
input_file = "sequences.fasta"
# Output name: number of sampled genomes, then "+1" for the reference genome
output_file = f"SARS-CoV-2_{len(accessions)-1}+1.fasta"

filter_fasta(input_file=input_file, output_file=output_file, sequence_ids=accessions)

Finally, we create and save the metadata file containing information about these sampled genomes

In [None]:
# Metadata rows of the genomes whose accession is in the sampled list
# NOTE(review): the reference genome row appears to have been dropped from
# all_genomes_metadata_df earlier, so it will not be part of this CSV even
# though the file name ends in "+1" - confirm this is intended.
is_sampled = all_genomes_metadata_df['Accession'].isin(accessions)
sampled_genome_metadata_df = all_genomes_metadata_df[is_sampled]

# Write the metadata next to the FASTA file, using the same naming scheme
metadata_file = f"SARS-CoV-2_{len(accessions)-1}+1.csv"
sampled_genome_metadata_df.to_csv(metadata_file, index=False)
