The sequence.csv file was downloaded from NCBI and contains information about all the complete SARS-CoV-2 genomes downloaded from NCBI.
We load it and remove those genomes that do not have a proper collection date.

In [None]:
import pandas as pd
from Bio import SeqIO
import os

In [None]:

# Load the metadata table that describes every downloaded genome
all_genomes_metadata_df = pd.read_csv('../SARS-CoV-2_genomes_metadata.csv')
print("Initial number of genomes:", len(all_genomes_metadata_df))

# Keep only the records whose collection date is present and at least
# 7 characters long (7 characters is the length of a "YYYY-MM" string)
collection_dates = all_genomes_metadata_df['Collection_Date']
has_usable_date = collection_dates.notna() & (collection_dates.str.len() >= 7)
all_genomes_metadata_df = all_genomes_metadata_df[has_usable_date]

print("Number of genomes after filtering:", len(all_genomes_metadata_df))

In [None]:
# Inspect the filtered metadata table (a bare expression: shown as cell output when run in a notebook)
all_genomes_metadata_df

In [None]:
# Report the smallest and largest collection-date strings left after filtering
collection_dates = all_genomes_metadata_df["Collection_Date"]
earliest_collection_date = min(collection_dates)
latest_collection_date = max(collection_dates)
print("The first collection date:" + earliest_collection_date)
print("The most recent collection date:" + latest_collection_date)

We now sort the dataframe by the collection dates of the genomes and remove the reference genome from the sampling population, as we are going to add it explicitly later and use it as the outgroup for the tree construction.

In [None]:
# Order the records by their collection date
chronological_df = all_genomes_metadata_df.sort_values(by='Collection_Date')

# Exclude the reference genome (Accession 'NC_045512.2') from the sampling population
not_reference = chronological_df['Accession'].ne('NC_045512.2')
all_genomes_metadata_df = chronological_df[not_reference]

Next, we divide the time span from the start of the pandemic (December 2019) to November 2024 into 20 timeframes of 3 months each, and then assign each record to the timeframe that contains its collection date.

In [None]:
# Splits the period 2019-12-01 .. 2024-11-30 into consecutive n-month timeframes (built once) and maps a collection date onto the timeframe that contains it
def assign_n_month_timeframe(collection_date):
    """
    Look up the timeframe that a collection date belongs to.

    The function relies on three module-level globals:
        - n (int): length of each timeframe in months (read only).
        - has_executed (bool): when False, the timeframe list is (re)built and the flag is set to True.
        - timeframes (list): list of [start, end] timestamp pairs, created by this function.

    Parameters:
        - collection_date (str) : The collection date of the genome sequence

    Returns:
        - str: the matching timeframe formatted as "<start date>--<end date>",
               or None when the date lies outside every timeframe
    """
    global n, has_executed, timeframes

    # Build the list of timeframes on the first call only
    if not has_executed:
        first_day = pd.Timestamp('2019-12-01')
        last_day = pd.Timestamp("2024-11-30")
        step = pd.DateOffset(months=n)
        one_day_back = pd.DateOffset(days=-1)

        timeframes = []
        window_start = first_day
        # Each window runs from its start up to the day before the next window starts
        while not window_start > last_day:
            following_start = window_start + step
            timeframes.append([window_start, following_start + one_day_back])
            window_start = following_start

        # The final window must not reach past the overall end date
        timeframes[-1][-1] = min(timeframes[-1][-1], last_day)
        has_executed = True

    stamp = pd.Timestamp(collection_date)

    # Pick the first window whose bounds (inclusive) enclose the date
    enclosing = next((window for window in timeframes if window[0] <= stamp <= window[1]), None)
    if enclosing is None:
        return None
    window_begin, window_end = enclosing
    return f"{window_begin.date()}--{window_end.date()}"
    
# Length of each timeframe in months (read as a global by assign_n_month_timeframe)
n = 3
# Reset the flag so that assign_n_month_timeframe builds its timeframe list for this value of n
has_executed = False

# Assign a time period to every record; dates that fall outside all timeframes get None.
# `assign` returns a new dataframe instead of writing a column into a dataframe that was
# produced by boolean filtering, which can raise pandas' SettingWithCopyWarning.
all_genomes_metadata_df = all_genomes_metadata_df.assign(
    **{f"{n}_Month_Period": all_genomes_metadata_df["Collection_Date"].apply(assign_n_month_timeframe)}
)

# Save this filtered metadata for later use
all_genomes_metadata_df.to_csv('../SARS-CoV-2_genomes_metadata_filtered.csv', index=False)



We now display the number of genomes collected in each of these timeframes.

In [None]:
# Count how many records fall into each timeframe and show the per-timeframe and overall totals
period_column = f'{n}_Month_Period'
genome_counts_per_timeframe = all_genomes_metadata_df.groupby(period_column).size()
print(genome_counts_per_timeframe)
print("Total Genomes:", genome_counts_per_timeframe.sum())

Because the genomes are unevenly distributed across the timeframes, we randomly sample genome sequences from each timeframe, so that every period is properly represented and the mutation dynamics of the virus across these timeframes are captured.

In [None]:
# Function to randomly sample genomes for each n-Month Period
def random_sample_genomes(df, no_of_genomes_per_timeframe, timeframe_col, random_state=None):
    """
    Create a sample genome set by randomly sampling a fixed number of genomes within each timeframe.

    Rows are grouped by `timeframe_col`; with groupby's default settings, rows whose
    timeframe value is missing do not belong to any group and are therefore never sampled.

    Parameters:
        - df (pandas dataframe): The dataframe containing information about all the genomes from which to sample.
        - no_of_genomes_per_timeframe (int): Number of genomes to sample from each timeframe.
          Timeframes holding fewer genomes contribute all of their genomes.
        - timeframe_col (str): The column in the dataframe containing the assigned timeframes.
        - random_state (int, numpy RandomState/Generator or None): Optional source of randomness.
          An int seeds a single generator that is shared by all timeframes, which makes the
          sample reproducible. None (default) keeps the unseeded behaviour.

    Returns:
        - timewise_random_genomes_df (pandas dataframe): Contains information about the sampled genome set,
          with a fresh 0..N-1 index. Empty (but with the columns of df) when there is nothing to sample.
    """
    rng = random_state
    if isinstance(random_state, int):
        # One generator shared by all groups, so that the groups are not all
        # sampled with the identical seed
        import numpy as np
        rng = np.random.RandomState(random_state)

    # Collect the per-timeframe samples and concatenate them once at the end.
    # Growing a dataframe inside the loop copies it on every iteration, and starting
    # from an empty object-dtype frame can change the dtypes of the result.
    sampled_groups = []
    for _, genome_set in df.groupby(timeframe_col):
        # If the group is smaller than the requested sample size, take all of its genomes
        sample_size = min(len(genome_set), no_of_genomes_per_timeframe)
        sampled_groups.append(genome_set.sample(n=sample_size, replace=False, random_state=rng))

    if not sampled_groups:
        return pd.DataFrame(columns=df.columns)

    timewise_random_genomes_df = pd.concat(sampled_groups, ignore_index=True)
    return timewise_random_genomes_df




We now access the multifasta file which contains the genome sequences of all the genomes in the initial population. We then create a new multifasta file which contains the genome sequences of the genomes that we sampled. We also add the reference genome of the Wuhan-Hu-1 strain to this file, as it will be used later in the tree construction process.

In [None]:

# Function to create the new multifasta file containing the sampled genome sequences
def filter_fasta(input_file, output_file, sequence_ids):
    """
    Writes a new multi-fasta file containing only the desired sequences of the original multi-fasta file.

    Records are written in the order in which they occur in `input_file`, not in the
    order of `sequence_ids`. Ids that do not occur in the input file are skipped silently.

    Parameters:
        - input_file (str): Path to the initial multi-fasta file to subset from.
        - output_file (str): Path to the new multi-fasta file (overwritten if it exists).
        - sequence_ids (list): the ids of the sequences to keep; compared against each record's `id`.

    """
    # A set gives constant-time membership tests; with a list, every record of the
    # input file would be compared against the whole list of ids
    wanted_ids = set(sequence_ids)

    with open(input_file, "r") as input_handle, open(output_file, "w") as output_handle:
        # Stream the input record by record so the whole file is never held in memory
        for record in SeqIO.parse(input_handle, "fasta"):
            if record.id in wanted_ids:
                SeqIO.write(record, output_handle, "fasta")



Finally, we create and save 30 sample genome sets, each containing up to 200 randomly sampled genomes from each of the 20 timeframes, along with the associated metadata.

In [None]:

# Number of sample genome sets to generate
no_of_samples = 30
# Number of months per timeframe
n = 3
# Maximum number of genomes to sample per timeframe
no_of_genomes_per_timeframe = 200

# Multifasta file containing all the genomes of SARS-CoV-2
input_file = "../SARS-CoV-2_genomes.fasta"

# Accession of the reference genome that is added to every sample
reference_id = "NC_045512.2"

# The timeframe of a record does not depend on the sample being drawn, so it is assigned
# once here instead of on every iteration of the loop below.
timeframe_col = f"{n}_Month_Period"
# Reset the flag so that assign_n_month_timeframe rebuilds its timeframe list for this n
has_executed = False
# `assign` returns a new dataframe rather than writing a column into a filtered dataframe
all_genomes_metadata_df = all_genomes_metadata_df.assign(
    **{timeframe_col: all_genomes_metadata_df["Collection_Date"].apply(assign_n_month_timeframe)}
)

# We iterate through the number of samples we want to create
for sample_no in range(1, no_of_samples + 1):

    # Draw a fresh random sample of genomes from every timeframe
    sampled_genomes_df = random_sample_genomes(all_genomes_metadata_df, no_of_genomes_per_timeframe, timeframe_col)

    # Create the output directory along with any necessary intermediate directories.
    # exist_ok=True lets the cell be re-run; files of an earlier run are then overwritten.
    sample_dir = f"../Samples/Sample_{sample_no}"
    os.makedirs(sample_dir, exist_ok=True)

    # Sequence ids to extract: the reference genome plus the sampled genomes.
    # NOTE: filter_fasta writes records in the order they occur in input_file, so the
    # position of reference_id in this list does not decide where it ends up in the output.
    accessions = [reference_id] + sampled_genomes_df['Accession'].tolist()

    # Common prefix of both output files: "<number of sampled genomes>+1" (the +1 is the reference genome)
    file_prefix = f"{sample_dir}/SARS-CoV-2_Sample_{sample_no}_{len(accessions)-1}+1"

    # Write the multifasta file containing the sampled genomes
    output_file = f"{file_prefix}.fasta"
    filter_fasta(input_file, output_file, accessions)

    # Write the metadata of this sample to a csv file
    sampled_genomes_df.to_csv(f"{file_prefix}_metadata.csv", index=False)

    # Print completion message
    print(f"Sample {sample_no} Generated Successfully")




