The sequence.csv file was downloaded from NCBI and contains information about all the complete SARS-CoV-2 genomes downloaded from NCBI.
We read it and remove those genomes that do not have a proper collection date.

In [1]:
import pandas as pd
from Bio import SeqIO
import os

In [2]:

# Load the metadata table that describes every downloaded genome
all_genomes_metadata_df = pd.read_csv(
    '../../../Future_VOC_prediction/SARS-COV-2_Genomic_data/SARS-CoV-2_1063493_metadata.csv'
)
print("Initial number of genomes:", len(all_genomes_metadata_df))

# Keep only the records whose Collection_Date is present and is at least
# 7 characters long (7 is the length of a "YYYY-MM" string, so presumably
# year-only dates are what is being discarded here)
all_genomes_metadata_df = all_genomes_metadata_df.loc[
    lambda frame: frame['Collection_Date'].notna()
    & (frame['Collection_Date'].str.len() >= 7)
]

print("Number of genomes after filtering:", len(all_genomes_metadata_df))

  all_genomes_metadata_df = pd.read_csv('../../../Future_VOC_prediction/SARS-COV-2_Genomic_data/SARS-CoV-2_1063493_metadata.csv')


Initial number of genomes: 1063493
Number of genomes after filtering: 1059141


We now sort the dataframe by the collection dates of the genomes and remove the reference genome from the sampling population, as we are going to add it back later and use it as the outgroup for the tree construction.

In [3]:
# Order the records by their Collection_Date value, then exclude the row
# whose 'Accession' is 'NC_045512.2' (the reference genome) so that it is
# not part of the population that gets sampled
all_genomes_metadata_df = (
    all_genomes_metadata_df
    .sort_values(by='Collection_Date')
    .loc[lambda frame: frame['Accession'].ne('NC_045512.2')]
)

Next, we divide the timeline from the start of the pandemic into intervals of n months, and then we assign these time-period intervals to our records.

In [4]:
#Function to divide the timeline from the start of the pandemic till present into timeframes of n-month intervals and then assign this information to each datapoint
def assign_n_month_timeframe(collection_date):
    """Return the label of the n-month timeframe that contains a date.

    The function is driven by three module-level globals:

    * ``n`` -- number of months per timeframe (read only).
    * ``has_executed`` -- when falsy, the interval list is (re)built and the
      flag is set to ``True``; callers reset it to ``False`` to force a
      rebuild (for example after changing ``n``).
    * ``timeframes`` -- the cached list of ``[start, end]`` Timestamp pairs.

    Intervals start on 2019-12-01, are ``n`` months long (inclusive end is
    the day before the next start) and the last one is clipped to
    2024-05-26.

    Args:
        collection_date: Anything ``pd.Timestamp`` accepts (e.g. the strings
            "2021-03" or "2021-03-17"). Any time-of-day part is ignored.

    Returns:
        A string ``"<start date>--<end date>"`` for the matching interval,
        or ``None`` when the date is missing or outside every interval.

    Raises:
        ValueError: If the intervals must be built and ``n`` is less than 1.
    """
    global n
    global has_executed
    global timeframes

    # Build the interval list only when the caller has reset the flag
    if not has_executed:
        if n < 1:
            # A non-positive step never moves past end_date, so the loop
            # below would not terminate
            raise ValueError(f"n must be a positive number of months, got {n}")

        start_date = pd.Timestamp('2019-12-01')
        end_date = pd.Timestamp("2024-05-26")
        timeframes = []

        # Walk forward from start_date in steps of n months until end_date
        current_date = start_date
        while current_date <= end_date:
            next_date = current_date + pd.DateOffset(months=n)
            # Inclusive end of an interval is the day before the next start
            timeframes.append([current_date, next_date + pd.DateOffset(days=-1)])
            current_date = next_date

        # Clip the last interval so that it does not run past end_date
        if timeframes[-1][-1] > end_date:
            timeframes[-1][-1] = end_date
        has_executed = True

    collection_date = pd.Timestamp(collection_date)
    if pd.isna(collection_date):
        return None
    # Interval bounds are midnight timestamps; dropping the time-of-day keeps
    # a value such as "2020-02-29 12:00" inside the interval ending that day
    collection_date = collection_date.normalize()

    for interval in timeframes:
        if interval[0] <= collection_date <= interval[1]:
            return f"{interval[0].date()}--{interval[1].date()}"
    return None
    
#n=3
#has_executed = False

#Assign time period to the records
#all_genomes_metadata_df[f"{n}_Month_Period"] = all_genomes_metadata_df["Collection_Date"].apply(assign_n_month_timeframe)


We now visualize the number of genomes that were sequenced in each of these timeframes

In [5]:
#genome_counts_per_timeframe = all_genomes_metadata_df.groupby(f'{n}_Month_Period').size()
#print(genome_counts_per_timeframe)
#print("Total Genomes:", genome_counts_per_timeframe.sum())

These distributions are unequal across the different timeframes. Hence, to ensure proper representation and to capture the mutation dynamics of the virus across each of these timeframes, we randomly sampled genome sequences for each of these time periods.

In [6]:
# Function to randomly sample genomes for each n-Month Period
def random_sample_genomes(df, no_of_genomes_per_timeframe, period_col, random_state=None):
    """Randomly sample up to a fixed number of rows from every timeframe.

    Rows are grouped by ``period_col`` and sampled without replacement
    within each group. Rows whose ``period_col`` value is missing do not
    belong to any group and are therefore never sampled.

    Args:
        df: DataFrame holding one row per genome.
        no_of_genomes_per_timeframe: Maximum number of rows to draw from
            each group; groups smaller than this are taken in full.
        period_col: Name of the column that holds the timeframe label.
        random_state: Optional value forwarded to ``DataFrame.sample`` for
            reproducible draws. The default ``None`` keeps the draws
            unseeded. An integer seed is applied to each group separately;
            pass a ``numpy.random.Generator`` or ``RandomState`` instance
            to draw all groups from one shared stream.

    Returns:
        A new DataFrame with the sampled rows of all groups, in group
        order, with a fresh 0..N-1 index. When there are no groups an empty
        DataFrame with the columns of ``df`` is returned.
    """
    sampled_groups = []

    for _, genome_set in df.groupby(period_col):
        # Take the whole group when it is smaller than the requested size
        sample_size = min(len(genome_set), no_of_genomes_per_timeframe)
        sampled_groups.append(
            genome_set.sample(n=sample_size, replace=False, random_state=random_state)
        )

    if not sampled_groups:
        return pd.DataFrame(columns=df.columns)

    # One concat at the end avoids re-copying the accumulated rows on every
    # iteration and keeps the original column dtypes
    return pd.concat(sampled_groups, ignore_index=True)


#no_of_genomes_per_timeframe=500
#sampled_genomes_df = random_sample_genomes(all_genomes_metadata_df, no_of_genomes_per_timeframe, f'{n}_Month_Period')

#genome_counts_per_timeframe = sampled_genomes_df.groupby(f'{n}_Month_Period').size()
#print("Total Genomes sampled:", genome_counts_per_timeframe.sum())
#genome_counts_per_timeframe

We now access the multifasta file which contains the genome sequences of all the genomes in the initial population. We then create a new multifasta file which contains the genome sequences of the genomes that we sampled in the previous step. We also add the reference genome of the Wuhan-Hu-1 strain to this file, as it will be used later in the tree construction process.

In [7]:
#We take the accessions of these samples
#accessions = sampled_genomes_df['Accession'].tolist()

#We set the reference genome as the first in the list
#reference_id="NC_045512.2"
#accessions = [reference_id] + accessions

#Function to create the new multifasta file containing the sampled genome sequences
def filter_fasta(input_file, output_file, sequence_ids):
    """Copy the selected records of a FASTA file into a new FASTA file.

    Every record of ``input_file`` whose ``id`` is contained in
    ``sequence_ids`` is written to ``output_file``. Records are written in
    the order in which they occur in ``input_file``; the order of
    ``sequence_ids`` has no influence on the output order.

    Args:
        input_file: Path of the FASTA file to read.
        output_file: Path of the FASTA file to create (overwritten if it
            already exists, because it is opened in "w" mode).
        sequence_ids: Iterable of record ids to keep.
    """
    # A set gives constant-time membership tests; the input file can hold a
    # very large number of records and each one is checked once
    wanted_ids = set(sequence_ids)

    with open(input_file, "r") as input_handle, open(output_file, "w") as output_handle:
        selected_records = (
            record
            for record in SeqIO.parse(input_handle, "fasta")
            if record.id in wanted_ids
        )
        # SeqIO.write consumes the generator lazily, so only one record is
        # held in memory at a time
        SeqIO.write(selected_records, output_handle, "fasta")

#input_file = "../../../Future_VOC_prediction/SARS-COV-2_Genomic_data/SARS-CoV-2_1063493_genomes.fasta"
#output_file = f"../SARS-CoV-2_{len(accessions)-1}+1.fasta"
#filter_fasta(input_file, output_file, accessions)

Finally, we create and save the metadata file containing information about these sampled genomes

In [8]:
#dataframe containing information regarding the sampled genomes
#sampled_genome_metadata_df = all_genomes_metadata_df[all_genomes_metadata_df['Accession'].isin(accessions)]

# Save this DataFrame as a CSV file
#sampled_genome_metadata_df.to_csv(f"../SARS-CoV-2_{len(accessions)-1}+1_metadata.csv", index=False)


In [9]:

# Number of independent random samples (replicates) to generate
no_of_samples=30
# Number of months per timeframe
n=3
# Number of genomes to select per timeframe
no_of_genomes_per_timeframe=200
# Multifasta file containing all the genomes of SARS-CoV-2
input_file = "../../../Future_VOC_prediction/SARS-COV-2_Genomic_data/SARS-CoV-2_1063493_genomes.fasta"
# Accession of the reference genome that is added to every sample
reference_id="NC_045512.2"

# The timeframe of a record depends only on its collection date and on n,
# so it is assigned once here instead of once per sample.
period_col = f"{n}_Month_Period"
# Resetting the flag makes assign_n_month_timeframe rebuild its intervals
# for the value of n chosen above
has_executed = False
all_genomes_metadata_df[period_col] = all_genomes_metadata_df["Collection_Date"].apply(assign_n_month_timeframe)

for sample_no in range(1, no_of_samples+1):
    print(f"Sample {sample_no}")

    # Draw a fresh random sample for this replicate
    sampled_genomes_df = random_sample_genomes(all_genomes_metadata_df, no_of_genomes_per_timeframe, period_col)

    genome_counts_per_timeframe = sampled_genomes_df.groupby(period_col).size()
    print("Total Genomes sampled:", genome_counts_per_timeframe.sum())

    # Create the output directory (and any missing parents); exist_ok lets
    # the cell be re-run without failing on directories from a previous run
    sample_dir = f"../Samples/Sample_{sample_no}"
    os.makedirs(sample_dir, exist_ok=True)

    # Accessions to extract: the reference genome plus the sampled genomes
    accessions = [reference_id] + sampled_genomes_df['Accession'].tolist()

    # File names carry "<number of sampled genomes>+1", the +1 being the reference
    output_file = f"{sample_dir}/SARS-CoV-2_Sample_{sample_no}_{len(accessions)-1}+1.fasta"
    filter_fasta(input_file, output_file, accessions)

    # Metadata rows of the selected accessions, in the order of
    # all_genomes_metadata_df (the reference genome only appears here if
    # that frame still contains a row for it)
    sampled_genome_metadata_df = all_genomes_metadata_df[all_genomes_metadata_df['Accession'].isin(accessions)]

    # Save this DataFrame as a CSV file
    sampled_genome_metadata_df.to_csv(f"{sample_dir}/SARS-CoV-2_Sample_{sample_no}_{len(accessions)-1}+1_metadata.csv", index=False)







Sample 1
Total Genomes sampled: 3600
Sample 2
Total Genomes sampled: 3600
Sample 3
Total Genomes sampled: 3600
Sample 4
Total Genomes sampled: 3600
Sample 5
Total Genomes sampled: 3600
Sample 6
Total Genomes sampled: 3600
Sample 7
Total Genomes sampled: 3600
Sample 8
Total Genomes sampled: 3600
Sample 9
Total Genomes sampled: 3600
Sample 10
Total Genomes sampled: 3600
Sample 11
Total Genomes sampled: 3600
Sample 12
Total Genomes sampled: 3600
Sample 13
Total Genomes sampled: 3600
Sample 14
Total Genomes sampled: 3600
Sample 15
Total Genomes sampled: 3600
Sample 16
Total Genomes sampled: 3600
Sample 17
Total Genomes sampled: 3600
Sample 18
Total Genomes sampled: 3600
Sample 19
Total Genomes sampled: 3600
Sample 20
Total Genomes sampled: 3600
Sample 21
Total Genomes sampled: 3600
Sample 22
Total Genomes sampled: 3600
Sample 23
Total Genomes sampled: 3600
Sample 24
Total Genomes sampled: 3600
Sample 25
Total Genomes sampled: 3600
Sample 26
Total Genomes sampled: 3600
Sample 27
Total Genom