# Genome Sampling
#### This script generates 30 genome sets for SARS-CoV-2 mutation analysis through random sampling.


##### Import the necessary libraries

In [None]:
import pandas as pd
from Bio import SeqIO
import os
import numpy as np

##### The sequences.csv file contains metadata records for all the complete SARS-CoV-2 genomes downloaded from NCBI. We start by filtering out the records that lack a valid collection date.


In [None]:
# Load the metadata table describing every downloaded genome
all_genomes_metadata_df = pd.read_csv('../Sequence_Data/sequences.csv')
print("Initial number of genomes:", len(all_genomes_metadata_df))

# Keep only records whose collection date is present and at least 7 characters long
raw_collection_dates = all_genomes_metadata_df['Collection_Date']
has_usable_date = raw_collection_dates.notna() & (raw_collection_dates.str.len() >= 7)
all_genomes_metadata_df = all_genomes_metadata_df[has_usable_date]

print("Number of genomes after filtering:", len(all_genomes_metadata_df))

In [None]:
# Display the filtered metadata table as the output of this notebook cell
all_genomes_metadata_df

In [None]:
# Report the lexicographically smallest and largest collection-date strings
collection_date_strings = all_genomes_metadata_df["Collection_Date"]
earliest_collection_date = min(collection_date_strings)
print("The earliest collection date:" + earliest_collection_date)
latest_collection_date = max(collection_date_strings)
print("The most recent collection date:" + latest_collection_date)

##### We now sort the dataframe by the collection dates of the genomes and remove the reference genome from the sampling population, as we are going to add it explicitly later and use it as the outgroup for the tree construction.

In [None]:
# Order the records chronologically by their collection date
all_genomes_metadata_df = all_genomes_metadata_df.sort_values(by='Collection_Date')

# Exclude the reference genome ('NC_045512.2') from the sampling population
is_not_reference = all_genomes_metadata_df['Accession'].ne('NC_045512.2')
all_genomes_metadata_df = all_genomes_metadata_df[is_not_reference]

##### Next, we divide the time span from the start of the pandemic to the present (2025-02-28) into 21 timeframes of 3 months each and then assign these timeframes to our records.

In [None]:
# Function to divide the time span from the start of the pandemic till present into timeframes of n-months interval and then assign this information to each datapoint
def assign_n_month_timeframe(collection_date):
    """
    Returns the n_months-timeframe corresponding to a collection date in the metadata dataframe.

    The list of timeframes is built on the first call from the module-level
    ``start_date``, ``end_date`` and ``n_months`` values and stored in the
    module-level ``timeframes`` list. ``has_executed`` records that the list
    has been built; set it back to False to force a rebuild after changing
    any of those settings.

    Parameters:
        - collection_date (str) : The collection date of the genome sequence.

    Returns:
        - str: the corresponding n_months-timeframe within which the collection date falls,
          formatted as "<first day>--<last day>".
        - None: if the collection date does not fall inside any timeframe.
    """

    global n_months, end_date, has_executed, timeframes, start_date

    # To initialize the timeframes list (only once)
    if not has_executed:

        # Initialize a list to store the timeframes as [first day, last day] pairs
        timeframes = []

        # Start from the start_date and increment by n-months until passing the end_date
        current_date = start_date
        while current_date <= end_date:
            next_date = current_date + pd.DateOffset(months=n_months)
            # The last day of a timeframe is the day before the next one starts
            timeframes.append([current_date, next_date + pd.DateOffset(days=-1)])
            current_date = next_date

        has_executed = True

    collection_date = pd.Timestamp(collection_date)

    # Each interval's upper bound is midnight of its last day, so compare the
    # date part against it; otherwise a value carrying a time-of-day on that
    # last day would fall between two consecutive timeframes.
    collection_day = collection_date.normalize()

    # Assign the time frame
    for interval in timeframes:
        if interval[0] <= collection_date and collection_day <= interval[1]:
            return f"{interval[0].date()}--{interval[1].date()}"

    # The collection date lies outside every timeframe
    return None


# Sampling window: first day of the first timeframe and the last date to cover
start_date = pd.Timestamp('2019-12-01')
end_date = pd.Timestamp('2025-02-28')

# Length of each timeframe in months
n_months = 3

# Flag read by assign_n_month_timeframe; False means the timeframe list has not been built yet
has_executed = False

# Label every record with the timeframe its collection date falls in
collection_dates = all_genomes_metadata_df["Collection_Date"]
all_genomes_metadata_df[f"{n_months}_Month_Period"] = collection_dates.apply(assign_n_month_timeframe)

# Persist the filtered and labelled metadata for later use
all_genomes_metadata_df.to_csv('../SARS-CoV-2_genomes_metadata_filtered.csv', index=False)

##### We now visualize the number of genomes that were sequenced in each of these timeframes

In [None]:
# Count how many genomes fall into each timeframe and report the overall total
records_by_timeframe = all_genomes_metadata_df.groupby(f'{n_months}_Month_Period')
genome_counts_per_timeframe = records_by_timeframe.size()
print(genome_counts_per_timeframe)
print("Total Genomes:", genome_counts_per_timeframe.sum())

##### Finally, we create and save 30 sample genomesets, each containing randomly sampled genomes from each of the 21 timeframes along with the associated metadata.

In [None]:
# Function to randomly sample genomes for each n-Month Period
def random_sample_genomes(df, no_of_genomes_per_timeframe, timeframe_col, seed_value):
    """
    Create a sample genome set by randomly sampling a fixed number of genomes within each timeframe.

    Rows are grouped by ``timeframe_col`` (rows whose timeframe is missing are
    not part of any group and are therefore never sampled). From every group,
    up to ``no_of_genomes_per_timeframe`` rows are drawn without replacement;
    a group smaller than that is taken in full.

    Parameters:
        - df (pandas dataframe): The dataframe containing information about all the genomes from which to sample.
        - no_of_genomes_per_timeframe (int): Number of genomes to sample from each timeframe.
        - timeframe_col (str): The column in the dataframe containing the assigned timeframes.
        - seed_value (int): Seed passed to ``DataFrame.sample`` as ``random_state`` for reproducible sampling.
          The same seed is used for every timeframe.
    Returns:
        - timewise_random_genomes_df (pandas dataframe): The sampled rows of all timeframes, in group order,
          with a fresh 0..n-1 index. Empty (but with the columns of ``df``) when there is nothing to sample.
    """
    # Collect the per-timeframe samples and concatenate once at the end:
    # growing a DataFrame inside the loop re-copies all earlier rows on every
    # iteration, and starting from an empty frame relies on pandas' deprecated
    # handling of empty entries when determining the result dtypes.
    sampled_groups = []

    # Group the data by the n-month period column
    for _, genome_set in df.groupby(timeframe_col):
        # If the number of genomes in the group is less than the required sample size, take all genomes
        sample_size = min(len(genome_set), no_of_genomes_per_timeframe)

        # Randomly sample genomes from the group
        sampled_groups.append(
            genome_set.sample(n=sample_size, replace=False, random_state=seed_value)
        )

    # pd.concat rejects an empty list, so return an empty frame with the same columns
    if not sampled_groups:
        return pd.DataFrame(columns=df.columns)

    timewise_random_genomes_df = pd.concat(sampled_groups, ignore_index=True)

    return timewise_random_genomes_df


In [None]:
# Function to create the new multifasta file containing the sampled genome sequences
def filter_fasta(input_file, output_file, sequence_ids):
    """
    Writes a new multi-fasta file containing only the desired sequences of the original multi-fasta file.

    The input file is read record by record, and every record whose ID is in
    ``sequence_ids`` is written to the output file. Records are written in the
    order in which they appear in ``input_file``, not in the order of
    ``sequence_ids``. IDs that do not occur in the input file are silently ignored.

    Parameters:
        - input_file (str): Path to the initial multi-fasta file to subset from.
        - output_file (str): Path to the new multi-fasta file (overwritten if it exists).
        - sequence_ids (list): the IDs of the sequences to keep.

    """
    # Build the lookup set once: membership is tested for every record in the
    # input file, and a list would be scanned linearly each time.
    wanted_ids = set(sequence_ids)

    # Open the input FASTA file for reading and the output FASTA file for writing
    with open(input_file, "r") as input_handle, open(output_file, "w") as output_handle:
        # Lazily select the matching records so only one record is held in memory at a time
        selected_records = (
            record
            for record in SeqIO.parse(input_handle, "fasta")
            if record.id in wanted_ids
        )
        # Write the selected sequences to the output FASTA file
        SeqIO.write(selected_records, output_handle, "fasta")



In [None]:
# Create a directory to store the samples
os.makedirs("../Samples", exist_ok=True)

# Number of sample genome sets to generate
no_of_samples=30

# Number of genomes to sample per timeframe
no_of_genomes_per_timeframe=250

# Define sampling seed values for reproducibility
min_seed = 0
max_seed = 2**32 - 1  # 32 bit
# Generate evenly spaced seed values, one per sample genome set
sampling_seed_values = np.linspace(min_seed, max_seed, no_of_samples, dtype=np.uint32).tolist()

# Multifasta file containing all the genomes of SARS-CoV-2
input_file = "../Sequence_Data/sequences.fasta"

# Accession of the reference genome, which is added to every sample genome set
reference_id="NC_045512.2"

# We generate the genome sets through random sampling
for i, seed in enumerate(sampling_seed_values):
    # Set the sample number (1-based)
    sample_no = i+1

    # Create a new sampled dataframe by sampling the dataframe containing information on all the genomes
    sampled_genomes_df = random_sample_genomes(all_genomes_metadata_df, no_of_genomes_per_timeframe, f'{n_months}_Month_Period', seed_value=seed)

    # Create the sample directory; exist_ok=True lets the cell be re-run without
    # failing on directories left over from a previous run
    os.makedirs(f"../Samples/Sample_{sample_no}", exist_ok=True)

    # Extract the sequence ids of the genomes in this sampled dataframe
    accessions = sampled_genomes_df['Accession'].tolist()

    # Prepend the reference genome so that it is included in the multifasta.
    # NOTE(review): filter_fasta writes records in the order they occur in the
    # input file, so the reference is the topmost sequence only if it comes
    # first in sequences.fasta - confirm.
    accessions = [reference_id] + accessions

    # Common path prefix of both output files: "<sampled genomes>+1" counts the reference genome separately
    output_prefix = f"../Samples/Sample_{sample_no}/SARS-CoV-2_Sample_{sample_no}_{len(accessions)-1}+1"

    # Generate the path to the output file
    output_file = f"{output_prefix}.fasta"

    # Write the multifasta file containing the sampled genomes
    filter_fasta(input_file, output_file, accessions)

    # Write the dataframe containing information about this sample to a csv file
    # (the reference genome is not part of this metadata table)
    sampled_genomes_df.to_csv(f"{output_prefix}_metadata.csv", index=False)

    # Print completion message
    print(f"Sample Genome set {sample_no} Generated Successfully")