In [1]:
import polars as pl
import numpy as np
from methods import Config, RecombAnalysis

# Data required for analysis

Fill in the fields of the `config.yaml` file with the name of each of the following files:

- RIVET results file (.txt)
- RIVET vcf file (.vcf)
- Chronumental results file (.tsv)
- PyR0 mutations file (.tsv)
- Case counts file (.csv)
- Standing genetic diversity results file (.csv)



In [2]:
# Relative path to the config.yaml file in the repository's top-level directory.
# Adjust it if the notebook is launched from a different working directory.
CONFIG_FILEPATH = "../config.yaml"
# Build the Config object from that file; it is passed to RecombAnalysis in the next cell.
# NOTE(review): presumably Config resolves the data file names listed in config.yaml — confirm in methods.Config.
config = Config(CONFIG_FILEPATH)

In [3]:
# Load the datasets referenced by the config and create the analysis object that every later cell uses.
analysis = RecombAnalysis(config)

Loading all datasets for analysis
Loading standing genetic diversity scores from file:  /Users/kyle/research/recomb-experiments/data/genetic-diversity-gisaidAndPublic.2023-12-25.csv
Loading PyR0 amino acid mutation ranking scores from file:  /Users/kyle/research/recomb-experiments/data/mutations.tsv
Loading case count data from file:  /Users/kyle/research/recomb-experiments/data/time_series_covid19_confirmed_global.csv
Loading Chronumental object from disk cache:  /Users/kyle/research/recomb-experiments/data/chronumental_dates_gisaidAndPublic.2023-12-25-STEPS2000-SERIAL3.metadata.tsv.tsv.pkl
Chronumental dates loaded.
Data loaded, analysis ready. Elapsed time: 10.7987 seconds


In [4]:
# Per-month table produced by the analysis; the bare `df` on the last line renders it as the cell output.
df = analysis.toDataframe()
df

Month,Count,Infections,NumRecombsDetectedByMonth
str,f64,i64,i64
"""2021-09""",128.393,16061455,117
"""2022-05""",94.8531,16213566,63
"""2021-07""",123.092,15676958,111
"""2022-06""",100.291,17732748,81
"""2022-11""",116.172,12388536,46
…,…,…,…
"""2023-01""",119.41,10270797,28
"""2020-07""",75.5938,7118656,11
"""2020-05""",63.8522,2901229,11
"""2020-06""",70.4285,4292072,11


# Figure 1c: Correlation Matrix

In [5]:
# Pairwise correlation matrix of the three numeric monthly columns
# (pandas' DataFrame.corr with its default settings).
selected_columns = ["Count", "Infections", "NumRecombsDetectedByMonth"]
df.select(selected_columns).to_pandas().corr()

Unnamed: 0,Count,Infections,NumRecombsDetectedByMonth
Count,1.0,0.124601,0.779849
Infections,0.124601,1.0,0.306916
NumRecombsDetectedByMonth,0.779849,0.306916,1.0


In [6]:
# Pairwise correlations computed directly in polars, one value per printed line,
# in the order the pairs are listed here.
for first_col, second_col in (
    ("Count", "NumRecombsDetectedByMonth"),
    ("Infections", "NumRecombsDetectedByMonth"),
    ("Infections", "Count"),
):
    print(df.select(pl.corr(first_col, second_col)).item())

0.7798487013090408
0.3069163582877874
0.12460145765167177


# Comparing the mean and standard deviation of substitution mutations vs recombination

In [11]:
# getSubstitutionStats() returns an object with .values() (presumably a dict).
# The unpacking requires exactly two values and relies on them arriving as
# (mean, standard deviation), in that order.
# NOTE(review): confirm the ordering in methods.RecombAnalysis, or look the values up by key instead.
substitution_mean, substitution_std_dev = analysis.getSubstitutionStats().values()
print("Mean fitness for substitution mutations found in MAT: ", substitution_mean)
print("Standard deviation of fitness for substitution mutations found in MAT: ", substitution_std_dev)

Mean fitness for substitution mutations found in MAT:  1.009351393981603
Standard deviation of fitness for substitution mutations found in MAT:  0.02727305822114323


# Figure 3 Analysis

In [1]:
def low_fitness_recombs_stats(df):
    """
    Report how often two below-median parents yield a below-median recombinant.

    Every row is compared against the median fitness of its month. A row counts
    as "both parents low" when its DonorFitness and AcceptorFitness are both
    strictly below that median; among those rows the recombinant is "low" when
    its Score is also strictly below the median. The two counts and the two
    complementary percentages are printed; nothing is returned.

    The median used for a month is the "Median" value of the first row of that
    month in the frame, converted to float.

    Args:
        df: Frame exposing ``iter_rows(named=True)`` (e.g. a polars DataFrame)
            whose rows carry "Month", "Median", "DonorFitness",
            "AcceptorFitness" and "Score".
    """
    # Cache the per-month median on first sight instead of re-filtering the
    # whole frame for every row.
    median_by_month = {}
    both_parents_low_fitness = 0
    low_fitness_recomb = 0

    for row in df.iter_rows(named=True):
        month = row["Month"]
        if month not in median_by_month:
            median_by_month[month] = float(row["Median"])
        median_fitness = median_by_month[month]

        # Only rows where both parents are below the circulating median matter.
        parents_below_median = (
            row["DonorFitness"] < median_fitness
            and row["AcceptorFitness"] < median_fitness
        )
        if not parents_below_median:
            continue
        both_parents_low_fitness += 1

        # Is the recombinant itself also below the median fitness?
        if row["Score"] < median_fitness:
            low_fitness_recomb += 1

    print(
        "Total number of recombs with both parents less fit than median fitness: ",
        both_parents_low_fitness,
    )
    print(
        "Total number of recombs with both parents less fit than median fitness, and recombinant also below median fitness: ",
        low_fitness_recomb,
    )

    # Without any qualifying row the percentages would divide by zero.
    if both_parents_low_fitness == 0:
        print("No recombinants have both parents below the median fitness; percentages are undefined.")
        return

    low_percent = (low_fitness_recomb / both_parents_low_fitness) * 100
    print("Percentage of recombinants where both parents are less fit than median fitness, and recombinant also below median fitness: ", low_percent)
    print("Percentage of recombinants where both parents are less fit than median fitness, and recombinant is at or above median fitness: ", 100 - low_percent)


In [2]:
# Uses the `analysis` object created near the top of the notebook; on a fresh
# kernel, run those setup cells first or this call fails with NameError.
low_fitness_recombs_stats(analysis.getRecombData())

NameError: name 'analysis' is not defined

In [26]:
def quadrant_percentages(df):
    """
    Print the share of recombinants in each fitness/divergence quadrant.

    Rows are split along two axes:
      * fitness: "RecombFitnessNormalizedByMaxParents" above 1.0 (high) versus
        at or below 1.0 (low);
      * divergence: "ParentsHD" above the frame's mean "ParentsHD" (high)
        versus at or below it (low).

    Additionally prints the share of rows whose normalized fitness is above 1.1
    and whose parental divergence is above the mean. All percentages are
    relative to the total number of rows. Nothing is returned.

    Args:
        df: polars DataFrame containing the two columns named above.

    Raises:
        AssertionError: if the four quadrant counts do not add up to the number
            of rows (presumably possible when either column holds nulls, since
            such rows would match none of the comparisons).
    """
    FITNESS_COL = "RecombFitnessNormalizedByMaxParents"
    DIVERGENCE_COL = "ParentsHD"
    NEUTRAL_FITNESS = 1.0
    VERY_HIGH_FITNESS = 1.1

    TOTAL = len(df)
    # An empty frame has no mean divergence and would divide by zero below.
    if TOTAL == 0:
        print("No recombinants provided; quadrant percentages are undefined.")
        return

    # Mean parental divergence across all recombinants: the horizontal split.
    avg_parental_divergence = df.select(pl.col(DIVERGENCE_COL).mean()).item()

    fitness = pl.col(FITNESS_COL)
    divergence = pl.col(DIVERGENCE_COL)
    high_fitness = fitness > NEUTRAL_FITNESS
    low_fitness = fitness <= NEUTRAL_FITNESS
    high_divergence = divergence > avg_parental_divergence
    low_divergence = divergence <= avg_parental_divergence

    def count_rows(predicate):
        """Number of rows of df matching the polars predicate."""
        return len(df.filter(predicate))

    # Insertion order is the print order.
    quadrant_counts = {
        "Quadrant 1 (High diversity, high fitness)": count_rows(high_fitness & high_divergence),
        "Quadrant 2 (High diversity, low fitness)": count_rows(low_fitness & high_divergence),
        "Quadrant 3 (Low diversity, low fitness)": count_rows(low_fitness & low_divergence),
        "Quadrant 4 (Low diversity, high fitness)": count_rows(high_fitness & low_divergence),
    }
    assert sum(quadrant_counts.values()) == TOTAL, (
        "Quadrant counts do not cover every row; check both columns for null values"
    )

    for label, count in quadrant_counts.items():
        print(f"Percentage for {label}: ", (count / TOTAL) * 100)

    # Above 1.1 normalized fitness and greater than average parental divergence
    very_rare = count_rows((fitness > VERY_HIGH_FITNESS) & high_divergence)
    very_rare_percent = (very_rare / TOTAL) * 100
    print("Percentage of recombinants that are above 1.1 normalized fitness (by max parent) and have a greater than average parental divergence: ", very_rare_percent)

In [25]:
# Quadrant breakdown over the data returned by analysis.getRecombData().
quadrant_percentages(analysis.getRecombData())

Percentage for Quadrant 1 (High diversity, high fitness):  6.134157105030892
Percentage for Quadrant 2 (High diversity, low fitness):  28.067078552515447
Percentage for Quadrant 3 (Low diversity, low fitness):  52.162400706090025
Percentage for Quadrant 4 (Low diversity, high fitness):  13.636363636363635
Percentage of recombinants that are above 1.1 normalized fitness (by max parent) and have a greater than average parental divergence:  0.7943512797881729


In [28]:
def neutral_and_divergent(df, fitness_low=0.98, fitness_high=1.02):
    """
    Print the share of recombinants that are near-neutral yet divergent.

    A row qualifies when "RecombFitnessNormalizedByMaxParents" lies between
    ``fitness_low`` and ``fitness_high`` (polars ``is_between`` with its default
    bounds handling) and "ParentsHD" is strictly above the frame's mean
    "ParentsHD". The percentage is relative to the total number of rows.
    Nothing is returned.

    Args:
        df: polars DataFrame containing the two columns named above.
        fitness_low: Lower edge of the near-neutral fitness band.
        fitness_high: Upper edge of the near-neutral fitness band.
    """
    FITNESS_COL = "RecombFitnessNormalizedByMaxParents"
    DIVERGENCE_COL = "ParentsHD"

    TOTAL = len(df)
    # An empty frame has no mean divergence and would divide by zero below.
    if TOTAL == 0:
        print("No recombinants provided; percentage is undefined.")
        return

    avg_parental_divergence = df.select(pl.col(DIVERGENCE_COL).mean()).item()
    near_neutral = pl.col(FITNESS_COL).is_between(fitness_low, fitness_high)
    divergent = pl.col(DIVERGENCE_COL) > avg_parental_divergence
    results = len(df.filter(near_neutral & divergent))
    print("Percentage of recombinants that are near neutral and divergent cases: ", (results / TOTAL) * 100)

In [29]:
# Near-neutral, high-divergence share over the data returned by analysis.getRecombData().
neutral_and_divergent(analysis.getRecombData())

Percentage of recombinants that are near neutral and divergent cases:  12.400706090026478


# Figure 3d Analysis

In [16]:
def calculate_percentages(data):
    """
    Print the percentage of values below 0, above 1, and within [0, 1].

    Every element of ``data`` is counted, whatever its shape, and each
    percentage is rounded to two decimals before printing. Nothing is returned.

    Args:
        data: Array-like of numeric values (NumPy array of any shape, list, ...).

    Raises:
        AssertionError: if some element falls in none of the three ranges,
            which is the case for NaN.
    """
    values = np.asarray(data)
    # Count elements, not rows, so the total matches the element-wise sums
    # below for multi-column input as well.
    total = values.size
    if total == 0:
        print("No data provided; percentages are undefined.")
        return

    between_0_and_1 = np.sum((values >= 0) & (values <= 1))
    below_0 = np.sum(values < 0)
    above_1 = np.sum(values > 1)
    assert total == (between_0_and_1 + below_0 + above_1), (
        "Some values fall outside all three ranges; check the data for NaN"
    )

    print("Percentage of data below 0: ", round((below_0 / total) * 100, 2))
    print("Percentage of data above 1: ", round((above_1 / total) * 100, 2))
    print("Percentage of data between 0 and 1 (both inclusive): ", round((between_0_and_1 / total) * 100, 2))


In [17]:
# Pull the NormFitness column out as a NumPy array and report how its values
# are distributed relative to the [0, 1] range.
# NOTE(review): assumes getNormFitness() returns a polars DataFrame — confirm in methods.RecombAnalysis.
norm_fitness_data = analysis.getNormFitness().select('NormFitness').to_numpy()
calculate_percentages(norm_fitness_data)

Percentage of data below 0:  15.36
Percentage of data above 1:  19.77
Percentage of data between 0 and 1 (both inclusive):  64.87
