In [2]:
import polars as pl
import numpy as np
from methods import Config, RecombAnalysis

# Data required for analysis

The notebook loads the following three data files, included in the `data` directory:
- `rivet_recombs_data.csv`: The file containing all the information and statistics for the detected recombinants included in this analysis.
- `substitutions_scores.csv`: A file containing the PyR0 fitness scores for all ranked substitutions considered by PyR0 found in the "2023-12-25" private MAT.
- `monthly_fitness_stats.csv`: A file containing some basic statistics for the circulating fitness of all SARS-CoV-2 samples found in the MAT for each month.

# Load configuration file and data files required for analysis

In [3]:
# Path to the config.yaml file, expected one directory above this notebook
# (the top-level directory); adjust if the notebook is run from elsewhere.
CONFIG_FILEPATH = "../config.yaml"
# Build the configuration from the YAML file (it names the data files), then
# create the analysis object from it. Running this cell loads the datasets,
# as reported in the cell output.
config = Config(CONFIG_FILEPATH)
analysis = RecombAnalysis(config)

Loading all datasets for analysis
Data loaded, analysis ready. Elapsed time: 2.1022 seconds


# Figure 1: Relationship between detectable recombination, standing genetic diversity and the number of infections

In [4]:
# Per-month table with the columns Month, DiversityScore, Infections and
# NumRecombsDetectedByMonth (see the printed schema). `df` is reused by the
# correlation cells for Figure 1c.
df = analysis.getEpidemiologicalFactors()
print(df)

shape: (37, 4)
┌─────────┬────────────────┬────────────┬───────────────────────────┐
│ Month   ┆ DiversityScore ┆ Infections ┆ NumRecombsDetectedByMonth │
│ ---     ┆ ---            ┆ ---        ┆ ---                       │
│ str     ┆ f64            ┆ i64        ┆ i64                       │
╞═════════╪════════════════╪════════════╪═══════════════════════════╡
│ 2022-09 ┆ 102.288        ┆ 14721332   ┆ 75                        │
│ 2021-10 ┆ 133.259        ┆ 13064658   ┆ 94                        │
│ 2021-12 ┆ 144.167        ┆ 25614610   ┆ 61                        │
│ 2020-04 ┆ 59.1907        ┆ 2412716    ┆ 4                         │
│ 2022-05 ┆ 94.8531        ┆ 16213566   ┆ 63                        │
│ …       ┆ …              ┆ …          ┆ …                         │
│ 2021-05 ┆ 140.826        ┆ 19347136   ┆ 83                        │
│ 2021-02 ┆ 142.19         ┆ 11253786   ┆ 96                        │
│ 2022-10 ┆ 110.021        ┆ 12795215   ┆ 64                        │
│ 202

### Figure 1c: Correlation Matrix

In [5]:
# Pairwise correlation matrix of the three Figure 1 variables, computed by
# converting the polars frame to pandas and calling DataFrame.corr() with its
# default settings. The Month column is excluded.
selected_columns = ['DiversityScore', 'Infections', 'NumRecombsDetectedByMonth']
df.to_pandas()[selected_columns].corr()

Unnamed: 0,DiversityScore,Infections,NumRecombsDetectedByMonth
DiversityScore,1.0,0.124601,0.779849
Infections,0.124601,1.0,0.306916
NumRecombsDetectedByMonth,0.779849,0.306916,1.0


In [6]:
# The same three pairwise correlations computed directly with polars
# (pl.corr with default settings), printed at full precision; the values agree
# with the pandas correlation matrix.
print(df.select(pl.corr("DiversityScore", "NumRecombsDetectedByMonth")).item())
print(df.select(pl.corr("Infections", "NumRecombsDetectedByMonth")).item())
print(df.select(pl.corr("Infections", "DiversityScore")).item())

0.7798487013090409
0.3069163582877874
0.12460145765167177


# Figure 2: Fitness of Detectable Recombinants and Circulating Sample Fitness

In [7]:
# TODO: add the Figure 2 analysis (fitness of detectable recombinants vs. circulating sample fitness).

# Figure 3a Analysis: Comparing the mean and standard deviation of fitness for substitution mutations vs. recombinants

In [8]:
# Unpack the two statistics from the values of the returned mapping.
# NOTE(review): this relies on .values() yielding the mean first and the
# standard deviation second — confirm against getSubstitutionFitnessStats.
substitution_mean, substitution_std_dev = analysis.getSubstitutionFitnessStats().values()
print("Mean fitness for substitution mutations found in MAT: ", substitution_mean)
print("Standard deviation of fitness for substitution mutations found in MAT: ", substitution_std_dev)

Mean fitness for substitution mutations found in MAT:  1.009351393981603
Standard deviation of fitness for substitution mutations found in MAT:  0.02727305822114323


In [9]:
# Unpack the two statistics from the values of the returned mapping.
# NOTE(review): this relies on .values() yielding the mean first and the
# standard deviation second — confirm against getRecombinationFitnessStats.
recombination_mean, recombination_std_dev = analysis.getRecombinationFitnessStats().values()
print("Mean fitness for recombinants found in MAT: ", recombination_mean)
print("Standard deviation of recombinants found in MAT: ", recombination_std_dev)

Mean fitness for recombinants found in MAT:  0.9370849044625659
Standard deviation of recombinants found in MAT:  0.13788125059309864


# Figure 3b Analysis: Recombinant fitness normalized by the fitter parent, compared to parental divergence

In [10]:
def low_fitness_recombs_stats(df):
    """
    Report how often two low-fitness parents produce a low-fitness recombinant.

    A recombinant is counted as having "low-fitness parents" when both its
    donor and acceptor fitness are strictly below the median fitness recorded
    for its month. Among those, the recombinants whose own fitness ("Score")
    is also strictly below that median are counted separately. Both counts
    and the two complementary percentages are printed.

    Parameters
    ----------
    df : polars.DataFrame
        Frame providing the columns "Month", "Median", "DonorFitness",
        "AcceptorFitness" and "Score". Only ``df.iter_rows(named=True)`` is
        used, so any object offering that method works.

    Returns
    -------
    None
        Results are printed. If no recombinant has both parents below the
        median, the counts are printed followed by a notice, since the
        percentages are undefined in that case.
    """
    both_parents_low_fitness = 0
    low_fitness_recomb = 0
    # Median fitness per month, taken from the first row seen for that month.
    # Caching it here avoids re-filtering the whole frame for every row.
    median_by_month = {}
    for row in df.iter_rows(named=True):
        month = row["Month"]
        if month not in median_by_month:
            median_by_month[month] = float(row["Median"])
        median_fitness = median_by_month[month]

        donor_fitness = row["DonorFitness"]
        acceptor_fitness = row["AcceptorFitness"]
        recomb_fitness = row["Score"]
        # Both parents are below the circulating median fitness
        if (donor_fitness < median_fitness) and (acceptor_fitness < median_fitness):
            both_parents_low_fitness += 1
            # If the recombinant is also below the median fitness
            if recomb_fitness < median_fitness:
                low_fitness_recomb += 1

    print(
        "Total number of recombs with both parents less fit than median fitness: ",
        both_parents_low_fitness,
    )
    print(
        "Total number of recombs with both parents less fit than median fitness, and recombinant also below median fitness: ",
        low_fitness_recomb,
    )
    if both_parents_low_fitness == 0:
        # Nothing to normalize by; avoid a ZeroDivisionError.
        print("No recombinants with both parents less fit than median fitness; percentages are undefined.")
        return

    low_fitness_percent = (low_fitness_recomb / both_parents_low_fitness) * 100
    print("Percentage of recombinants where both parents are less fit than median fitness, and recombinant also below median fitness: ", low_fitness_percent)
    print("Percentage of recombinants where both parents are less fit than median fitness, and recombinant is at or above median fitness: ", 100 - low_fitness_percent)


### Comparing recombinant and parental fitness with median circulating fitness at time of emergence

In [11]:
# Run the parent/recombinant vs. monthly median comparison on the frame
# returned by analysis.getRecombData(); results are printed.
low_fitness_recombs_stats(analysis.getRecombData())

Total number of recombs with both parents less fit than median fitness:  321
Total number of recombs with both parents less fit than median fitness, and recombinant also below median fitness:  301
Percentage of recombinants where both parents are less fit than median fitness, and recombinant also below median fitness:  93.76947040498442
Percentage of recombinants where both parents are less fit than median fitness, and recombinant is at or above median fitness:  6.230529595015582


In [12]:
def quadrant_percentages(df):
    """
    Print the percentage of recombinants in each fitness/divergence quadrant.

    Rows are split along two axes:

    - "RecombFitnessNormalizedByMaxParents" against a neutral value of 1.0
      (strictly above 1.0 counts as high fitness);
    - "ParentsHD" against its mean over ``df`` (strictly above the mean
      counts as high).

    A fifth percentage is printed for rows whose normalized fitness is above
    1.1 and whose "ParentsHD" is above the mean.

    Parameters
    ----------
    df : polars.DataFrame
        Frame containing the two columns named above.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    AssertionError
        If the four quadrant counts do not add up to ``len(df)``.
    """
    fitness = pl.col("RecombFitnessNormalizedByMaxParents")
    divergence = pl.col("ParentsHD")

    n_rows = len(df)
    # Mean parental divergence is the vertical split between the quadrants.
    mean_divergence = df.select(divergence.mean()).head(1).item()
    neutral = float(1.0)

    def count_where(predicate):
        # Number of rows of df satisfying the given polars expression.
        return len(df.filter(predicate))

    high_fitness = fitness > neutral
    low_fitness = fitness <= neutral
    high_divergence = divergence > mean_divergence
    low_divergence = divergence <= mean_divergence

    labelled_counts = [
        (
            "Percentage for Quadrant 1 (High diversity, high fitness): ",
            count_where(high_fitness & high_divergence),
        ),
        (
            "Percentage for Quadrant 2 (High diversity, low fitness): ",
            count_where(low_fitness & high_divergence),
        ),
        (
            "Percentage for Quadrant 3 (Low diversity, low fitness): ",
            count_where(low_fitness & low_divergence),
        ),
        (
            "Percentage for Quadrant 4 (Low diversity, high fitness): ",
            count_where(high_fitness & low_divergence),
        ),
    ]
    # Every row must land in exactly one quadrant.
    assert sum(count for _, count in labelled_counts) == n_rows

    # Compute all percentages before printing any of them.
    labelled_percentages = [
        (label, (count / n_rows) * 100) for label, count in labelled_counts
    ]
    for label, percentage in labelled_percentages:
        print(label, percentage)

    # Above 1.1 norm fitness and greater than average parental divergence
    very_rare_count = count_where((fitness > 1.1) & high_divergence)
    very_rare_percent = (very_rare_count / n_rows) * 100
    print("Percentage of recombinants that are above 1.1 normalized fitness (by max parent) and have a greater than average parental divergence: ", very_rare_percent)

### Divergence vs Fitness Quadrant Percentages 

In [13]:
# Print the quadrant percentages for the frame returned by
# analysis.getRecombData().
quadrant_percentages(analysis.getRecombData())

Percentage for Quadrant 1 (High diversity, high fitness):  6.134157105030892
Percentage for Quadrant 2 (High diversity, low fitness):  28.067078552515447
Percentage for Quadrant 3 (Low diversity, low fitness):  52.162400706090025
Percentage for Quadrant 4 (Low diversity, high fitness):  13.636363636363635
Percentage of recombinants that are above 1.1 normalized fitness (by max parent) and have a greater than average parental divergence:  0.7943512797881729


In [14]:
def neutral_and_divergent(df):
    """
    Print the percentage of recombinants that are near neutral and divergent.

    A row qualifies when its "RecombFitnessNormalizedByMaxParents" value lies
    between 0.98 and 1.02 (as decided by polars' ``is_between`` with its
    default bound handling) and its "ParentsHD" value is strictly above the
    mean "ParentsHD" of ``df``. The percentage is taken over all rows of
    ``df``.

    Parameters
    ----------
    df : polars.DataFrame
        Frame containing the two columns named above.

    Returns
    -------
    None
        The result is printed.
    """
    divergence = pl.col("ParentsHD")
    near_neutral = pl.col("RecombFitnessNormalizedByMaxParents").is_between(0.98, 1.02)

    n_rows = len(df)
    mean_divergence = df.select(divergence.mean()).head(1).item()

    matching_rows = df.filter(near_neutral & (divergence > mean_divergence))
    share = (len(matching_rows) / n_rows) * 100
    print("Percentage of recombinants that are near neutral and divergent cases: ", share)

### Percentage of recombinants that have near neutral fitness and arise from divergent parents

In [15]:
# Print the near-neutral, divergent-parent percentage for the frame returned
# by analysis.getRecombData().
neutral_and_divergent(analysis.getRecombData())

Percentage of recombinants that are near neutral and divergent cases:  12.400706090026478


# Figure 3c Analysis: Calculating the mean and standard deviation of recombinant fitness when normalized by minimum of parental fitness

In [16]:
# Unpack the two statistics from the values of the returned mapping.
# NOTE(review): this relies on .values() yielding the mean first and the
# standard deviation second — confirm against getRecombinationMinFitnessStats.
recombination_norm_by_min_mean, recombination_norm_by_min_std_dev = analysis.getRecombinationMinFitnessStats().values()
print("Mean fitness for recombinants (normalized by min of parental fitness) found in MAT: ", recombination_norm_by_min_mean)
print("Standard deviation of recombinants (normalized by min of parental fitness) found in MAT: ", recombination_norm_by_min_std_dev)

Mean fitness for recombinants (normalized by min of parental fitness) found in MAT:  1.1351193435012363
Standard deviation of recombinants (normalized by min of parental fitness) found in MAT:  0.37676604798203955


# Figure 3d Analysis: Recombinant fitness compared to parental fitness

In [17]:
def calculate_percentages(data):
    """
    Print the percentage of values below 0, above 1, and within [0, 1].

    Parameters
    ----------
    data : array-like of numbers
        Values to classify. Anything ``np.asarray`` accepts works (NumPy
        array, list, ...). The input is flattened, so a single-column
        ``(n, 1)`` array is treated as ``n`` values.

    Returns
    -------
    None
        The three percentages are printed, rounded to two decimals. For
        empty input a notice is printed instead, since the percentages are
        undefined.

    Raises
    ------
    AssertionError
        If some values fall into none of the three bins (e.g. NaN).
    """
    values = np.asarray(data).ravel()
    total = values.size
    if total == 0:
        # Avoid 0/0 (nan plus a RuntimeWarning) on empty input.
        print("No data provided; percentages are undefined.")
        return

    below_0 = np.sum(values < 0)
    above_1 = np.sum(values > 1)
    between_0_and_1 = np.sum((values >= 0) & (values <= 1))
    # NaN compares False everywhere, so it would be silently dropped from all
    # three bins; fail loudly instead.
    assert total == (between_0_and_1 + below_0 + above_1), (
        "some values are neither < 0, > 1, nor within [0, 1] (NaN present?)"
    )

    print("Percentage of data below 0: ", round((below_0 / total) * 100, 2))
    print("Percentage of data above 1: ", round((above_1 / total) * 100, 2))
    print("Percentage of data between 0 and 1 (both inclusive): ", round((between_0_and_1 / total) * 100, 2))


In [18]:
# Extract the 'NormFitness' column as a NumPy array and report how much of it
# lies below 0, above 1, and within [0, 1].
# NOTE(review): to_numpy() on a one-column selection presumably yields an
# (n, 1) array, so len() inside calculate_percentages counts rows — confirm.
norm_fitness_data = analysis.getNormFitness().select('NormFitness').to_numpy()
calculate_percentages(norm_fitness_data)

Percentage of data below 0:  15.36
Percentage of data above 1:  19.77
Percentage of data between 0 and 1 (both inclusive):  64.87
