In [None]:
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
from scipy.optimize import minimize
from tqdm.notebook import tqdm

In [None]:
# Observed RSV counts: 10 rows x 3 columns.
# NOTE(review): the row labels below assume the rows follow the gene order of
# virus_gene_names[0] and that the columns are repeated measurements -- confirm
# against the data source.
RSV_data = np.array(
    [
        [21523, 18363, 18354],  # NS1
        [17765, 14929, 14681],  # NS2
        [7910, 6497, 7076],     # N
        [8710, 7052, 7234],     # P
        [8470, 7248, 6867],     # M
        [5809, 4852, 4782],     # SH
        [5530, 4089, 5031],     # G
        [3802, 2929, 2997],     # F
        [3191, 2742, 2665],     # M2
        [33, 12, 29],           # L
    ]
)
RSV_data

In [None]:
# Ebola gradient: two series of 7 values, stacked as columns -> shape (7, 2).
# NOTE(review): presumably one row per gene and one column per measurement -- confirm.
EBOV_data = np.column_stack((
    [3322, 3013, 2345, 1697, 1079, 584, 228],
    [651, 663, 435, 482, 254, 180, 43],
))
EBOV_data

In [None]:
# Marburg gradient: two series of 7 values, stacked as columns -> shape (7, 2).
# NOTE(review): presumably one row per gene and one column per measurement -- confirm.
MV_data = np.column_stack((
    [851, 688, 493, 388, 336, 211, 48],
    [108, 103, 57, 59, 44, 30, 5],
))
MV_data

In [None]:
# Virus labels; every per-virus list in this notebook is indexed in this order.
viruses = ["RSV", "EBOV", "MV"]

# Gene names for each virus, in the order used for the data rows and plots.
virus_gene_names = [
    ["NS1", "NS2", "N", "P", "M", "SH", "G", "F", "M2", "L"],   # RSV
    ["NP", "VP35", "VP40", "GP", "VP30", "VP24", "L"],          # EBOV
    ["NP", "VP35", "VP40", "GP", "VP30", "VP24", "L"],          # MV
]

# Observed expression data for each virus, one array per entry of `viruses`.
virus_data = [RSV_data, EBOV_data, MV_data]




In [None]:
# Gene positions for each virus, shifted so that the smallest position is 0.
# NOTE(review): the raw numbers look like genome coordinates of each gene --
# confirm which landmark (gene start, gene end, ...) they refer to.
RSV_genome_pos = np.array([576, 1098, 2327, 3242, 4209, 4628, 5595, 7550, 8557, 15067])
RSV_genome_pos = RSV_genome_pos - RSV_genome_pos.min()

EBOV_genome_pos = np.array([3026, 4407, 5894, 8305, 9740, 11518, 18282])
EBOV_genome_pos = EBOV_genome_pos - EBOV_genome_pos.min()

MV_genome_pos = np.array([2844, 4410, 5819, 8670, 10016, 11285, 19037])
MV_genome_pos = MV_genome_pos - MV_genome_pos.min()

# Same ordering as `viruses`.
virus_genome_pos = [RSV_genome_pos, EBOV_genome_pos, MV_genome_pos]


In [None]:
# For each virus (same order as `viruses`): row indices of the genes whose
# predicted expression is additionally scaled by (1 - p_trans) in the model.
overlaps = [
    [9],        # RSV
    [2, 4, 6],  # EBOV
    [5],        # MV
]

In [None]:
# objective function for fitting model

def obj_fun(x, positions=None, overlap_idx=None, counts=None):
    """Negative Poisson log likelihood of the expression-gradient model.

    The unnormalised prediction for a gene at position ``d`` is
    ``p**d * p_trans``; genes listed in ``overlap_idx`` are additionally
    multiplied by ``1 - p_trans``.  Predictions are normalised to sum to 1 and
    used as the Poisson rate for every column of ``counts``.

    Parameters
    ----------
    x : sequence of two floats
        ``x[0]`` is ``p`` and ``x[1]`` is ``p_trans``.
    positions : array-like, optional
        Gene positions.  Defaults to the module-level ``genome_pos``.
    overlap_idx : sequence of int, optional
        Indices of the overlapping genes.  Defaults to the module-level ``ov``.
    counts : array-like, optional
        Observed counts, 1-D (one value per gene) or 2-D (genes x measurements).
        Defaults to the module-level ``dv``.

    Returns
    -------
    float
        Minus the summed Poisson log-pmf, or ``np.inf`` when that sum is NaN
        so that the minimiser treats the point as infeasible.
    """
    # Fall back to the notebook globals so that ``minimize(obj_fun, x0)`` keeps working.
    if positions is None:
        positions = genome_pos
    if overlap_idx is None:
        overlap_idx = ov
    if counts is None:
        counts = dv

    p = x[0]
    p_trans = x[1]
    pred = p_trans * p ** np.asarray(positions, dtype=float)
    # Scale every overlapping gene exactly once.  The previous
    # ``for t in ov: pred[ov] = ...`` loop ignored ``t`` and rescaled the whole
    # set on each pass, i.e. applied (1 - p_trans) ** len(ov).
    pred[np.asarray(overlap_idx, dtype=int)] *= (1 - p_trans)
    lambda_pred = pred / np.sum(pred)

    counts = np.asarray(counts)
    # Broadcast the per-gene rate over the measurement columns when counts is 2-D.
    rates = lambda_pred if counts.ndim == 1 else lambda_pred[:, None]
    LL = np.sum(scipy.stats.poisson.logpmf(counts, rates))
    if np.isnan(LL):
        return np.inf
    return LL * -1
    

In [None]:
# Iterate through each virus and fit the model 200,000 times, each time
# starting from a random point inside the parameter bounds.

N_RESTARTS = 200000
PARAM_BOUNDS = [(0.5, 1), (0, 1)]   # (lower, upper) for [p, p_trans]
RANDOM_SEED = 0                     # fixed so that Restart & Run All reproduces the fits

np.random.seed(RANDOM_SEED)
lower_bounds = np.array([low for low, _ in PARAM_BOUNDS], dtype=float)
upper_bounds = np.array([high for _, high in PARAM_BOUNDS], dtype=float)

all_fits = []
all_funcs = []
for j in range(len(viruses)):
    # obj_fun reads these three module-level names, so set them before minimising.
    dv = np.array(virus_data[j])
    ov = overlaps[j]
    genome_pos = virus_genome_pos[j]

    fits = []
    funcs = []
    for _ in tqdm(range(N_RESTARTS), desc=viruses[j]):
        # Uniform random start inside the bounds.
        start = lower_bounds + (upper_bounds - lower_bounds) * np.random.random(2)
        res = minimize(obj_fun, start, bounds=PARAM_BOUNDS, tol=1e-20)
        fits.append(res.x)
        funcs.append(res.fun * -1)   # obj_fun is the negative log likelihood
    all_fits.append(fits)
    all_funcs.append(funcs)
all_fits = np.array(all_fits)




In [None]:
# Save all fits, then generate and save a file with the statistics
# associated with the top 100 fits (highest log likelihood) for each virus.

TOP_N_FITS = 100
PARAM_NAMES = ["p(walk)", "ptransc"]

# to_csv does not create missing folders.
os.makedirs("File3-All_Parameter_Fits_overlaps", exist_ok=True)
os.makedirs("File4-Top_100_Fit_Stats_overlaps", exist_ok=True)

for i in range(len(viruses)):
    df = pd.concat([pd.DataFrame(all_fits[i]), pd.DataFrame(all_funcs[i])], axis=1)
    df.columns = PARAM_NAMES + ["Log Likelihood"]
    df.to_csv(f"File3-All_Parameter_Fits_overlaps/{viruses[i]}_fits.csv")

    # nlargest always returns rows (sorted best-first), even when many fits share
    # the same likelihood; the former strict `> quantile` filter could come back
    # empty in that case and crash on `.iloc[0]`.
    top_fits = df.nlargest(TOP_N_FITS, "Log Likelihood")
    best_fit = top_fits.iloc[0]
    top_params = top_fits[PARAM_NAMES]

    df_stats = pd.DataFrame({
        "Mean of Top 100 Fit Estimates": top_params.mean(),
        # ddof=0 (population std) matches what np.std computed here before.
        "Std of Top 100 Fit Estimates": top_params.std(ddof=0),
        "Max of Top 100 Fit Estimates": top_params.max(),
        "Min of Top 100 Fit Estimates": top_params.min(),
        "Best Fit Estimate": best_fit[PARAM_NAMES],
        # Scalar: repeated on both parameter rows, as in the saved file layout.
        "Best Fit Log Likelihood": best_fit["Log Likelihood"],
    })
    df_stats.to_csv(f"File4-Top_100_Fit_Stats_overlaps/{viruses[i]}_fits.csv")
    
    


In [None]:
# Get the best fit parameters and log likelihood of each virus from the
# per-virus statistics files.

stats_frames = [
    pd.read_csv(f"File4-Top_100_Fit_Stats_overlaps/{virus}_fits.csv") for virus in viruses
]
dfs = pd.concat(stats_frames)

# One Series per virus holding [p(walk), ptransc] from the second-to-last column
# ("Best Fit Estimate").  Built per file so it follows len(viruses) instead of
# hard-coded row slices.
best_fit_pars = [frame.iloc[:, -2] for frame in stats_frames]

# The log likelihood is repeated on both parameter rows of a virus; keep one per virus.
log_likeli = dfs.iloc[::2, -1]


In [None]:
# Generate barplots of observed gradients and predicted gradients

os.makedirs("Figures/Fig3", exist_ok=True)   # savefig does not create missing folders

for i in range(len(viruses)):
    genome_pos = virus_genome_pos[i]
    pars = best_fit_pars[i]
    ov = overlaps[i]
    p = pars[0]
    p_trans = pars[1]

    # Model prediction at the best-fit parameters.
    pred = p_trans * p ** np.asarray(genome_pos, dtype=float)
    # Scale every overlapping gene exactly once (the former `for t in ov` loop
    # rescaled the whole set len(ov) times).
    pred[ov] = pred[ov] * (1 - p_trans)
    pred_norm = pred / np.sum(pred)

    categories = virus_gene_names[i]
    indices = np.arange(len(categories))

    # Create figure and axis
    fig, ax = plt.subplots(figsize=(10, 6))
    bar_width = 0.35
    ax.set_title(f'{viruses[i]}', fontsize=28)

    # Observed proportions: a single series is plotted as is; with several
    # columns each column is normalised separately and the bars show the
    # mean with error bars of 2 standard deviations across columns.
    if len(virus_data[i].shape) == 1:
        data_norm = virus_data[i] / np.sum(virus_data[i])
        ax.bar(indices - bar_width/2, data_norm, width=bar_width, color='#0072B2', edgecolor='black', label='Observed Ratio')
    else:
        data_norm = virus_data[i] / np.sum(virus_data[i], axis=0)
        ax.bar(indices - bar_width/2, np.mean(data_norm, axis=1), width=bar_width, yerr=2 * np.std(data_norm, axis=1), color='#0072B2', edgecolor='black', label='Observed Ratio')

    # Model predictions next to the observed bars
    ax.bar(indices + bar_width/2, pred_norm, width=bar_width, color='red', edgecolor='black', label='Model Predictions')

    # Legend, ticks, labels and grid
    ax.legend(loc='best', fontsize=20, frameon=True, shadow=True)
    ax.tick_params(axis='y', labelsize=18)
    ax.set_xticks(indices)
    ax.set_xticklabels(categories, fontsize=20)
    ax.set_ylim([0, .52])
    ax.set_ylabel('Proportion of Total Viral mRNA', fontsize=20)
    ax.set_xlabel('Gene', fontsize=20)
    ax.grid(True, which='both', axis='y', linestyle='--', linewidth=0.5)

    # Annotate with the best-fit log likelihood of this virus
    best_log_likelihood = log_likeli.iloc[i]
    ax.text(.975, 0.725, 'Log Likelihood: {:.3E}'.format(best_log_likelihood), transform=ax.transAxes, fontsize=30,
            verticalalignment='top', horizontalalignment='right', bbox=dict(facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.savefig(f"Figures/Fig3/{viruses[i]}_Gradients.svg")
    plt.show()
    plt.close(fig)   # release the figure once it has been saved and shown
    
    
    

In [None]:
# Load the per-virus statistics and display 1 / (1 - p) for the best-fit
# p(walk) of each virus.
# NOTE(review): presumably read as a mean run length of the walk -- confirm.
stats_frames = [
    pd.read_csv(f"File4-Top_100_Fit_Stats_overlaps/{virus}_fits.csv") for virus in viruses
]
dfs = pd.concat(stats_frames)

# Label every row with its virus (one label per row of each file).
dfs.index = [virus for virus, frame in zip(viruses, stats_frames) for _ in range(len(frame))]

# Keep the first row of each pair (the p(walk) row).  Copy so that the
# assignment below writes to an independent frame, not to a slice of `dfs`.
dfs = dfs.iloc[::2].copy()
dfs.iloc[:, 0] = viruses
1 / (1 - dfs.iloc[:, -2])

In [None]:

# Combine the per-virus statistics into one formatted table and save it.
stats_frames = [
    pd.read_csv(f"File4-Top_100_Fit_Stats_overlaps/{virus}_fits.csv") for virus in viruses
]
dfs = pd.concat(stats_frames)
dfs.index = [virus for virus, frame in zip(viruses, stats_frames) for _ in range(len(frame))]

# Show the third column (std of the top fits) and the last column (best-fit log
# likelihood) in scientific notation.  Whole-column assignment replaces the
# float column with a string column; writing strings into a float column
# through `.iloc[:, k] = ...` is a dtype-incompatible in-place set.
std_col = dfs.columns[2]
loglik_col = dfs.columns[-1]
dfs[std_col] = ['{:.2e}'.format(value) for value in dfs[std_col]]
dfs[loglik_col] = ['{:.2e}'.format(value) for value in dfs[loglik_col]]

# Round the remaining numeric columns.
dfs = dfs.round(5)
dfs.columns = ["Parameter"] + list(dfs.columns)[1:]
dfs.to_csv("File4-Top_100_Fit_Stats_overlaps/All_Viruses_Processivities_Overlap_Gradient.csv")
dfs


In [None]:

# Reload the combined summary table from disk.  The MCMC cell below takes its
# starting values from the second-to-last column of this frame, i.e. from the
# values as they were rounded before being written to this file.
summary_csv_path = "File4-Top_100_Fit_Stats_overlaps/All_Viruses_Processivities_Overlap_Gradient.csv"
dfs = pd.read_csv(summary_csv_path)


In [None]:
# define code for performing MCMC
# This code is taken from the MCMC section of the UQ-Bio 2024 Github
# https://github.com/MunskyGroup/uqbio2024

def likelihood_ge(x, positions=None, overlap_idx=None, counts=None):
    """Poisson log likelihood of the expression-gradient model.

    The unnormalised prediction for a gene at position ``d`` is
    ``p**d * p_trans``; genes listed in ``overlap_idx`` are additionally
    multiplied by ``1 - p_trans``.  Predictions are normalised to sum to 1 and
    used as the Poisson rate for every column of ``counts``.

    Parameters
    ----------
    x : sequence of two floats
        ``x[0]`` is ``p`` and ``x[1]`` is ``p_trans``.
    positions : array-like, optional
        Gene positions.  Defaults to the module-level ``genome_pos``.
    overlap_idx : sequence of int, optional
        Indices of the overlapping genes.  Defaults to the module-level ``ov``.
    counts : array-like, optional
        Observed counts, 1-D (one value per gene) or 2-D (genes x measurements).
        Defaults to the module-level ``dv``.

    Returns
    -------
    float
        The summed Poisson log-pmf, or ``-np.inf`` when that sum is NaN.
    """
    # Fall back to the notebook globals so single-argument calls keep working.
    if positions is None:
        positions = genome_pos
    if overlap_idx is None:
        overlap_idx = ov
    if counts is None:
        counts = dv

    p = x[0]
    p_trans = x[1]
    pred = p_trans * p ** np.asarray(positions, dtype=float)
    # Scale every overlapping gene exactly once.  The previous
    # ``for t in ov: pred[ov] = ...`` loop ignored ``t`` and rescaled the whole
    # set on each pass, i.e. applied (1 - p_trans) ** len(ov).
    pred[np.asarray(overlap_idx, dtype=int)] *= (1 - p_trans)
    lambda_pred = pred / np.sum(pred)

    counts = np.asarray(counts)
    # Broadcast the per-gene rate over the measurement columns when counts is 2-D.
    rates = lambda_pred if counts.ndim == 1 else lambda_pred[:, None]
    LL = np.sum(scipy.stats.poisson.logpmf(counts, rates))
    if np.isnan(LL):
        # This value is maximised, so an invalid point must score -inf.
        # Returning +inf (as before) would make a sampler always accept it.
        return -np.inf
    return LL
    
def prior_ge(h, p_mean=None, ptransc_mean=None):
    """Log prior: independent unit-variance normal densities on both parameters.

    Parameters
    ----------
    h : sequence of two floats
        ``h[0]`` is ``p`` and ``h[1]`` is ``ptransc``.
    p_mean, ptransc_mean : float, optional
        Prior means.  Default to the module-level ``p`` and ``ptransc``.

    Returns
    -------
    float
        ``log N(h[0]; p_mean, 1) + log N(h[1]; ptransc_mean, 1)``.
    """
    if p_mean is None:
        p_mean = p
    if ptransc_mean is None:
        ptransc_mean = ptransc
    # The ptransc term is evaluated at h[1]; it was previously evaluated at
    # h[0], so the prior never depended on the second parameter.
    return (scipy.stats.norm.logpdf(h[0], loc=p_mean, scale=1)
            + scipy.stats.norm.logpdf(h[1], loc=ptransc_mean, scale=1))


def posterior_unnormalised_ge(h):
    """Return ``likelihood_ge(h) + prior_ge(h)`` for the parameter vector ``h``."""
    likelihood_term = likelihood_ge(h)
    prior_term = prior_ge(h)
    return likelihood_term + prior_term

def logposterior_unnormalised_ge(x):
    """Unnormalised log posterior at ``x``: ``likelihood_ge(x) + prior_ge(x)``.

    This is the target function handed to the Metropolis-Hastings sampler.
    """
    return likelihood_ge(x) + prior_ge(x)



In [None]:

def simulate_step(log_target, proposal_width, x):
    """
    Take a single Metropolis-Hastings step from the current state `x`.

    A candidate is drawn from a normal distribution centred on `x` with
    standard deviation `proposal_width` in every coordinate. Candidates with
    any coordinate outside [0, 1] are rejected without evaluating
    `log_target`. Otherwise the candidate is accepted with probability
    min(1, exp(log_target(candidate) - log_target(x))).

    Returns the next sample: the candidate if accepted, else `x` itself.
    """
    candidate = x + proposal_width * np.random.randn(len(x))

    # Hard support constraint: stay inside the unit box.
    leaves_unit_box = np.any(candidate < 0) or np.any(candidate > 1)
    if leaves_unit_box:
        return x

    # Symmetric proposal, so only the target ratio is needed.
    log_ratio = np.sum(log_target(candidate) - log_target(x))
    if log_ratio > 0:
        return candidate

    # Downhill move: accept with probability exp(log_ratio).
    accept_probability = np.exp(log_ratio)
    draw = np.random.rand()
    return candidate if draw < accept_probability else x
def metropolis_hastings(log_target, proposal_width, nsteps, x0):
    """
    Run a Metropolis-Hastings chain of `nsteps` samples starting at `x0`.

    `log_target` is the logarithm of the target distribution and
    `proposal_width` the width of the normal proposal; both are passed on to
    `simulate_step`. Returns an array of shape (nsteps, len(x0)) whose first
    row is `x0` and whose row k is the state after k steps.
    """
    chain = np.zeros((nsteps, len(x0)))
    chain[0] = x0
    for step in tqdm(range(1, nsteps)):
        previous_state = chain[step - 1]
        chain[step] = simulate_step(log_target, proposal_width, previous_state)
    return chain

In [None]:

# Run one Metropolis-Hastings chain per virus, started at the best-fit
# parameters read from `dfs`, and drop the burn-in samples.
nsamples = 101000
burnin = 1000

MCMC_outs = []
for j in range(len(viruses)):
    # The likelihood and prior read these module-level names.
    dv = np.array(virus_data[j])
    ov = overlaps[j]
    genome_pos = virus_genome_pos[j]

    # Rows 2*j and 2*j + 1 hold p(walk) and ptransc of virus j; the
    # second-to-last column is the best-fit estimate.
    best_fit_rows = dfs.iloc[j*2:j*2 + 2]
    ptransc = best_fit_rows.iloc[1, -2]
    p = best_fit_rows.iloc[0, -2]

    start = np.array([p, ptransc])
    chain = metropolis_hastings(logposterior_unnormalised_ge, .01, nsamples, start)
    post_ge_mcmc = chain[burnin:]
    MCMC_outs.append(post_ge_mcmc)


    
    



In [None]:

# Stack the per-virus chains into one array and save the raw samples.
# np.save does not create missing folders, so make sure the target exists
# before writing the (expensive) MCMC output.
os.makedirs("File5-MCMC_Outputs", exist_ok=True)
MCMC_outs = np.array(MCMC_outs)
np.save("File5-MCMC_Outputs/overlap_model_MCMC_Raw.npy", MCMC_outs)


In [None]:




# Posterior summary (mean, median and 95% interval) per virus and parameter.
param_names = ["p(walk)", "ptransc"]
mcmc_samples = np.asarray(MCMC_outs)   # indexed as (virus, sample, parameter)

# Built column-wise so the numeric columns keep a float dtype; the row order is
# virus-major, matching the flattened (virus, parameter) statistics.
df = pd.DataFrame({
    "Virus": np.repeat(viruses, len(param_names)),
    "Parameter": param_names * len(viruses),
    "Mean": np.mean(mcmc_samples, axis=1).flatten(),
    "2.5% Quantile": np.quantile(mcmc_samples, .025, axis=1).flatten(),
    "50% Quantile": np.quantile(mcmc_samples, .5, axis=1).flatten(),
    "97.5% Quantile": np.quantile(mcmc_samples, .975, axis=1).flatten(),
})
df.to_csv("File5-MCMC_Outputs/overlap_model_MCMC_statistics.csv")
df


In [None]:
# Scatter plot of the posterior samples of each virus (one figure per virus).
for virus, samples in zip(viruses, MCMC_outs):
    fig, ax = plt.subplots()
    ax.scatter(samples[:, 0], samples[:, 1])
    ax.set_ylim(-.01, 1)
    ax.set_xlim(.999, 1)
    # Label the figure so the viruses can be told apart.
    ax.set_title(f"{virus}: MCMC samples")
    ax.set_xlabel("p(walk)")
    ax.set_ylabel("ptransc")
    plt.show()
    plt.close(fig)