In [2]:
import numpy as np
from scipy.optimize import minimize
import matplotlib.pyplot as plt
import itertools
import pandas as pd
import scipy
from tqdm.notebook import tqdm  

In [3]:

train_data = np.array([1,.68,.48,.32,.07])
train_data = train_data/np.sum(train_data)
train_data


array([0.39215686, 0.26666667, 0.18823529, 0.1254902 , 0.02745098])

In [4]:
# Gene orders of variants from 2 datasets with DOIs
pmg_combos = np.array([[1,2,3,4,5],[1,4,2,3,5],[1,3,4,2,5],[1,4,3,2,5],[1,2,4,3,5],[1,3,2,4,5]]) # 10.1128/jvi.73.6.4705-4712.1999

ng_combos = np.array([[1,2,3,4,5],[2, 3, 4, 1, 5], [4,1,2,3,5], [4,2,3,1,5]]) # 10.1128/jvi.74.17.7895-7902.2000




In [5]:
# set genome positions for each variant
gene_lengths = np.array([1325, 813, 830, 1664, 6372])

pmg_lenpos = []
for pos_pre in pmg_combos:
    pos = pos_pre
    gene_len_temp = [gene_lengths[i-1] for i in pos]
    genome_pos = [np.sum(gene_len_temp[0:i]) + (i-1) * 10 for i in pos]
    genome_pos = np.array(genome_pos - np.min(genome_pos))
    pmg_lenpos.append(genome_pos)
    
ng_lenpos = []
for pos_pre in ng_combos:
    pos = pos_pre
    gene_len_temp = [gene_lengths[i-1] for i in pos]
    genome_pos = [np.sum(gene_len_temp[0:i]) + (i-1) * 10 for i in pos]
    genome_pos = np.array(genome_pos - np.min(genome_pos))
    ng_lenpos.append(genome_pos)

    

In [6]:
ng_data = np.array([[0.282, 0.195, 0.252, 0.251, 0.02 ],
                    [0.162, 0.118, 0.135, 0.578, 0.007],
                    [0.151, 0.271, 0.354, 0.21 , 0.014],
                    [0.108, 0.182, 0.284, 0.415, 0.011]])

In [7]:
p = pd.read_csv("Folder1-Parameter_Fits/all_viruses_best_fits.csv").iloc[0, -3]
p

0.9997023665202216

In [8]:

ng_RAM_preds = []
for i in range(0, ng_data.shape[0]):
    labmda_pred = np.array([p**x for x in ng_lenpos[i]])
    labmda_pred = labmda_pred/np.sum(labmda_pred)
    ng_RAM_preds.append(labmda_pred)
ng_RAM_preds


[array([0.35486196, 0.27775517, 0.21630527, 0.13141735, 0.01966024]),
 array([0.2975265 , 0.18076371, 0.12148501, 0.38205044, 0.01817435]),
 array([0.15348429, 0.37466646, 0.25180031, 0.19708745, 0.02296149]),
 array([0.14307611, 0.27336993, 0.21289021, 0.34925934, 0.02140441])]

In [9]:
pmg_data = np.array([[100, 81.5, 73.0, 58.8 ,2.6],
                     [100, 53.3, 97.1, 87.1, 4.1],
                     [100, 75.3, 78.5, 99, 4.7],
                     [100, 57.5, 71.6, 99, 1.7],
                     [100, 89.1, 70.1, 92.5, 2.9],
                     [100, 74.9, 99, 70.7, 3.4]]).T
pmg_data = (pmg_data/np.sum(pmg_data, axis = 0)).T


In [10]:

pmg_RAM_preds = []
for i in range(0, pmg_data.shape[0]):
    labmda_pred = np.array([p**x for x in pmg_lenpos[i]])
    labmda_pred = labmda_pred/np.sum(labmda_pred)
    pmg_RAM_preds.append(labmda_pred)
pmg_RAM_preds


[array([0.35486196, 0.27775517, 0.21630527, 0.13141735, 0.01966024]),
 array([0.39859177, 0.14761197, 0.24216641, 0.18954687, 0.02208298]),
 array([0.37346326, 0.17670076, 0.13830604, 0.29083913, 0.0206908 ]),
 array([0.3989735 , 0.14775334, 0.1887707 , 0.24239833, 0.02210413]),
 array([0.37257951, 0.29162293, 0.13797875, 0.17717697, 0.02064184]),
 array([0.35536019, 0.21660897, 0.27674114, 0.13160186, 0.01968785])]

In [11]:
variants = ["NPMGL (WT)", "NMGPL", "NGPML", "NGMPL", "NPGML", "NMPGL"] 
variants += ["NPMGL (WT)", "GNPML", "PMGNL", "GPMNL"]
Dataset_DOIs = ["10.1128/jvi.73.6.4705-4712.1999"] * 6 + ["10.1128/jvi.74.17.7895-7902.2000"] * 4


data_df = pd.DataFrame(list(pmg_data) + list(ng_data))

data_df = pd.concat([pd.DataFrame(variants), data_df, pd.DataFrame(Dataset_DOIs)], axis = 1)

data_df.columns = ["Variant", "N", "P", "M", "G", "L", "Dataset"]

data_df.to_csv("Folder3-Gene_Shuffle_Predictions/prot_expression_ratio_data.csv")


In [12]:
preds_df = pd.DataFrame(list(pmg_RAM_preds) + list(ng_RAM_preds))

preds_df = pd.concat([pd.DataFrame(variants),preds_df], axis = 1)

preds_df.columns = ["Variant", "N", "P", "M", "G", "L"]

preds_df.to_csv("Folder3-Gene_Shuffle_Predictions/expression_ratio_predictions.csv")