In [1]:
# Uncomment and run to reload libs
# import importlib
# import utils
# importlib.reload(utils)
# import utils.modelling
# importlib.reload(utils.modelling)
# import utils.reader
# importlib.reload(utils.reader)


import json
import gc

import numpy as np
import pandas as pd

from utils import (
    logging,
    DUMMY_SEQ_NAMES_FILE,
    MUTATION_SCORES_FILE,
    RECOMMENDED_MUTATIONS_FILE,
)


In [2]:
# Prediction from the model
mutation_scores: pd.DataFrame = pd.read_feather(MUTATION_SCORES_FILE)

# The added dummy sequences
with open(DUMMY_SEQ_NAMES_FILE) as f:
    dummy_seq_names = json.load(f)


In [3]:
pred_scores = mutation_scores.loc[mutation_scores["Accession"].isin(
    dummy_seq_names)]
pred_scores = pred_scores.melt(id_vars=["Accession", "Protein", "Pos"])
logging.info("Score matrix melted")


In [4]:
pred_scores

Unnamed: 0,Accession,Protein,Pos,variable,value
0,Spike_1M,Spike,1,A,-0.000979
1,Spike_1M,Spike,2,A,-0.001310
2,Spike_1M,Spike,3,A,-0.001350
3,Spike_1M,Spike,4,A,-0.000928
4,Spike_1M,Spike,5,A,-0.001479
...,...,...,...,...,...
70639435,Spike_1508Q,Spike,1503,Z,-0.000702
70639436,Spike_1508Q,Spike,1504,Z,-0.000621
70639437,Spike_1508Q,Spike,1505,Z,-0.000704
70639438,Spike_1508Q,Spike,1506,Z,-0.000629


In [5]:
dummy_seq_name_split = pred_scores["Accession"].str.split("_").str
pred_scores["Ref_protein"] = dummy_seq_name_split[0]
pred_scores["Ref_pos"] = dummy_seq_name_split[1].str.extract(
    r"(\d+)").astype(int)
pred_scores = pred_scores[~((pred_scores["Pos"] == pred_scores["Ref_pos"]) &
                            (pred_scores["Protein"] == pred_scores["Ref_protein"]))]
logging.info("Known mut rows removed")
# del dummy_seq_name_split, pred_scores["Ref_protein"], pred_scores["Ref_pos"]
# logging.info("Delete to release memory (hopefully)")

pred_scores["Mut"] = pred_scores["Protein"] + "_" + \
    pred_scores["Pos"].astype(str) + pred_scores["variable"]

pred_scores["Ref_pos_greater"] = ((pred_scores["Protein"] == pred_scores["Ref_protein"]) &
                                  (pred_scores["Ref_pos"] > pred_scores["Pos"]))
del (pred_scores["Protein"], pred_scores["Pos"], pred_scores["Ref_protein"],
     pred_scores["Ref_pos"], pred_scores["variable"])

pred_scores["Accession"], pred_scores["Mut"] = np.where(
    pred_scores["Ref_pos_greater"],
    [pred_scores["Mut"], pred_scores["Accession"]],
    [pred_scores["Accession"], pred_scores["Mut"]],
)
logging.info("Mutation pair per row sorted by pos")
del pred_scores["Ref_pos_greater"]

gc.collect()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_scores["Mut"] = pred_scores["Protein"] + "_" + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_scores["Ref_pos_greater"] = ((pred_scores["Protein"] == pred_scores["Ref_protein"]) &
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_scores["Accession"], pred_scores["Mut"] = np.where(


331

In [6]:
pred_scores.to_csv(RECOMMENDED_MUTATIONS_FILE, index=False)
logging.info(f"{RECOMMENDED_MUTATIONS_FILE} saved!")
