In [1]:
# Uncomment and run to reload libs
# import importlib
# import utils
# importlib.reload(utils)
# import utils.miscellaneous
# importlib.reload(utils.miscellaneous)
# import utils.reader
# importlib.reload(utils.reader)


import json

import pandas as pd

from utils import (
    logging,
    TRAINING_DATA_FILE,
    DUMMY_SEQ_NAMES_FILE
)
from utils.reader import AminoAcidComboReader


In [2]:
# Instantiate the project reader and fetch the table that the rest of the
# notebook treats as the raw training data.
# NOTE(review): judging by its name, before_date_data() returns only records
# up to some cutoff date; the cutoff and the data source are decided inside
# utils.reader and are not visible here — confirm there.
aa_combo_reader: AminoAcidComboReader = AminoAcidComboReader()
training_data = aa_combo_reader.before_date_data()


In [3]:
# Preview the loaded table: one row per (sequence accession, protein site)
# with the amino acid observed there ("To"). The frame is large, so the
# rich display only shows the first and last few rows.
training_data

Unnamed: 0,Protein,To,Pos,Accession,Date,Lineage,AA
0,Spike,G,614,EPI_ISL_464302,2020-02-03,B.1,Spike_614G
1,Spike,L,1263,EPI_ISL_464302,2020-02-03,B.1,Spike_1263L
2,Spike,G,1124,EPI_ISL_464302,2020-02-03,B.1,Spike_1124G
3,Spike,G,769,EPI_ISL_464302,2020-02-03,B.1,Spike_769G
4,Spike,L,5,EPI_ISL_464302,2020-02-03,B.1,Spike_5L
...,...,...,...,...,...,...,...
1409675,Spike,N,1329,EPI_ISL_453133,2020-04-27,B.57,Spike_1329N
1409676,Spike,Q,1373,EPI_ISL_453133,2020-04-27,B.57,Spike_1373Q
1409677,Spike,N,1305,EPI_ISL_453133,2020-04-27,B.57,Spike_1305N
1409678,Spike,G,550,EPI_ISL_453133,2020-04-27,B.57,Spike_550G


In [4]:
# Give every distinct (Protein, Pos) site an integer 'Pos_id'.
# Sites are numbered 0..n-1 in (Protein, Pos) sort order, then the id is
# joined back onto every row of the training table.
all_pos: pd.DataFrame = (
    training_data[["Protein", "Pos"]]
    .drop_duplicates()
    .sort_values(["Protein", "Pos"])
    .reset_index(drop=True)   # fresh 0..n-1 index in sorted order
    .rename_axis("Pos_id")    # name the index so it becomes the id column
    .reset_index()
)

training_data = training_data.merge(all_pos, on=["Protein", "Pos"])


In [5]:
# Give every distinct accession an integer 'Seq_id'.
# Ids follow the order in which accessions first appear in the table.
seq_info = (
    training_data[["Accession"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(Seq_id=lambda frame: frame.index)
)
training_data = training_data.merge(seq_info, on="Accession")
# Log the row count after the join as a quick sanity check.
logging.info(f"{len(training_data.index)}")


In [6]:
# One row per distinct (Protein, Pos, To) state seen in the data, labelled
# with a "<Protein>_<Pos><To>" string (e.g. "Spike_614G") in 'Mutation'.
dummy_seqs = (
    training_data[["Protein", "To", "Pos", "Pos_id"]]
    .drop_duplicates()
    .assign(
        Mutation=lambda frame: (
            frame["Protein"] + "_" + frame["Pos"].astype(str) + frame["To"]
        )
    )
)


In [7]:
# Spot-check: list every amino-acid state recorded at position 614.
# The filter is on 'Pos' only, so rows from any protein with a site 614
# would be included.
dummy_seqs[dummy_seqs["Pos"] == 614]

Unnamed: 0,Protein,To,Pos,Pos_id,Mutation
0,Spike,G,614,584,Spike_614G
1000980,Spike,D,614,584,Spike_614D
1009020,Spike,N,614,584,Spike_614N


In [8]:
# Append one synthetic ("dummy") row per distinct (Protein, Pos, To) state so
# that every observed state is also present as a standalone record whose
# accession is the mutation label itself.
#
# Every column is passed as a plain array. Mixing in a Series would make the
# new frame silently adopt dummy_seqs's sparse, non-contiguous index, and the
# concat below would then carry duplicate index labels.
#
# NOTE(review): Seq_id is derived from Pos_id, so dummy rows for different
# amino acids at the same site (e.g. Spike_614G / Spike_614D) share one
# Seq_id even though their Accession differs — confirm this is intended.
# NOTE(review): 'AA' is not populated here, so it will be NaN for the dummy
# rows if training_data carries that column — confirm consumers tolerate it.
n_real_seqs = len(seq_info["Seq_id"])
dummy_rows = pd.DataFrame({
    "Accession": dummy_seqs["Mutation"].values,
    "Lineage": "None",
    # Dummy rows are stamped with the latest date present in the real data.
    "Date": training_data["Date"].max(),
    # Offset past the real sequence ids so the two ranges cannot overlap.
    "Seq_id": dummy_seqs["Pos_id"].values + n_real_seqs,
    "Protein": dummy_seqs["Protein"].values,
    "Pos": dummy_seqs["Pos"].values,
    "To": dummy_seqs["To"].values,
    "Pos_id": dummy_seqs["Pos_id"].values
})
# ignore_index gives the combined table a clean 0..n-1 index.
training_data = pd.concat([training_data, dummy_rows], ignore_index=True)
logging.info(f"{len(training_data)} after dummy added")


In [9]:
# Rename 'To' to 'AA_state' and attach an integer code 'AA_idx' to each
# state. Codes are the positions of the states in sorted order.
training_data = training_data.rename(columns={"To": "AA_state"})

aa_table = (
    pd.Series(sorted(training_data["AA_state"].unique()), name="AA_state")
    .rename_axis("AA_idx")   # the 0..k-1 index becomes the code column
    .reset_index()
)

training_data = training_data.merge(aa_table, on="AA_state")


In [10]:
# Persist the prepared training table and the list of dummy-sequence names.
#
# to_feather requires a default RangeIndex. reset_index returns a new frame,
# so the result must be assigned; the bare call was a no-op.
training_data = training_data.reset_index(drop=True)
training_data.to_feather(TRAINING_DATA_FILE)
logging.info(f"{TRAINING_DATA_FILE} saved!")

# Save the dummy accession labels (the 'Mutation' strings) as a JSON array
# so later steps can tell dummy rows apart from real sequences.
with open(DUMMY_SEQ_NAMES_FILE, "w") as f:
    json.dump(dummy_seqs["Mutation"].tolist(), f)
