In [1]:
import pandas as pd

from utils import (
    logging,
    TARGET_PROTEIN,
    SAMPLE_END_DATE,
    MUTATION_SCORE,
    MISSING_PENALTY,
    MUTATION_PER_SEQ_FILE,
    TRAINING_DATA_FILE,
)
from utils.miscellaneous import mut_seq_info


In [2]:
# Read the per-sequence mutation table and give every observed mutation the
# same constant score.
logging.info("Load data...")
df: pd.DataFrame = pd.read_feather(MUTATION_PER_SEQ_FILE)
df["Value"] = MUTATION_SCORE

# Parse the dates once, then keep only rows strictly before the sampling cutoff.
df["Date"] = pd.to_datetime(df["Date"])
before_cutoff = df["Date"] < SAMPLE_END_DATE
df = df.loc[before_cutoff]
logging.info(f"{mut_seq_info(df)} before {SAMPLE_END_DATE}")

# Keep mutations whose name mentions the target protein.
# Note: `str.contains` interprets its pattern as a regular expression by default.
on_target_protein = df["Mutation"].str.contains(TARGET_PROTEIN)
df = df.loc[on_target_protein]
logging.info(f"{mut_seq_info(df)} are on {TARGET_PROTEIN}")

# Discard mutations whose name contains "stop".
has_stop = df["Mutation"].str.contains("stop")
df = df.loc[~has_stop]
logging.info(f"{mut_seq_info(df)} are not stop codon")

# Discard mutations whose name contains "ins".
has_insertion = df["Mutation"].str.contains("ins")
df = df.loc[~has_insertion]
logging.info(f"{mut_seq_info(df)} are not insertion")


In [3]:
# Build a wide Accession x Mutation matrix. Cells with no observation receive
# MISSING_PENALTY; repeated (Accession, Mutation) pairs are combined with
# pivot_table's default aggregation (mean). `sort=False` keeps the labels in
# their order of appearance instead of sorting them.
mutation_socre_matrix = pd.pivot_table(
    df,
    values="Value",
    index="Accession",
    columns="Mutation",
    fill_value=MISSING_PENALTY,
    sort=False,
)
logging.info("pivot_table done!")

# Rows are compared on their column values only (the Accession index is not
# part of the comparison), so sequences with an identical mutation profile
# collapse onto the first one encountered.
mutation_socre_matrix = mutation_socre_matrix.drop_duplicates(keep="first")
logging.info("Duplicates removed")


In [4]:
# Reshape the wide score matrix into long form: one row per
# (Accession, Mutation) cell, with the cell content in "Value".
training_data = mutation_socre_matrix.stack().reset_index(name="Value")
# Disabled filter kept from the original notebook; enabling it would drop the
# cells that only hold the missing penalty.
# training_data = training_data[training_data["Value"] != MISSING_PENALTY]

# One metadata row per sequence. De-duplicating on "Accession" alone (first
# occurrence wins) guarantees a unique lookup index; de-duplicating on the
# whole (Accession, Lineage, Date) triple would leave several rows for an
# accession with inconsistent metadata and break the row-for-row assignment.
seq_info = (
    df[["Accession", "Lineage", "Date"]]
    .drop_duplicates(subset="Accession")
    .set_index("Accession")
)

# A single left join attaches both metadata columns while preserving the row
# order and row count of `training_data` (the right-hand index is unique).
training_data = training_data.join(seq_info, on="Accession")
logging.info("Lineage assigned")
logging.info("Dates assigned")

logging.info(f"{mut_seq_info(training_data)} are unique")

# Integer id per sequence, numbered in order of first appearance.
seq_names = pd.DataFrame(
    training_data["Accession"].unique(),
    columns=["Accession"]
)
seq_names["Seq_id"] = seq_names.index
training_data = training_data.merge(seq_names, on="Accession")
logging.info("Seq_id assigned")

# Integer id per mutation, numbered in order of first appearance.
mut_names = pd.DataFrame(
    training_data["Mutation"].unique(),
    columns=["Mutation"]
)
mut_names["Mut_id"] = mut_names.index
training_data = training_data.merge(mut_names, on="Mutation")
logging.info("Mut_id assigned")


In [5]:
# Append one dummy "sequence" per mutation: its Accession is the mutation name
# itself and its Seq_id continues numbering after the last real sequence.
# NOTE(review): the dummy score is the literal 1 — confirm whether it is meant
# to track MUTATION_SCORE.
n_sequences = len(seq_names["Seq_id"])
dummy_rows = pd.DataFrame({
    "Accession": mut_names["Mutation"].values,
    "Mutation": mut_names["Mutation"].values,
    "Value": 1,
    "Lineage": "None",
    # Coerce the cutoff to a timestamp so the concatenated "Date" column keeps
    # a datetime dtype; a non-datetime cutoff (e.g. a string) would otherwise
    # produce a mixed object column that Feather cannot serialise.
    "Date": pd.to_datetime(SAMPLE_END_DATE),
    "Seq_id": mut_names["Mutation"].index + n_sequences,
    "Mut_id": mut_names["Mutation"].index
})
# ignore_index=True renumbers the rows 0..n-1 in one step.
training_data = pd.concat([training_data, dummy_rows], ignore_index=True)
logging.info(f"{mut_seq_info(training_data)} after dummy added")

# Optional orderings left disabled in the original notebook.
# training_data = training_data.sort_values("Accession")
# training_data = training_data.sort_values("Date")
training_data.to_feather(TRAINING_DATA_FILE)
logging.info(f"{TRAINING_DATA_FILE} saved!")
