In [18]:
from sadie.reference import Reference, References
from sadie.airr import Airr
import pandas as pd

In [2]:
# Load the combined, processed sequence table (not yet personalized).
all_combined = pd.read_feather(
    "../data/all_processed_combined.feather",
)

In [3]:
# Per-animal genotype sheet; the first spreadsheet column is used as the index.
genotype = pd.read_excel(
    "../data/genotype.xlsx",
    index_col=0,
)

In [4]:
# Baseline: every species reference that SADIE loads from its built-in yaml file.
baseline_references: References = References().from_yaml()

# Keep only the macaque rows of the reference table.
baseline_macaque: pd.DataFrame = baseline_references.get_dataframe().query("name=='macaque'")

# Drop every IGHD3-41 allele from the baseline; each haplotype's genotyped
# alleles are added on top of this stripped baseline in the loop below.
baseline_no_d3_41 = baseline_macaque[baseline_macaque["gene"].str.split("*").str.get(0) != "IGHD3-41"].copy()

references: References = References()
haplotype_frames = []

# Build one named reference per distinct (allele_1, allele_2) genotype.
for allele_group, allele_group_df in genotype.groupby(["allele_1", "allele_2"]):

    # The haplotype name joins the part after "*" of each allele, e.g. 02_02.
    name = "_".join(allele.split("*")[-1] for allele in allele_group)

    # De-duplicate homozygous pairs while preserving allele_1/allele_2 order so
    # genes are added in the same order on every run (iteration order of a set
    # of strings is not stable across interpreter sessions).
    need_alleles = list(dict.fromkeys(allele_group))

    # Fresh copy of the baseline (without IGHD3-41) for this haplotype.
    reference = Reference().from_dataframe(baseline_no_d3_41.drop("name", axis=1))

    # Add each required allele one at a time.
    for needed_allele in need_alleles:
        reference.add_gene({"species": "macaque", "gene": needed_allele, "source": "custom"})
    print(f"adding_references:{name}")

    # Register the reference under the haplotype name.
    references.add_reference(name=name, reference=reference)

    # assign() returns a new frame rather than writing into the groupby slice,
    # which avoids pandas' SettingWithCopyWarning.
    haplotype_frames.append(allele_group_df.assign(haplotype=name))

# Genotype rows tagged with the haplotype (reference name) they map to.
haplotype_df = pd.concat(haplotype_frames).reset_index(drop=True)

In [None]:
# Personalize: re-run SADIE on each haplotype's sequences using the custom
# `references` object, and write the heavy-chain results back by sequence_id.
all_combined_index = all_combined.set_index('sequence_id').copy()
for haplo, haplo_df in haplotype_df.groupby("haplotype"):
    # Animals that carry this haplotype.
    animal_ids = haplo_df["animal_id"].to_list()

    # Rows of the combined table that belong to those animals.
    sub_df = all_combined[all_combined["NHP"].isin(animal_ids)]
    if sub_df.empty:
        # No sequences for this haplotype, so there is nothing to re-annotate.
        continue

    # Run Airr on the heavy-chain sequences, selecting this haplotype's reference by name.
    airr_api = Airr(haplo, adaptable=True, references=references)
    heavy_airr_df = airr_api.run_dataframe(sub_df, "sequence_id", "sequence_heavy")

    # Suffix the result columns with "_heavy" before writing them back.
    h_ = heavy_airr_df.set_index('sequence_id')
    h_.columns = [i + '_heavy' for i in h_.columns]

    # NOTE(review): DataFrame.update aligns on the index, only touches columns
    # that already exist in all_combined_index, and skips NA values in h_ -- so
    # a field that is NA in the personalized run keeps its previous value.
    # Confirm that this is the intended merge behaviour.
    all_combined_index.update(h_)

In [64]:
# Sanity check: number of rows per heavy-chain reference name.
reference_counts = all_combined_index["reference_name_heavy"].value_counts()
reference_counts

macaque              19019
01_01                16935
01_01_S8240          11230
01_S8240_01_S8240     5199
Name: reference_name_heavy, dtype: int64

In [67]:
# Save the personalized table as feather; reset_index() moves the index back
# into a regular column before writing.
personalized_out = all_combined_index.reset_index()
personalized_out.to_feather("../data/all_processed_combined_personalized.feather")

In [68]:
# Save the personalized table as CSV. After reset_index() the index is only a
# positional RangeIndex, so index=False keeps it from being written as an
# extra unnamed first column (the feather export has no such column either).
all_combined_index.reset_index().to_csv(
    "../data/all_processed_combined_personalized.csv",
    index=False,
)