### Time series of the patient:donor relative-abundance ratio for specific genera
This analysis uses the "high-confidence" genus-level tables.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import skbio
from matplotlib.backends.backend_pdf import PdfPages 
from matplotlib import pylab
import os
from collections import OrderedDict

In [2]:
def unique(seq, idfun=None):
    """Return the items of *seq* with duplicates removed, preserving order.

    Parameters
    ----------
    seq : iterable
        Items to de-duplicate.
    idfun : callable, optional
        Maps an item to the (hashable) marker used to decide whether two
        items are duplicates. Defaults to the identity function.

    Returns
    -------
    list
        The first occurrence of every distinct marker, in input order.
        Floating-point NaN items are dropped.
    """
    if idfun is None:
        def idfun(x): return x
    seen = set()
    result = []
    for item in seq:
        # NaN never compares equal to anything (including itself), so the
        # former `item != np.nan` test was always True and filtered nothing.
        if isinstance(item, (float, np.floating)) and np.isnan(item):
            continue
        marker = idfun(item)
        if marker in seen:
            continue
        seen.add(marker)
        result.append(item)
    return result

In [6]:
def _nonzero_positions(row):
    """Return the set of integer positions at which *row* is non-zero."""
    # Series.nonzero() was removed in pandas 1.0, so go through the ndarray.
    return set(row.values.nonzero()[0].tolist())


def _pre_fmt_otu_sources(otu_table_counts, metadata, preparations):
    """Classify OTU column positions for every "Pre-FMT" sample.

    Returns a dict keyed by the sample's ``person_id``. Each value is a list
    with one ``{column position: label}`` dict per entry of *preparations*,
    where the label is "donor" (non-zero in the donor sample only), "both"
    (non-zero in donor and patient) or "patient" (non-zero in the patient
    only). Positions that are zero in both samples are absent.
    """
    sources = {}
    for sample_id in otu_table_counts.index:
        if metadata.loc[sample_id, "timepoint"] != "Pre-FMT":
            continue
        patient_otus = _nonzero_positions(otu_table_counts.loc[sample_id])
        per_preparation = []
        for prep in preparations:
            donor_id = metadata.loc[sample_id, "{}_donor_fmt_id".format(prep)]
            donor_otus = _nonzero_positions(otu_table_counts.loc[donor_id])
            labels = {}
            labels.update((pos, "donor") for pos in donor_otus - patient_otus)
            labels.update((pos, "both") for pos in donor_otus & patient_otus)
            labels.update((pos, "patient") for pos in patient_otus - donor_otus)
            per_preparation.append(labels)
        sources[metadata.loc[sample_id, "person_id"]] = per_preparation
    return sources


def engraftment_capability_all_otus_all_timepoints(otu_table_counts, otu_table_abundance, metadata):
    """Build a long-format table of OTU counts/abundances versus the donor.

    For every (timepoint, patient) pair that has a sample in *metadata*, one
    row is produced per OTU column and per donor preparation ("direct" and
    "pma").

    Parameters
    ----------
    otu_table_counts : pandas.DataFrame
        Counts, indexed by sample id with one column per OTU.
    otu_table_abundance : pandas.DataFrame
        Abundances with the same sample ids and OTU labels.
    metadata : pandas.DataFrame
        Indexed by sample id. The columns read here are "timepoint",
        "person_id", "patient_num", "direct_donor_fmt_id",
        "pma_donor_fmt_id", "anaerobic_fmt" and "donor_fmt_num".

    Returns
    -------
    pandas.DataFrame
        Columns: timepoint, patient, otu, otu_source, count, abundance,
        aerobic, donor, donor_count, donor_abundance, donor_direct.
        ``otu_source`` is "donor", "both" or "patient" according to where the
        OTU was non-zero at the patient's "Pre-FMT" sample, and
        "environmental" when it was zero in both patient and donor.

    Raises
    ------
    KeyError
        If a patient with a sample at some timepoint has no "Pre-FMT" sample
        in *otu_table_counts*.

    Notes
    -----
    (patient, timepoint) pairs without a sample are reported on stdout and
    skipped. Previously execution fell through and silently reused the
    previous iteration's sample under the wrong patient/timepoint label.
    """
    columns = ["timepoint", "patient", "otu", "otu_source", "count", "abundance",
               "aerobic", "donor", "donor_count", "donor_abundance", "donor_direct"]
    preparations = ("direct", "pma")
    sources = _pre_fmt_otu_sources(otu_table_counts, metadata, preparations)

    # NOTE(review): the [:-1] drops the last unique value of each column,
    # presumably a trailing NaN contributed by donor rows -- confirm against
    # the metadata file before relying on it.
    timepoints = list(metadata["timepoint"].unique())[:-1]
    # NOTE(review): patients are enumerated from "patient_num" but matched
    # against "person_id" below -- verify the two columns hold the same ids.
    patients = list(metadata["patient_num"].unique())[:-1]
    otus = otu_table_counts.columns

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for timepoint in timepoints:
        for patient in patients:
            try:
                sample_id = metadata.index[(metadata["person_id"] == patient) & (metadata["timepoint"] == timepoint)][0]
            except IndexError as e:
                print(e)
                print(patient, timepoint)
                continue  # no sample for this pair; nothing to record

            # The output column is named "aerobic" but is read from "anaerobic_fmt".
            aerobic = metadata.loc[sample_id, "anaerobic_fmt"]
            donor = int(metadata.loc[sample_id, "donor_fmt_num"])

            # One row lookup per sample instead of one .loc call per cell.
            sample_counts = otu_table_counts.loc[sample_id, otus].values
            sample_abundances = otu_table_abundance.loc[sample_id, otus].values
            donor_rows = []
            for prep in preparations:
                donor_id = metadata.loc[sample_id, "{}_donor_fmt_id".format(prep)]
                donor_rows.append((otu_table_counts.loc[donor_id, otus].values,
                                   otu_table_abundance.loc[donor_id, otus].values))
            patient_sources = sources[patient]

            for otu_index, otu in enumerate(otus):
                for prep_index, prep in enumerate(preparations):
                    donor_counts, donor_abundances = donor_rows[prep_index]
                    rows.append({
                        "timepoint": timepoint,
                        "patient": patient,
                        "otu": otu,
                        "otu_source": patient_sources[prep_index].get(otu_index, "environmental"),
                        "count": sample_counts[otu_index],
                        "abundance": sample_abundances[otu_index],
                        "aerobic": aerobic,
                        "donor": donor,
                        "donor_count": donor_counts[otu_index],
                        "donor_abundance": donor_abundances[otu_index],
                        "donor_direct": prep,
                    })

    return pd.DataFrame(rows, columns=columns)

In [7]:
# Load the genus-level relative-frequency and count tables. Each CSV is
# transposed so that its columns become the rows of the frame, and the rows
# labelled "ndc432" and "ndc442" are then removed.
genus_percent = (
    pd.read_csv(
        "abundances_by_tax_level_relfreq_high_confidence/Genus_otu_table.csv",
        index_col=0,
    )
    .T
    .drop(index=["ndc432", "ndc442"])
)
genus_counts = (
    pd.read_csv(
        "abundances_by_tax_level_counts_high_confidence/Genus_otu_table.csv",
        index_col=0,
    )
    .T
    .drop(index=["ndc432", "ndc442"])
)
# Metadata table; its first CSV column is used as the index.
metadata = pd.read_csv("sample_metadata.csv", index_col=0)

In [8]:
# Build the long-format engraftment table from the genus-level tables.
genus_engraftment = engraftment_capability_all_otus_all_timepoints(
    otu_table_counts=genus_counts,
    otu_table_abundance=genus_percent,
    metadata=metadata,
)

index 0 is out of bounds for axis 0 with size 0
141.0 10 days
index 0 is out of bounds for axis 0 with size 0
141.0 3 days
index 0 is out of bounds for axis 0 with size 0
141.0 8 weeks
index 0 is out of bounds for axis 0 with size 0
159.0 8 weeks


In [11]:
# Show the "otu" column of the engraftment table as a NumPy array.
genus_engraftment["otu"].values

array(['Actinomyces', 'Actinomyces', 'Akkermansia', ..., 'Tyzzerella',
       'Veillonella', 'Veillonella'], dtype=object)