In [1]:
import pandas as pd
import numpy as np
import plotly

In [2]:
# Read the two input tables: sample metadata and the metagenomics profile table.
# NOTE(review): the variable is called `metadata_unfiltered` but the file is
# `metadata_filtered.csv` — confirm which of the two names is the intended one.
metadata_unfiltered, metagenomics = map(
    pd.read_csv,
    ("../data/metadata_filtered.csv", "../data/metagenomics.csv"),
)

In [3]:
# Keep only the metadata rows whose data_type is "metagenomics".
metadata = metadata_unfiltered.loc[metadata_unfiltered["data_type"].eq("metagenomics")]

In [4]:
# Discard every row that has no value in the "species" column.
species_df = metagenomics.dropna(subset=["species"])

In [5]:
# Column mask over species_df: the first 9 columns start out deselected, and
# every later (sample) column is kept only if its values sum to ~100.
sum_100_mask = np.zeros(species_df.shape[1], dtype=bool)
sum_100_mask[9:] = np.isclose(species_df.iloc[:, 9:].sum(axis=0), 100)
# Re-enable column 7 so exactly one of the leading columns survives.
# NOTE(review): presumably column 7 is the "species" column — confirm.
sum_100_mask[7] = True
clean_species_df = species_df.iloc[:, sum_100_mask]


In [6]:
# Flip the table so each row is one of the former columns, then promote the
# first row to be the column header and remove that row from the body.
cleaner_species_df = clean_species_df.T
cleaner_species_df = (
    cleaner_species_df
    .set_axis(cleaner_species_df.iloc[0, :], axis="columns")
    .drop(cleaner_species_df.index[0])
)

In [7]:
# Match every profiled sample to its row position in `metadata`.
# Sample names are the row labels of cleaner_species_df with everything from
# "_profile" onwards stripped off.
sample_list = np.array([x.split("_profile")[0] for x in cleaner_species_df.index])

# Build the External.ID -> row-position lookup once instead of re-creating and
# scanning a list for every sample. `setdefault` keeps the FIRST position of a
# repeated ID, which is what `list.index` returned.
external_id_position = {}
for position, external_id in enumerate(metadata["External.ID"]):
    external_id_position.setdefault(external_id, position)

# Fail loudly (still a ValueError, as before) but say which samples are missing.
samples_missing_in_meta = [x for x in sample_list if x not in external_id_position]
if samples_missing_in_meta:
    raise ValueError(
        f"{len(samples_missing_in_meta)} sample(s) have no matching External.ID "
        f"in metadata, e.g. {samples_missing_in_meta[:5]}"
    )

sample_in_meta_idx = np.array([external_id_position[x] for x in sample_list])

In [8]:
# Place the matched metadata rows and the species table side by side. Both
# halves are re-indexed from 0 so they align row by row; the "index" columns
# produced by reset_index() and the "Unnamed: 0" column are then discarded.
combined_df = pd.concat(
    [
        metadata.iloc[sample_in_meta_idx, :].reset_index(),
        cleaner_species_df.reset_index(),
    ],
    axis=1,
).drop(columns=["index", "Unnamed: 0"])

In [9]:
# UMAP input matrix: every column of combined_df from position 15 onwards.
# NOTE(review): 15 is assumed to be the number of leading metadata columns — confirm.
features = combined_df.iloc[:, 15:].to_numpy(copy=True)

In [10]:
# Embed the feature matrix with UMAP, once in 2 and once in 3 dimensions.
from umap import UMAP

# Both reducers use random initialisation and the same fixed seed
# (random_state=0), differing only in the number of output components.
umap_2d, umap_3d = (
    UMAP(n_components=n_dims, init='random', random_state=0)
    for n_dims in (2, 3)
)

proj_2d = umap_2d.fit_transform(features)
proj_3d = umap_3d.fit_transform(features)

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [11]:
# Append the 2D and 3D UMAP coordinates to the combined table as five extra
# columns (two from proj_2d followed by three from proj_3d).
umap_df = pd.concat(
    [
        combined_df,
        pd.DataFrame(
            np.hstack([proj_2d[:, :2], proj_3d[:, :3]]),
            columns=["umap_2d_x", "umap_2d_y", "umap_3d_x", "umap_3d_y", "umap_3d_z"],
        ),
    ],
    axis=1,
)

In [12]:
# umap_df

In [13]:
# Column positions of the species abundances inside umap_df.
# They start at position 15 (the same offset used to slice the UMAP features
# out of combined_df) and run to the last column of combined_df; everything
# after that in umap_df was appended later (UMAP coordinates etc.).
# Deriving the upper bound from combined_df instead of hard-coding it keeps the
# selection correct when the number of species in the input changes.
microbe_column_indices = np.arange(15, combined_df.shape[1])
species_names = umap_df.columns[microbe_column_indices]

In [14]:
# One boolean row mask per distinct diagnosis value, keyed by that value
# (np.unique yields the values in sorted order).
diagMasks = dict()
for diag in np.unique(umap_df['diagnosis']):
    diagMasks[diag] = umap_df['diagnosis'].eq(diag)

In [15]:
import plotly.graph_objects as go
import plotly.express as px

# 2D UMAP scatter with one trace per diagnosis, so each diagnosis gets its own
# colour and legend entry. Hovering a point shows its sample and participant IDs.
fig_2d = go.Figure()
# The loop variable names are kept as-is: later cells read `thisDiag` after the loop.
for thisDiag, thisMask in diagMasks.items():
    diag_rows = umap_df[thisMask]
    fig_2d.add_scatter(
        x=diag_rows["umap_2d_x"],
        y=diag_rows["umap_2d_y"],
        mode='markers',
        name=thisDiag,
        customdata=diag_rows[['External.ID', 'Participant.ID']],
        hovertemplate=(
            "External.ID: %{customdata[0]}<br>"
            "Participant.ID: %{customdata[1]}<extra></extra>"
        ),
    )

fig_2d.update_layout(
    title='Microbiome UMAP: Disease vs. Healthy',
    legend_title='Diagnosis',
    xaxis_title='UMAP 1',
    yaxis_title='UMAP 2',
    width=1000,
    height=1000,
)
fig_2d.show()


In [16]:
# 3D UMAP scatter with one trace per diagnosis; hovering a point shows its
# sample and participant IDs.
fig_3d = go.Figure()
# The loop variable names are kept as-is: later cells read `thisDiag` after the loop.
for thisDiag, thisMask in diagMasks.items():
    diag_rows = umap_df[thisMask]
    fig_3d.add_scatter3d(
        x=diag_rows["umap_3d_x"],
        y=diag_rows["umap_3d_y"],
        z=diag_rows["umap_3d_z"],
        mode='markers',
        name=thisDiag,
        customdata=diag_rows[['External.ID', 'Participant.ID']],
        hovertemplate=(
            "External.ID: %{customdata[0]}<br>"
            "Participant.ID: %{customdata[1]}<extra></extra>"
        ),
    )

fig_3d.update_layout(
    title='Microbiome UMAP: Disease vs. Healthy',
    legend_title='Diagnosis',
    scene=dict(
        xaxis_title='UMAP 1',
        yaxis_title='UMAP 2',
        zaxis_title='UMAP 3',
    ),
    width=1000,
    height=1000,
)
fig_3d.show()

In [33]:
# Gini-Simpson diversity per sample: 1 - sum_i p_i^2, where p_i is the relative
# abundance of species i. The species columns hold percentages (samples were
# kept only if they sum to ~100), hence the division by 100.
# The explicit float cast guards against the columns being stored as objects.
species_fractions = umap_df.iloc[:, microbe_column_indices].astype(float) / 100
umap_df['gini_simpson'] = 1 - species_fractions.pow(2).sum(axis=1)

# 3D UMAP of all samples in a single trace, coloured by the diversity index.
fig_3d = go.Figure()
fig_3d.add_trace(go.Scatter3d(
    x=umap_df["umap_3d_x"],
    y=umap_df["umap_3d_y"],
    z=umap_df["umap_3d_z"],
    mode='markers',
    # Fixed label: the trace holds every sample, so it must not reuse the
    # `thisDiag` variable left over from the per-diagnosis loops.
    name='Samples',
    marker=dict(color=umap_df["gini_simpson"],
                colorscale = 'Viridis',
                opacity=0.8,
                # Show the colour bar so the continuous colouring can be read.
                showscale=True,
                colorbar=dict(title=dict(text='Gini-Simpson index'))),
    customdata=umap_df[['External.ID', 'Participant.ID']],
    hovertemplate=(
        "External.ID: %{customdata[0]}<br>"
        "Participant.ID: %{customdata[1]}<extra></extra>"
        )
    )
)

fig_3d.update_layout(
        # Title describes what this figure actually encodes (diversity, not diagnosis).
        title='Microbiome UMAP: Gini-Simpson diversity',
        scene = dict(
        xaxis_title='UMAP 1',
        yaxis_title='UMAP 2',
        zaxis_title='UMAP 3'),
        width = 700,
        height = 700,
)
fig_3d.show()

In [32]:
# 3D UMAP of all samples in a single trace, coloured by the "Age.at.diagnosis"
# metadata column.
fig_3d = go.Figure()
fig_3d.add_trace(go.Scatter3d(
    x=umap_df["umap_3d_x"],
    y=umap_df["umap_3d_y"],
    z=umap_df["umap_3d_z"],
    mode='markers',
    # Fixed label: the trace holds every sample, so it must not reuse the
    # `thisDiag` variable left over from the per-diagnosis loops.
    name='Samples',
    marker=dict(color=umap_df["Age.at.diagnosis"],
                colorscale = 'Viridis',
                opacity=0.8,
                # Show the colour bar so the continuous colouring can be read.
                showscale=True,
                colorbar=dict(title=dict(text='Age at diagnosis'))),
    customdata=umap_df[['External.ID', 'Participant.ID']],
    hovertemplate=(
        "External.ID: %{customdata[0]}<br>"
        "Participant.ID: %{customdata[1]}<extra></extra>"
        )
    )
)

fig_3d.update_layout(
        # Title describes what this figure actually encodes (age, not diagnosis).
        title='Microbiome UMAP: Age at diagnosis',
        scene = dict(
        xaxis_title='UMAP 1',
        yaxis_title='UMAP 2',
        zaxis_title='UMAP 3'),
        width = 1000,
        height = 1000,
)
fig_3d.show()

In [31]:
# Display the column index of umap_df to check its layout: metadata columns
# first, then the species columns, then the UMAP coordinates and gini_simpson.
umap_df.columns

Index(['External.ID', 'Participant.ID', 'site_name', 'site_sub_coll',
       'data_type', 'week_num', 'reads_raw', 'reads_filtered',
       'Age.at.diagnosis', 'biopsy_location',
       ...
       's__Fretibacterium_fastidiosum', 's__Akkermansia_muciniphila',
       's__Saccharomyces_cerevisiae', 's__Blastocystis_sp_subtype_1',
       'umap_2d_x', 'umap_2d_y', 'umap_3d_x', 'umap_3d_y', 'umap_3d_z',
       'gini_simpson'],
      dtype='object', length=599)

In [17]:
# Unconditionally raises AssertionError.
# NOTE(review): presumably a deliberate stop so "Run All" halts here — confirm,
# and consider removing it once the notebook is finalised.
assert False

AssertionError: 