# Microbial Biogeography of Public Restroom Surfaces

In [14]:
import pandas as pd
import itertools
import plotly.graph_objects as go
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px

In [15]:
# Create the Dash application object; its layout and callback are attached in a later cell.
app = dash.Dash(__name__)

In [16]:
# Load the tab-separated 16S table. According to the preview below, it holds an
# "#OTU ID" column, one count column per sample and a "ConsensusLineage" column.
data = pd.read_csv("data/VDB_16S_dataset.txt", sep="\t")
data.head()

Unnamed: 0,#OTU ID,EKCM2.489495,EKBM8.489473,EKCF4.489498,PTBM9.489505,EKBF10.489552,PTAM4.489517,EKCM1.489478,EKAM4.489564,EKCM7.489464,...,PTCF8.489486,EKCM9.489514,PTBF4.489483,PTBF1.489562,B6.489449,B5.489455,B1.489537,B3.489528,B2.489526,ConsensusLineage
0,469478,3.0,5.0,7.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Bacteria; p__Firmicutes; c__Clostridia; o__...
1,208196,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Bacteria; p__Proteobacteria; c__Alphaproteo...
2,378462,0.0,0.0,0.0,0.0,0.0,2.0,2.0,8.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Bacteria; p__Firmicutes; c__Bacilli; o__Bac...
3,265971,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Bacteria; p__Actinobacteria; c__Actinobacte...
4,570812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,k__Bacteria; p__Proteobacteria; c__Alphaproteo...


In [17]:
# Load the tab-separated sample metadata. According to the preview below, each row
# describes one SampleID with its Gender, Floor, Building and Surface.
metadata = pd.read_csv("data/VDB_16S_metadata.txt", sep="\t")
metadata.head()

Unnamed: 0,SampleID,Gender,Floor,Building,Surface
0,EKCM2.489495,Male,C,Ekeley,Door out
1,EKBM8.489473,Male,B,Ekeley,Faucet handles
2,EKCF4.489498,Female,C,Ekeley,Stall out
3,PTBM9.489505,Male,B,Porter,Soap dispenser
4,EKBF10.489552,Female,B,Ekeley,Sink floor


## 1. Process "ConsensusLineage" column

In [18]:
def process_lineage(lineage, index):
    """
    Return the taxon name stored at position 'index' of a 'ConsensusLineage' value.

    A lineage is a ";"-separated list of levels, each written "<prefix>__<name>"
    (for instance "k__Bacteria; p__Firmicutes; c__Clostridia").

    Parameters
    ----------
    lineage : str
        One value of the 'ConsensusLineage' column.
    index : int
        Position of the wanted taxa level inside the lineage (0 is the first level).

    Returns
    -------
    str
        The taxon name with surrounding whitespace removed, or the string "NaN" when
        no usable name exists: the lineage is not a string, it has no level at
        'index', the level has no "__" separator, or the name is empty.
    """
    missing = "NaN"  # placeholder kept as a string, exactly as callers already expect

    # A missing cell is read by pandas as a float NaN, which has no .split().
    if not isinstance(lineage, str):
        return missing

    levels = lineage.split(";")
    try:
        level = levels[index]
    except IndexError:
        # The lineage is shorter than the requested taxa level.
        return missing

    parts = level.split("__")
    if len(parts) < 2:
        # No "<prefix>__<name>" structure (e.g. a bare "Unassigned").
        return missing

    name = parts[1].strip()
    return name if name != "" else missing

In [19]:
# Lineage strings split on ";" (one list of level tokens per row)
ConsensusLineage = data["ConsensusLineage"].str.split(";")
# Names of the columns to create; the position in this list is the position of the
# level inside a lineage string
new_columns = ["kingdom", "phyla", "class", "order", "family", "genus", "species"]

# Add one column per taxa level, filled by process_lineage for that position
for level_idx, level_name in enumerate(new_columns):
    data[level_name] = data["ConsensusLineage"].map(
        lambda lineage, pos=level_idx: process_lineage(lineage, pos)
    )

## 2. Transformation of the dataset

In [20]:
# Collect the distinct surfaces listed in the metadata
surfaces = metadata["Surface"].unique().tolist()

# Collect the sample labels of every surface
# key: surface, value: list of SampleID values recorded for that surface
samples = {
    surface_name: metadata.loc[metadata["Surface"] == surface_name, "SampleID"].tolist()
    for surface_name in surfaces
}

In [21]:
# Collect the counts of every surface: one new column per surface holding the
# element-wise sum of the columns of all samples taken on that surface
for surface_name in surfaces:
    data[surface_name] = sum(data[sample_id] for sample_id in samples[surface_name])

## 3. Bar plot of taxonomic composition by surface

In [22]:
def taxa_lvl2table(data, taxa_lvl, surface_columns=None, min_count=1500):
    """
    Build a long-format table of taxon frequencies per surface.

    The counts of 'data' are summed per taxon of the requested level for every
    surface column, reshaped to one row per (surface, taxon) pair, filtered to the
    pairs whose summed count is above 'min_count', and finally rescaled so that the
    remaining values of each surface add up to 1
    (count_value / sum_of_kept_counts_for_that_surface).

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding a column named 'taxa_lvl' and one numeric count column per
        surface.
    taxa_lvl : str
        Name of the taxa-level column to aggregate on (converted with str()).
    surface_columns : list of str, optional
        Names of the surface count columns. When omitted, the notebook-level
        'surfaces' list is used, as the function always did.
    min_count : number, optional
        Pairs with a summed count lower than or equal to this value are dropped
        (default 1500, the threshold previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns "surface", <taxa_lvl> and "count" (holding frequencies), ordered by
        surface then by first appearance of the taxon in 'data'. The index keeps the
        positions the rows had before filtering.
    """
    taxa_lvl = str(taxa_lvl)
    if surface_columns is None:
        surface_columns = surfaces  # list defined in an earlier notebook cell
    surface_columns = list(surface_columns)

    # One pass over the table: summed count of every taxon for every surface.
    # sort=False keeps taxa in order of first appearance. This replaces writing into
    # the rows produced by iterrows(), which pandas may hand out as copies, so the
    # writes are not guaranteed to reach the dataframe.
    totals = data.groupby(taxa_lvl, sort=False)[surface_columns].sum()

    # Long format, surface-major: every surface followed by all of its taxa.
    new_data = totals.reset_index().melt(
        id_vars=[taxa_lvl],
        value_vars=surface_columns,
        var_name="surface",
        value_name="count",
    )[["surface", taxa_lvl, "count"]]

    # Drop weakly represented taxa.
    new_data = new_data[new_data["count"] > min_count].copy()

    # Convert the kept counts to frequencies within each surface.
    kept_total_per_row = new_data.groupby("surface")["count"].transform("sum")
    new_data["count"] = new_data["count"] / kept_total_per_row

    return new_data

In [28]:
# Page layout: a caption, a dropdown listing the taxa levels (preselected on
# 'genus') and the graph that the callback fills in.
app.layout = html.Div(
    [
        html.P("Taxa level"),
        dcc.Dropdown(
            id="taxa_lvl",
            options=[dict(value=level, label=level) for level in new_columns],
            value="genus",
        ),
        dcc.Graph(id="graph"),
    ]
)

@app.callback(
    Output("graph", "figure"), 
    [Input("taxa_lvl", "value")])
def change_taxa_lvl(taxa_lvl):
    """
    Redraw the bar chart whenever another taxa level is picked in the dropdown.

    Parameters
    ----------
    taxa_lvl : str or None
        Value of the 'taxa_lvl' dropdown; None when the user clears the selection.

    Returns
    -------
    plotly figure
        Bar chart with one bar per surface, coloured by taxon of the chosen level.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When no taxa level is selected, so the current figure is left untouched
        instead of failing on a column lookup for a non-existent level.
    """
    from dash.exceptions import PreventUpdate

    # A cleared dropdown sends None; there is nothing to plot in that case.
    if not taxa_lvl:
        raise PreventUpdate

    new_data = taxa_lvl2table(data, taxa_lvl)
    
    # The "count" column returned by taxa_lvl2table holds per-surface frequencies.
    fig = px.bar(new_data, x="surface", y="count", color=taxa_lvl, color_discrete_sequence=px.colors.qualitative.Alphabet, title="Taxonomic composition by surface")
    return fig

# Start the Dash server on port 5000 with debug mode off (see the server log below).
app.run_server(debug=False, port=5000)

Dash is running on http://127.0.0.1:5000/

Dash is running on http://127.0.0.1:5000/

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [30/Nov/2021 11:24:09] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2021 11:24:09] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2021 11:24:09] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2021 11:24:09] "GET /_favicon.ico?v=2.0.0 HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2021 11:24:09] "GET /_dash-component-suites/dash/dcc/async-dropdown.js HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2021 11:24:09] "GET /_dash-component-suites/dash/dcc/async-plotlyjs.js HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2021 11:24:09] "GET /_dash-component-suites/dash/dcc/async-graph.js HTTP/1.1" 200 -


In [44]:
# Static (non-Dash) variant of the plot, kept commented out.
# NOTE(review): the empty string passed as taxa level is a placeholder to fill in,
# and color="genus" must be changed to the same level.
# new_data = taxa_lvl2table(data, "") # choose the taxonomic level among: ["kingdom", "phyla", "class", "order", "family", "genus", "species"]
# fig = px.bar(new_data, x="surface", y="count", color="genus", title="Taxonomic composition by surface")
# fig.show()