Gut microbiome of children with autism vs normally developing kids

In [15]:
import pandas as pd
import yaml

In [16]:
#opening the config.yaml
with open("config.yaml",'r') as conf:
    config = yaml.safe_load(conf)
    abundance_df = pd.read_csv(config['ASD_abundance'])

abundance_df.head()

Unnamed: 0,Taxonomy,A3,A5,A6,A9,A31,A51,A52,A53,A54,...,B120,B127,B132,B141,B142,B143,B152,B156,B158,B164
0,g__Faecalibacterium;s__Faecalibacterium prausn...,4988,5060,2905,5745,4822,3889,4646,6337,5064,...,4471,5868,6561,4910,4492,2812,5303,4205,3430,4563
1,g__Hungatella;s__Hungatella hathewayi,5803,5612,4109,1432,2652,4175,3891,894,4903,...,2126,4429,2598,4222,4925,5753,1261,1822,2478,4868
2,g__Clostridium;s__uncultured Clostridium sp.,3793,2795,1355,5558,5383,3505,5541,4429,4121,...,4085,6041,6188,3960,4403,2841,2746,3808,3856,3211
3,g__Butyricimonas;s__Butyricimonas virosa,64,1385,725,1553,40,53,33,175,58,...,2065,21,27,55,35,8,884,13,3,218
4,g__Alistipes;s__Alistipes indistinctus,15,20,723,620,3261,43,83,37,43,...,90,22,30,1027,2641,4,1587,2223,6,1473


In [17]:
#renaming the taxonomy label to ID and transposing the dataframe
abundance_df = abundance_df.rename(columns={"Taxonomy":"ID"})
abundance_df["ID"] = abundance_df["ID"].astype(str)
abundance_df = abundance_df.set_index('ID')
abundance_df = abundance_df.transpose()
#check for empty values
print(f'There are {sum(abundance_df.isnull().sum())} empty cells')
#check transposed, datatypes
abundance_df.info()



There are 0 empty cells
<class 'pandas.core.frame.DataFrame'>
Index: 60 entries, A3 to B164
Columns: 5619 entries, g__Faecalibacterium;s__Faecalibacterium prausnitzii to g__Unclassified;s__Freshwater phage uvFW-CGR-AMD-COM-C203
dtypes: int64(5619)
memory usage: 2.6+ MB


In [18]:
import re
# Regex the column names to 'Genus species' format, the use of lambda makes it look very clean:
#https://www.w3schools.com/python/python_lambda.asp
abundance_df = abundance_df.rename(columns=lambda x: re.sub('(g__|s__)',' ',x))
# And now without the space in front of the species
abundance_df = abundance_df.rename(columns=lambda x: re.sub('^ ','',x))
# Insert "autism column" with either yes or no based on A in front of the ID
abundance_df["Autism"] = ["yes" if 'A' in i else 'no' for i in abundance_df.index]

abundance_df.head()

ID,Faecalibacterium; Faecalibacterium prausnitzii,Hungatella; Hungatella hathewayi,Clostridium; uncultured Clostridium sp.,Butyricimonas; Butyricimonas virosa,Alistipes; Alistipes indistinctus,Unclassified; Firmicutes bacterium CAG:176,Clostridium; Clostridium sp. CAG:7,Unclassified; Firmicutes bacterium CAG:882,Lachnoclostridium; [Clostridium] asparagiforme,Butyricicoccus; uncultured Butyricicoccus sp.,...,Unclassified; Enterococcus phage EFDG1,Unclassified; Podovirus Lau218,Sap6virus; Enterococcus phage VD13,Unclassified; Bacillus phage vB_BanS-Tsamsa,Unclassified; Gordonia phage GTE2,Alphabaculovirus; Hyphantria cunea nucleopolyhedrovirus,Potyvirus; Bean common mosaic virus,Potyvirus; Telosma mosaic virus,Unclassified; Freshwater phage uvFW-CGR-AMD-COM-C203,Autism
A3,4988,5803,3793,64,15,100,2119,12,453,1266,...,0,0,0,0,0,0,0,0,0,yes
A5,5060,5612,2795,1385,20,29,1230,24,691,1682,...,0,0,0,0,0,0,0,0,0,yes
A6,2905,4109,1355,725,723,11,1322,1,2278,43,...,0,0,0,0,0,0,0,0,0,yes
A9,5745,1432,5558,1553,620,1320,2675,44,107,1726,...,0,0,0,0,0,0,0,0,0,yes
A31,4822,2652,5383,40,3261,51,1470,26,342,1804,...,0,0,0,0,0,0,0,0,0,yes


In [19]:
abundance_df[abundance_df["Autism"]=='yes'].describe()


ID,Faecalibacterium; Faecalibacterium prausnitzii,Hungatella; Hungatella hathewayi,Clostridium; uncultured Clostridium sp.,Butyricimonas; Butyricimonas virosa,Alistipes; Alistipes indistinctus,Unclassified; Firmicutes bacterium CAG:176,Clostridium; Clostridium sp. CAG:7,Unclassified; Firmicutes bacterium CAG:882,Lachnoclostridium; [Clostridium] asparagiforme,Butyricicoccus; uncultured Butyricicoccus sp.,...,Unclassified; Clostridium phage c-st,Unclassified; Enterococcus phage EFDG1,Unclassified; Podovirus Lau218,Sap6virus; Enterococcus phage VD13,Unclassified; Bacillus phage vB_BanS-Tsamsa,Unclassified; Gordonia phage GTE2,Alphabaculovirus; Hyphantria cunea nucleopolyhedrovirus,Potyvirus; Bean common mosaic virus,Potyvirus; Telosma mosaic virus,Unclassified; Freshwater phage uvFW-CGR-AMD-COM-C203
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,4942.8,3386.533333,3708.966667,562.366667,897.533333,398.433333,1611.266667,124.366667,1007.066667,1353.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,1325.099772,1632.505832,1266.748226,788.912236,1159.686246,716.790918,667.734716,532.410879,787.139997,635.24929,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2905.0,894.0,1014.0,7.0,3.0,11.0,144.0,1.0,20.0,43.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3813.5,1935.25,3021.0,49.25,43.0,45.75,1298.75,15.0,426.75,1037.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5062.0,3122.5,3825.5,65.5,75.5,54.5,1725.0,18.5,719.0,1291.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5741.75,4724.0,4334.0,1125.75,1875.75,150.75,2027.25,29.75,1546.25,1755.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8504.0,6849.0,6372.0,3285.0,3261.0,3034.0,2984.0,2940.0,2758.0,2691.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
#drop columns with only zeros
#tried some functions myself but found this more elegant method on stackoverflow:
#https://stackoverflow.com/questions/21164910/how-do-i-delete-a-column-that-contains-only-zeros-in-pandas

abundance_df = abundance_df.loc[:, (abundance_df != 0).any(axis=0)]
abundance_df.head()

ID,Faecalibacterium; Faecalibacterium prausnitzii,Hungatella; Hungatella hathewayi,Clostridium; uncultured Clostridium sp.,Butyricimonas; Butyricimonas virosa,Alistipes; Alistipes indistinctus,Unclassified; Firmicutes bacterium CAG:176,Clostridium; Clostridium sp. CAG:7,Unclassified; Firmicutes bacterium CAG:882,Lachnoclostridium; [Clostridium] asparagiforme,Butyricicoccus; uncultured Butyricicoccus sp.,...,Unclassified; Environmental Halophage eHP-30,Unclassified; Flavobacterium phage FKj-2,Unclassified; Geobacillus virus E2,Unclassified; Salmonella phage 64795_sal3,Unclassified; Skermania phage SPI1,Unclassified; Vibrio phage vB_VhaS-tm,Unclassified; uncultured Mediterranean phage uvDeep-CGR0-AD1-C123,Unclassified; uncultured Mediterranean phage uvDeep1-CGR2-KM23-C896,Unclassified; uncultured virus,Autism
A3,4988,5803,3793,64,15,100,2119,12,453,1266,...,0,0,0,0,0,0,0,0,0,yes
A5,5060,5612,2795,1385,20,29,1230,24,691,1682,...,0,0,0,0,0,0,0,0,0,yes
A6,2905,4109,1355,725,723,11,1322,1,2278,43,...,0,0,0,0,0,0,0,0,0,yes
A9,5745,1432,5558,1553,620,1320,2675,44,107,1726,...,0,0,0,0,0,0,0,1,0,yes
A31,4822,2652,5383,40,3261,51,1470,26,342,1804,...,0,0,0,0,0,0,0,0,0,yes


In [21]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()

ImportError: cannot import name 'InstanceDefault' from 'bokeh.core.properties' (/opt/miniconda3/envs/testenvironment/lib/python3.10/site-packages/bokeh/core/properties.py)

In [None]:
# Two dataframes for the autism group and the normally developing children
# histogram of two 

from bokeh.models.widgets import Panel, Tabs

abundance_ASD = abundance_df[abundance_df["Autism"]=='yes']
abundance_ASD = abundance_ASD.drop(columns=["Autism"])
abundance_CON = abundance_df[abundance_df["Autism"]=='no']
abundance_CON = abundance_CON.drop(columns=["Autism"])

def compare_hist(df1,df2,index):
    x = [[x for x in df1[index]],[x for x in df2[index]]]
    return x

list_indexes = abundance_CON.columns.values.tolist()

print(list_indexes)



ImportError: cannot import name 'InstanceDefault' from 'bokeh.core.properties' (/opt/miniconda3/envs/testenvironment/lib/python3.10/site-packages/bokeh/core/properties.py)

In [None]:
#Prepare a dataframe with only means and standarddeviations of the two groups.




abundance_ASD.loc["AS_STDEV"] = abundance_ASD.std()
abundance_ASD.loc["AS_VALUE"] = abundance_ASD.mean()
abundance_ASD = abundance_ASD["AS_VALUE","AS_STDEV"]


abundance_CON.loc["CT_STDEV"] = abundance_CON.std()
abundance_CON.loc["CT_VALUE"] = abundance_CON.mean()
abundance_CON = abundance_CON.loc["CT_VALUE"]

abundance_ASD = abundance_ASD.reset_index()
abundance_CON = abundance_CON.reset_index()

data = pd.merge(abundance_ASD,abundance_CON)
data = data.set_index("ID").iloc[:101,:].transpose()

data.head()

KeyError: ('AS_VALUE', 'AS_STDEV')

In [None]:
import matplotlib.pyplot as plt
from bokeh.models import Box
import panel as pn

ImportError: cannot import name 'InstanceDefault' from 'bokeh.core.properties' (/opt/miniconda3/envs/testenvironment/lib/python3.10/site-packages/bokeh/core/properties.py)

NameError: name 'pn' is not defined