In [6]:
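# Semi-automates splitting the combined data by sample type and running the QIIME 2 beta-diversity commands on each subset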

In [7]:
# Import pandas, which is used for data parsing.
# Make sure the QIIME 2 environment is activated and pandas is installed before running.
import pandas as  pd

In [8]:
# ONLY RUN ONCE
# `i` selects which entry of sampleTypes the cells below process
# (sampleTypes[i]). A later cell increments it (i = i + 1), so re-running this
# cell resets the workflow to the first sample type.
i = 0

In [9]:
# ONLY CELL REQUIRING INPUT
# Tip: 'Run All Below' (under the 'Cell' menu) can be started from this cell

# --- User settings ---------------------------------------------------------
# Path of the combined data file (tab-separated)
dataFile = "serum_and_cecum_data.txt"
# Path of the metadata file (tab-separated)
metadata = "serum_and_cecum_metadata.txt"
# Name of the metadata column that holds the sample types
columnType = 'ATTRIBUTE_SampleType'
# Name of the metadata column used for the visualization file
attributeType = 'ATTRIBUTE_diet'

# --- Load both tables ------------------------------------------------------
combinedData = pd.read_csv(dataFile, sep="\t")
metadataTable = pd.read_csv(metadata, sep="\t")

In [10]:
# Preview the first five rows of the combined data table
combinedData.head()

Unnamed: 0,sample-id,X685.5337_3.76,X385.142_2.61,X476.2701_0.45,X699.3657_2.29,X635.3052_2.6,X299.65_0.64,X331.2219_2.09,X390.7519_2.15,X655.5228_3.99,...,X591.3157_2.44,X447.3086_3,X785.5881_2.83,X511.3592_3.32,X817.5779_2.67,X391.2826_2.78,X355.2615_2.67,X373.272_2.61,X357.2772_2.83,X373.2721_2.67
0,P115_D1_Conventional_cellulose_33_NT.mzXML,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003766,0.0,0.06447,0.0,0.022398,0.039077,0.046975,0.168414,0.239779,0.152882
1,P115_D10_Conventional_whole_grains_34_2LET.mzXML,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014651,0.0,0.114038,0.0,0.015444,0.0,0.064935,0.0,0.335005,0.0
2,P115_D11_Conventional_vegetable_35_NT.mzXML,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001102,0.0,0.038302,0.0,0.016525,0.044081,0.0,0.433875,0.147597,0.0
3,P115_D12_Conventional_vegetable_35_LET.mzXML,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000261,0.000769,0.007514,0.0,0.021021,0.0,0.088131,0.0,0.121502,0.227194
4,P115_D2_Conventional_cellulose_33_LET.mzXML,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.008581,0.0,0.02032,0.0,0.027133,0.0,0.065095,0.457712,0.059399,0.0


In [11]:
# Preview the first five rows of the metadata table
# NOTE(review): the preview shows many all-NaN 'Unnamed: N' columns —
# presumably trailing tab characters in the metadata file; confirm and clean
# the file if so.
metadataTable.head()

Unnamed: 0,sample-id,ATTRIBUTE_SampleType,ATTRIBUTE_donor,ATTRIBUTE_diet,ATTRIBUTE_Mouse_Number,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 2673,Unnamed: 2674,Unnamed: 2675,Unnamed: 2676,Unnamed: 2677,Unnamed: 2678,Unnamed: 2679,Unnamed: 2680,Unnamed: 2681,Unnamed: 2682
0,P115_D1_Conventional_cellulose_33_NT.mzXML,cecum contents,Conventional,cellulose,33_NT,,,,,,...,,,,,,,,,,
1,P115_D10_Conventional_whole_grains_34_2LET.mzXML,cecum contents,Conventional,whole_grains,34_2LET,,,,,,...,,,,,,,,,,
2,P115_D11_Conventional_vegetable_35_NT.mzXML,cecum contents,Conventional,vegetable,35_NT,,,,,,...,,,,,,,,,,
3,P115_D12_Conventional_vegetable_35_LET.mzXML,cecum contents,Conventional,vegetable,35_LET,,,,,,...,,,,,,,,,,
4,P115_D2_Conventional_cellulose_33_LET.mzXML,cecum contents,Conventional,cellulose,33_LET,,,,,,...,,,,,,,,,,


In [12]:
# Collects the distinct sample types found in the columnType column, in order
# of first appearance.
# Missing values are dropped first: a blank cell would otherwise show up as NaN
# (a float) in sampleTypes and break the string operations applied to every
# entry in the cells below.
sampleTypes = metadataTable[columnType].dropna().unique()
print(sampleTypes)

['cecum contents' 'serum']


In [13]:
# Flag the metadata rows whose columnType value equals the sample type
# currently selected by i (one of the values listed in the previous cell),
# then keep the index labels of those rows as a plain list
isCurrentType = metadataTable[columnType] == sampleTypes[i]
sampleIndexes = metadataTable.index[isCurrentType].tolist()

In [14]:
# Looks up the sample IDs in metadataTable for the rows found above.
# sampleIndexes holds index *labels* (it was built from .index.tolist()), so
# label-based .loc is used; positional .iloc only gives the same rows while the
# table keeps its default 0..n-1 index.
sampleIDs = metadataTable['sample-id'].loc[sampleIndexes]
print(sampleIDs)

0            P115_D1_Conventional_cellulose_33_NT.mzXML
1      P115_D10_Conventional_whole_grains_34_2LET.mzXML
2           P115_D11_Conventional_vegetable_35_NT.mzXML
3          P115_D12_Conventional_vegetable_35_LET.mzXML
4           P115_D2_Conventional_cellulose_33_LET.mzXML
                             ...                       
145               P115_B3_WLS_291_cellulose_28_NT.mzXML
146              P115_B4_WLS_291_cellulose_28_LET.mzXML
147              P115_B5_WLS_291_cellulose_28_RET.mzXML
148              P115_B6_WLS_291_cellulose_28_BET.mzXML
149             P115_B7_WLS_291_cellulose_28_2LET.mzXML
Name: sample-id, Length: 150, dtype: object


In [15]:
# Keep the 'sample-id' entries for the selected index labels (single .loc
# lookup by row labels and column name)
values = metadataTable.loc[sampleIndexes, 'sample-id']

In [16]:
# Finds the index labels of the combinedData rows whose sample-id is exactly
# one of the selected metadata sample IDs.
# Exact equality replaces Series.str.match: str.match interprets the ID as a
# regular expression anchored only at the start, so '.' matches any character
# and an ID that is a prefix of another ID would select both rows.
# The result keeps the order of `values` (metadata order), as before.
dataSampleIds = combinedData['sample-id']
test = []
for sampleId in values:
    test.extend(combinedData.index[dataSampleIds == sampleId].tolist())

print(test)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]


In [17]:
# Replace the spaces in every sample type name with underscores (in place) so
# the file names built from them are cleaner
for j, typeName in enumerate(sampleTypes):
    sampleTypes[j] = typeName.replace(" ", "_")
print(sampleTypes)

['cecum_contents' 'serum']


In [18]:
# Turn each sample type into its data file name by appending "_data.csv"
# (spaces are replaced with underscores first, in case that was not done yet).
# Entries that already carry the suffix are left alone, so re-running this cell
# does not produce names like "..._data.csv_data.csv".
csvSuffix = "_data.csv"
for j, typeName in enumerate(sampleTypes):
    if not typeName.endswith(csvSuffix):
        sampleTypes[j] = typeName.replace(" ", "_") + csvSuffix

In [19]:
# Stores the combinedData rows whose sample-id matched into a variable.
# `test` holds index *labels* (collected via .index.tolist()), so label-based
# .loc is used; positional .iloc only selects the same rows while combinedData
# keeps its default 0..n-1 index.
selectedRows = combinedData.loc[test]

In [21]:
# File names derived from the sample type currently selected by i
fileName = sampleTypes[i]                      # data file name for this sample type
fileNameNoTxt = fileName.replace(".csv", "")   # same name with ".csv" removed
fileNameQza = f"{fileNameNoTxt}.qza"
visualizationFile = f"{fileNameNoTxt}_{attributeType}-significance.qzv"

In [22]:
# Write the selected rows to fileName as tab-delimited text, without the
# DataFrame index column
selectedRows.to_csv(path_or_buf=fileName, index=False, sep="\t")

In [23]:
# Creates the variable holding the emperor output name.
# Only a trailing "_data" is stripped: str.replace would also remove "_data"
# from the middle of a sample type name and produce a mismatched sample name.
stemSuffix = "_data"
if fileNameNoTxt.endswith(stemSuffix):
    sampleName = fileNameNoTxt[:-len(stemSuffix)]
else:
    sampleName = fileNameNoTxt
sampleEmperor = sampleName + "_emperor"
# Side effect: advance i so the next pass over the notebook picks the next
# entry of sampleTypes. Re-running this cell on its own skips a sample type.
i += 1

In [25]:
# Build the Bray-Curtis output names for the current sample
braycurtisOutput = f"braycurtis_qiime2_{sampleName}"        # distance output directory
braycurtisQza = f"{braycurtisOutput}/distance_matrix.qza"   # distance matrix inside it
braycurtisPCoA = f"{braycurtisOutput}_PCoA"                 # PCoA output directory
braycurtisPCoAQza = f"{braycurtisPCoA}/pcoa.qza"            # PCoA result inside it

In [26]:
%%bash -s "$fileName" "$fileNameNoTxt"
# Convert the tab-separated sample table ($1) into a QIIME 2 feature table
# artifact written to $2.
# Positional parameters are quoted so a path containing spaces or glob
# characters is passed as a single argument; the dangling line-continuation
# backslash after the last option was removed.
qiime sample-classifier metatable \
  --m-metadata-file "$1" \
  --o-converted-table "$2"


Saved FeatureTable[Frequency] to: cecum_contents_data.qza


In [27]:
%%bash -s "$fileNameQza" "$braycurtisOutput"
# Compute the Bray-Curtis distance matrix for the feature table ($1); results
# are written to the output directory $2.
# Positional parameters are quoted so paths containing spaces or glob
# characters are passed as single arguments.
qiime diversity beta \
  --i-table "$1" \
  --p-metric "braycurtis" \
  --output-dir "$2"


Saved DistanceMatrix to: braycurtis_qiime2_cecum_contents/distance_matrix.qza


In [28]:
%%bash -s "$braycurtisQza" "$braycurtisPCoA"
# Run PCoA on the distance matrix ($1); results are written to the output
# directory $2.
# Positional parameters are quoted so paths containing spaces or glob
# characters are passed as single arguments.
qiime diversity pcoa \
  --i-distance-matrix "$1" \
  --output-dir "$2"


Saved PCoAResults to: braycurtis_qiime2_cecum_contents_PCoA/pcoa.qza


In [29]:
%%bash -s "$braycurtisPCoAQza" "$metadata" "$sampleEmperor"
# Build the Emperor plot from the PCoA results ($1) and the metadata file
# ($2); the visualization is written to the output directory $3.
# Positional parameters are quoted so paths containing spaces or glob
# characters are passed as single arguments.
qiime emperor plot \
  --i-pcoa "$1" \
  --m-metadata-file "$2" \
  --output-dir "$3"


Saved Visualization to: cecum_contents_emperor/visualization.qzv


In [30]:
%%bash -s "$braycurtisQza" "$metadata" "$attributeType" "$visualizationFile"
# Run the pairwise beta-group-significance test on the distance matrix ($1),
# grouping samples by metadata column $3 of the metadata file ($2); the
# visualization is written to $4.
# Positional parameters are quoted so paths or column names containing spaces
# or glob characters are passed as single arguments.
qiime diversity beta-group-significance \
  --i-distance-matrix "$1" \
  --m-metadata-file "$2" \
  --m-metadata-column "$3" \
  --o-visualization "$4" \
  --p-pairwise


Saved Visualization to: cecum_contents_data_ATTRIBUTE_diet-significance.qzv
