# Script Merge tas tfb micom
Version GitHub 01

Converts a heavily modified taxonomy file and a table.from_biom file  
into a long-format CSV file (one row per taxon and sample, with the columns id/taxonomy/sample_id/abundance) that is compatible with the Python-based MICOM package  

########################################################  
By Torben Kuehnast, torben.kuehnast@gmail.com, 2024


In [None]:
import sys
import os
import pandas as pd


In [None]:

###########INPUT FIELD#############
# Before you start:

### INPUT:
# Insert file name for XXX_table.from_biom.csv - aside of XXX, the file name has to be PRECISELY like that!
filepath_tfb = '/home/project_table-from-biom.csv'
# --> it has to contain:
# ASV ------- sample1  --- sample2  --- ...
# <ident> --- <number> --- <number> --- ...
# <ident> --- <number> --- <number> --- ...
# So first column with ASV identifiers
# Second column containing the first sample_id in the header and the abundance-number
# corresponding to the ASV identifier in the same row

### INPUT:
# Insert file name for XXX_taxa_abund_sample.csv - aside of XXX, the file name has to be PRECISELY like that!
filepath_tas = '/home/project_taxa_abund_sample.csv'
# --> it has to contain:
# id, kingdom, phylum, class, order, family, genus, species
# Make sure all fields filled up!
# Make sure id (a random unique identifier of the respective row) was prepared beforehand.
# ALERT: the order and count of the rows must be the SAME for BOTH files.
# ALERT: If not, you will mix up the wrong abundances to the wrong taxa, or create fatal errors.
###################################



print('----------------------')
print('## DATA OVERVIEW ##')
print('----------------------')
print('filepath_tfb is:', filepath_tfb)
print('filepath_tas is:', filepath_tas)
print('-----------')
# extracting project name from XXX_...
project_tas = os.path.basename(filepath_tas).split('_taxa_abund_sample.csv', 1)[0]
print('project_tas is:', project_tas)
print('-----------')
data_tfb = pd.read_csv(filepath_tfb, sep=";")
print('data_tfb is:', data_tfb.head(4))
print('-----------')
data_tas = pd.read_csv(filepath_tas, sep=";")
print('data_tas is:', data_tas.head(4))
print('-----------')
# showing sample lists
print('samples in the table from biome (please verify):', data_tfb.columns)
print('-----------')

# in table from biom we have 113 columns. 
column_count_tfb = data_tfb.shape[1]
print('columns counted in table from biome:', column_count_tfb)

sample_count_tfb = column_count_tfb
print('column in tfb minus 1 = samples counted in table from biome:', sample_count_tfb)
print('-----------')

print("Total count of samples found in taxa.from.biom:", sample_count_tfb)


In [None]:

# Sample that the loop is currently working on. Giving it 0 for start.
current_working_sample = 0

# We create the foundation for the final micom file, containing all 10 columns, of all samples and taxa
# Initially, just the column headers
final_micom_tas = pd.DataFrame(columns=['id', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'sample_id', 'abundance'])
final_micom_tas
# With each sample in the for loob, we will be adding a bunch of rows, until all samples are integrated.
# Then it is ready for micom.


# Modify column headers, to make sure writing is correct for micom
data_tas_col = data_tas
data_tas_col.columns = ['id', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
data_tas_col
# data_tas_col serves as taxa pattern, not changed during program


# Going through all the samples listed in the table.from_biom.csv
# Use range() to transform sample_count_tfv into usable int numbers
for current_working_sample in range(1, sample_count_tfb):
    # which sample are we in      (ASV - Sample1 - Sample2 - Sample3 ....)?
    # Number of sample_count_tfb:   0       1         2         3  
    # Range starts at _1_! This way we leave out ASV column.
    print('Sample number:', current_working_sample)
    
    # What is column header-name at current position?
    # find the column name of the current_work_sample in the data_tfb and show it
    sample_id = [data_tfb.columns[current_working_sample]]
    print('Sample_id: ', sample_id)
    
    #Look through data_tfb, containing:
    #print("Origin of data_tfb: ", filepath_tfb)
    # for a COLUMN whose header is called:
    #print("Column with THIS header we look for: ", sample_id)
    
    sample_data = data_tfb[[data_tfb.columns[current_working_sample]]]
    #print("Show me current column, stored in sample_data:", sample_data)
           
    # From our current column, or respectively, the whole data_tfb: how many rows does it even have?
    # Count rows of our table from biom file and store the count in tfb_column_count
    tfb_column_count = data_tfb.shape[0]
    #print('ROWS in the current column/file:', tfb_column_count)
    
    
    # Storing the column from "sample_data" in abundance_column as DataFrame with abundance header
    #OLD(doesnt work): abundance_column = pd.DataFrame(sample_data, columns=["abundance"])
    #NEW:
    abundance_column = sample_data
    #set column header to abundance
    abundance_column.columns = ['abundance']
    #print("abundance_column: ", abundance_column)
    
    
    #debugging
    if tfb_column_count == 0:
        print("Error: You have no data in your abundance table of sample: ", sample_id)
    #debugging
    if tfb_column_count == 1:
        print("Alert: You only have 1 column in your dataset, usually ASV identifiers:", sample_id)
    #debugging
    if tfb_column_count == 2:
        print("Alert: You have just one sample. Is it intended? -->", sample_id)
        
    # The initial sample_id column, which is incremented x times (x = tfb_column_count)
    incre_sample_id = pd.DataFrame(sample_id, columns=["sample_id"])
    #print("incre_sample_id", incre_sample_id)

    # The 1x_sample_id column, which is added to incre_sample_id each loop
    one_added_sample_id = pd.DataFrame(sample_id, columns=["sample_id"])
    #print("one_added_sample_id:", one_added_sample_id)

    # So, initially, incre_sample_id has only 1 sample-id row
    # This is now incremented with each loop for that many times as we have rows in the abundance files

    x = 0
    print("Initial tfb_column_count:", tfb_column_count)
    for x in range(1, tfb_column_count):
        #print("x:", x)

        incre_sample_id = pd.concat([incre_sample_id, one_added_sample_id], ignore_index=True)
        #print("New incre_sample_id: ", incre_sample_id)
        #print("---------------- loop ended ---------------")
    
    # Combine the freshly created incre_sample_id (sample_id column)
    # with the abundance_column (abundance values column)
    samp_id_abund_column = pd.concat([incre_sample_id, abundance_column], axis=1)
    #print("samp_id_abund_column: ", samp_id_abund_column)
    
    #Now combine samp_id_abund_column (2 columns: abundance and sample_id)
    #..with data_tas_col (8 columns: id and taxas)
    samp_id_abund_column_merge = pd.concat([data_tas_col, samp_id_abund_column], axis=1)
    print("Taxa/abundance/id/sample_id of", sample_id, ":", samp_id_abund_column_merge)
    #..creating samp_id_abund_column_merge (10 columns: id, taxa, sample_id, abundance)
    #.. of ONE sample (the one currently in the foor loop)

    # Now, this has to be appended to the final_micom_tas:  
    final_micom_tas = pd.concat([final_micom_tas, samp_id_abund_column_merge], ignore_index=True)
    print("Growing final_micom_tas dataframe:", final_micom_tas)
    
    

        
    print('--------------------------------')
    
    
print('-- Sample loop ended --------------------------------')
print('Transformed final_micom_tas should be created:', final_micom_tas)

df = pd.DataFrame(final_micom_tas)
df.to_csv(project_tas+'_final_micom_tas.csv', index=False, header=True)

print("---- Created: ----")
print("")
print(os.path.abspath("")+"/"+project_tas+'_final_micom_tas.csv')
print("------------------")


# Script Merge tas tfb micom
Finished!