In [None]:
import os
import pandas as pd
import csv

## This script takes output from Kraken2 or Bracken which was previously transformed into
## mpa-like datasheets via the KrakenTools-script kreport2mpa.py.

## INPUT
# You copy all your Kraken2/Bracken-output into a new directory on the cluster
# You add the full path of this directory to the input section (directory_path)
# You look at the names of your Kraken2/Bracken-output files and identify the part that is the same in all of them
# Example: the files are called:
# Example: SRR7658581_report_bracken_species_taxinfo.out
# Example: SRR7658590_report_bracken_species_taxinfo.out ...
# Example: "SRR7658590" is the variable part, defining the sample's name
# Example: "_report_bracken_species_taxinfo.out" is the conserved part which stays the same in all files
# Now in the input section, you add the conserved part in "active_input".

# PROCEDURE:
# The script screens through directory_path for files that have the ending
# that you define in the input section (active_input) in the directory_path

## The files will ONLY contain species-taxonomy
## All genus and higher taxonomy (some reads are this level only!) will be discarded
##  If you need these, please let me know and I will try to modify it
## An id is created which is made of species name and an incrementing number
## The script will also create Namco/MicrobiomeExplorer-style data
## There, the id is replaced by the species-name
## The sample-id is a column header, the sample's abundance is now in each column
## It will also create a merged version of taxonomy and abundance

# The output will be stored in folder directory_path/01_trafo (you can change that)

# The output contains:
# - MICOM-style individual samples:
#        -> <variable samples name>_bra_mpa_trafo.csv

# - MICOM-style SUMMARY of all samples:
#        -> Bra_mpa_trafo_summary_micom.csv

# - NAMCO-style data-files:
#        -> Bra_mpa_trafo_summary_taxonomy.csv     / like a Qiime2 taxonomy.tsv
#        -> Bra_mpa_trafo_summary_featuretable.csv / like a Qiime2 feature table biom file


# - A merged version of feature table and taxonomy file, which may be used for other analyses
#        -> Bra_mpa_trafo_summary_merge_tax_ab.csv (both merged)

###################################################
## Torben Kuehnast, 2024, torben.kuehnast@gmail.com
###################################################
# Versions
# v06: now includes instantly compatible output for Namco.
# v07: updated input to be more general and less specific
# v08: version for github
###################################################



############## INPUT HERE ##############################
############## INPUT HERE ##############################
############## INPUT HERE ##############################

# Global directory: this one is screened for input files!
directory_path = "/home/"
# Conserved part of your sample file names (same in all samples).
# Every file in directory_path whose name ends with this string is processed;
# the part of the file name in front of it becomes the sample name.
active_input = "_report_bracken_species_taxinfo.out"


# Change, if you want a different output path.
# Default is creating a new folder called 01_trafo inside directory_path and storing there.
output_path = os.path.join(directory_path, "01_trafo")
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

# Change if you want different OUTPUT names.
# inter_filename is appended to each sample name (per-sample file),
# micom_summary is the file name of the summary over all samples;
# both files are written into output_path.
inter_filename = "_bra_mpa_trafo.csv"
micom_summary = "Bra_mpa_trafo_summary_micom.csv"
# Full paths of the merged taxonomy/abundance table, the taxonomy table and the feature table.
output_merged = os.path.join(output_path, "Bra_mpa_trafo_summary_merge_tax_ab.csv")
output_taxonomy = os.path.join(output_path, "Bra_mpa_trafo_summary_taxonomy.csv")
output_abundance = os.path.join(output_path, "Bra_mpa_trafo_summary_featuretable.csv")

###################Input END##########################
###################Input END##########################
###################Input END##########################


################### Start program ####################
################### Start program ####################
################### Start program ####################

# Echo the effective configuration, one "label value" line per setting,
# so that it is visible in the run log.
for setting_label, setting_value in (
    ("#active_input:", active_input),
    ("#directory_path:", directory_path),
    ("#output_path:", output_path),
    ("#inter_filename:", inter_filename),
    ("#micom_summary:", micom_summary),
    ("output_merged:", output_merged),
    ("output_taxonomy:", output_taxonomy),
    ("output_abundance:", output_abundance),
):
    print(setting_label, setting_value)
# Closing separator line of the configuration echo.
print(":")

# Counter for unique "id" generation. It advances once per input row (species
# row or not), so an id is "<species>_<running row number over all files>".
id_counter = 0

# Column layout shared by the per-sample files and the summary file.
header = ["id", "kingdom", "phylum", "class", "order", "family", "genus", "species", "sample_id", "abundance"]
rank_columns = header[1:8]
# mpa-style rank prefix -> output column; 'd__' (domain) is stored as kingdom.
rank_by_prefix = {
    "k__": "kingdom",
    "d__": "kingdom",
    "p__": "phylum",
    "c__": "class",
    "o__": "order",
    "f__": "family",
    "g__": "genus",
    "s__": "species",
}

print("Starting to sort -metaphlan-like to micom-style.")

# Summary of all samples. It is defined before the loop so the name exists even
# when no input file matches, and it is started fresh on every run: appending
# to a summary left over from an earlier run would duplicate every row.
summary_file = os.path.join(output_path, micom_summary)
if os.path.exists(summary_file):
    print("Summary file exists already. Overwriting:", summary_file)
else:
    print("Summary file doesnt exist. Creating new one:", summary_file)
with open(summary_file, "w", newline='') as f:
    csv.writer(f, delimiter=';').writerow(header)

processed_files = 0

# sorted(): os.listdir() returns entries in arbitrary order; sorting makes the
# generated ids and the order of samples reproducible between runs.
for filename in sorted(os.listdir(directory_path)):
    source_filepath = os.path.join(directory_path, filename)
    # The whole folder from input will be looked for the ending you specified in active_input
    if not filename.endswith(active_input) or not os.path.isfile(source_filepath):
        continue
    processed_files += 1

    # Strip the conserved suffix only at the end of the name; splitting on it
    # would cut the sample name short if the same text occurred earlier.
    sample_name = filename[:len(filename) - len(active_input)]
    print("sample_name:", sample_name)
    target_filename = os.path.join(output_path, f"{sample_name}{inter_filename}")

    print("-----------------")
    print("Analysing the following file:")
    print("filename:", filename)
    print("summary_file", summary_file)

    try:
        df = pd.read_csv(source_filepath, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # A completely empty input yields a header-only sample file instead of a crash.
        print("File is empty, no species rows to write:", filename)
        df = pd.DataFrame(columns=[0, 1])

    # Create the target CSV file. Both output files are opened once per sample
    # instead of once per written row.
    print("Creating new sample file:", target_filename)
    with open(target_filename, "w", newline='') as sample_f, \
            open(summary_file, "a", newline='') as summary_f:
        sample_writer = csv.writer(sample_f, delimiter=';')
        summary_writer = csv.writer(summary_f, delimiter=';')
        sample_writer.writerow(header)

        # Column 0 is the '|'-separated lineage, column 1 the abundance.
        for lineage, raw_abundance in zip(df[0], df[1]):
            id_counter += 1

            # Later parts overwrite earlier ones of the same rank; parts with
            # an unknown prefix are ignored.
            taxonomy = dict.fromkeys(rank_columns, "")
            for part in str(lineage).split("|"):
                rank = rank_by_prefix.get(part[:3])
                if rank is not None:
                    taxonomy[rank] = part[3:]

            # Only species-level rows are kept; genus and higher are discarded.
            species = taxonomy["species"]
            if not species:
                continue

            out_row = [f"{species}_{id_counter}"]
            out_row.extend(taxonomy[rank] for rank in rank_columns)
            out_row.extend([sample_name, float(raw_abundance)])
            sample_writer.writerow(out_row)
            summary_writer.writerow(out_row)

    print("Wrote", target_filename)
    print("Updated", summary_file)
    print("-------------")

if processed_files == 0:
    print("WARNING: no file ending with", active_input, "found in", directory_path)
print("Finished creation of micom-like files.")
print("--------------------------")
print("NEXT STEP: Modifying micom-style to fit Namco/MicrobiomExplorer-style:")
print("--------------------------")

############################################################################################
### Modifying micom-style to fit Namco/MicrobiomExplorer-style:

# The path is built from the configuration instead of relying on a variable
# that is only assigned while the conversion loop processes a file.
input_file = os.path.join(output_path, micom_summary)
if not os.path.exists(input_file):
    raise FileNotFoundError(
        f"Summary file {input_file} not found - were there any files ending "
        f"with '{active_input}' in {directory_path}?"
    )
print("Working on:", input_file)

# 1. Load the CSV file with the correct delimiter.
#    All text columns are read as plain strings: sample ids such as "001" and
#    names such as "NA" stay untouched, and empty taxonomy ranks stay empty
#    instead of becoming NaN (which was later turned into the taxon "0").
columns = ['id', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
df = pd.read_csv(
    input_file,
    delimiter=';',
    dtype={col: str for col in columns + ['sample_id']},
    keep_default_na=False,
)

# 2. Collect one entry per species in a single pass.
#    - taxonomy: taken from the first row in which the species occurs
#    - abundance: one value per sample; a later row for the same
#      species/sample pair replaces the earlier value
#    - samples: remembered in order of first appearance (dict keeps insertion order)
taxonomy_by_species = {}
abundance_by_species = {}
sample_columns = {}
for record in df.to_dict('records'):
    species = record['species']
    sample_id = record['sample_id']
    if species not in taxonomy_by_species:
        # In the Namco-style tables the id is the species name itself.
        taxonomy_by_species[species] = {
            col: (species if col == 'id' else record[col]) for col in columns
        }
        abundance_by_species[species] = {}
    # Convert to integer (truncates the fractional part).
    abundance_by_species[species][sample_id] = int(record['abundance'])
    sample_columns.setdefault(sample_id, None)
sample_columns = list(sample_columns)
print("Finished all 1D-sample/abundance to 2D-sample/abundance")

# 3. Build the wide table: taxonomy columns followed by one integer column per
#    sample; a species that is missing in a sample gets abundance 0.
restructured_df = pd.DataFrame(
    [
        {**taxonomy_by_species[species],
         **{sample_id: sample_values.get(sample_id, 0) for sample_id in sample_columns}}
        for species, sample_values in abundance_by_species.items()
    ],
    columns=columns + sample_columns,
)
print("Changed empty fields to value zero.")

# 4. Save the restructured DataFrame as a new CSV file.
restructured_df.to_csv(output_merged, index=False, sep=';')
print("Creating", output_merged)

# 5. Create the output_taxonomy file (tab-separated).
taxonomy_df = restructured_df[columns]

# Rank names are capitalized for Namco.
taxonomy_df = taxonomy_df.rename(columns={
    'kingdom': 'Kingdom',
    'phylum': 'Phylum',
    'class': 'Class',
    'order': 'Order',
    'family': 'Family',
    'genus': 'Genus',
    'species': 'Species'
})

taxonomy_df.to_csv(output_taxonomy, index=False, sep='\t')
print("Created", output_taxonomy)

# 6. Create the output_abundance file (tab-separated): id plus one column per sample.
abundance_df = restructured_df.drop(columns=['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
abundance_df.to_csv(output_abundance, index=False, sep='\t')
print("Created", output_abundance)
print("-------------------------")
print("----->>>  Finished all. TK.")
print("-------------------------")
