In [11]:
import pandas as pd
import os
from datetime import datetime

In [2]:
# Root folder of the microbiome delivery. Its sub-directories (other than
# 'Fastq', 'summary' and 'new') are treated as per-sample result folders below.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
microbiome_path = "/home/kunal/tscc_projects/tanner/data/family3/microbiome/14009b/"

In [3]:
# Print the README shipped in the delivery folder (IPython shell escape;
# $microbiome_path is interpolated from the Python variable of the same name).
!cat $microbiome_path/README.txt

List of files and directories in this folder

README.txt: 	this file

Samples.txt: 	a text file with a list of sample names
        For projects that we received samples by batches, and we processed and annotated these samples
        by batches, there will be additional files <Sample-batch-number.txt>.

Sample directories: 	each sample directory has a list of files:
        taxonomy-data: 		taxonomy binning results for all genomes in HLI's reference genome database	
        superkingdom-reads-count.txt:	number of reads mapped to different superkingdoms

        assembles.fna:		assemble sequences in FASTA format
        assembles-cov:		average coverage of assembles
        species-bins-assembles:	assembles binned at species level
        ORF.faa:		ORF sequences predicted from assembles in FASTA format
        ORF-cov:		average coverage of ORFs
        species-bins-ORFs:	ORFs binned at species level
        ORF-single.faa:		ORF sequences predicted from un-assembled rea

In [4]:
# Load the sample manifest spreadsheet into a DataFrame. The 'VendorID' and
# 'Sample ID 2' columns are used further down to label the samples.
manifest = pd.read_excel(
    "/home/kunal/tscc_projects/tanner/doc/Nof1samples_manifest.xlsx"
)

In [5]:
# Preview the first five manifest rows.
manifest.head(n=5)

Unnamed: 0,FamilyID,Individual ID,Date,Type,Tissue,VendorID,Individual ID 2,Sample ID 1,Sample ID 2,Unnamed: 9,Individual Key,Unnamed: 11,Family Key,Unnamed: 13
0,1,CaLo,04-16-14,Metabolites,Plasma,JCVI-00001,2,001_CaLo_04-16-14_Metabolites_JCVI-00001,001_002_Metabolites_Plasma_JCVI-00001_04-16-14,,TaLo,1,Longstreet,1.0
1,1,CaLo,04-16-14,Metabolites,Plasma,JCVI-00002,2,001_CaLo_04-16-14_Metabolites_JCVI-00002,001_002_Metabolites_Plasma_JCVI-00002_04-16-14,,CaLo,2,,
2,1,CaLo,07-11-14,Metabolites,Plasma,JCVI-00003,2,001_CaLo_07-11-14_Metabolites_JCVI-00003,001_002_Metabolites_Plasma_JCVI-00003_07-11-14,,MaLo,3,,
3,1,CaLo,09-25-14,Metabolites,Plasma,JCVI-00008,2,001_CaLo_09-25-14_Metabolites_JCVI-00008,001_002_Metabolites_Plasma_JCVI-00008_09-25-14,,GrLo,4,,
4,1,MaLo,09-25-14,Metabolites,Plasma,JCVI-00009,3,001_MaLo_09-25-14_Metabolites_JCVI-00009,001_003_Metabolites_Plasma_JCVI-00009_09-25-14,,ElGo,5,,


In [9]:
# Collect the sample directory names: every sub-directory of the data root
# except the known non-sample folders.
samples = []
for entry in os.listdir(microbiome_path):
    if entry in ['Fastq', 'summary', 'new']:
        continue
    if os.path.isdir(os.path.join(microbiome_path, entry)):
        samples.append(entry)

In [12]:
def sort_by_date(item):
    """Sort key for a (vendor_id, sample_id) pair.

    Parses the text after the last underscore of the sample id as a
    MM-DD-YY date.
    """
    date_text = item[1].split('_')[-1]
    return datetime.strptime(date_text, "%m-%d-%y")


# Map each sample directory name (the manifest's 'VendorID') to the first
# matching 'Sample ID 2' value in the manifest.
sample_map = {
    vendor_id: manifest.loc[manifest['VendorID'] == vendor_id,
                            'Sample ID 2'].values[0]
    for vendor_id in samples
}

# (vendor_id, sample_id) pairs ordered chronologically by the date suffix.
sorted_sample_map = sorted(sample_map.items(), key=sort_by_date)

In [13]:
# Build the sorted list of per-sample result tables to merge.
#
# The file names are taken from a single sample directory (samples[1]).
# NOTE(review): this assumes every sample directory contains the same set of
# files -- TODO confirm.
#
# Kept: '*.txt' files. Dropped: everything starting with 'Virus' and
# 'protein-ann.txt'. (An earlier pass that spared 'Virus-toprank' files was
# dead code, because all 'Virus*' files were removed right after it; it could
# also raise IndexError on a 'Virus*' name without a dash.)
files = sorted(
    fn for fn in os.listdir(os.path.join(microbiome_path, samples[1]))
    if fn.endswith('.txt')
    and not fn.startswith('Virus')
    and fn != 'protein-ann.txt'
)

In [15]:
def create_df(fn, sample_map=None, base_path=None):
    """Merge one result table across all samples into a single DataFrame.

    For every ``(sample_dir, sample_id)`` pair the file
    ``<base_path>/<sample_dir>/<fn>`` is read as a tab-separated table,
    reduced to a single value column renamed to ``sample_id``, and the
    per-sample frames are concatenated column-wise (aligned on their index).

    The value column depends on the file-name prefix (text before the
    first '-'):

    * 'Eukaryota', 'Bacteria', 'Viruses' -> 'abundance'
    * 'superkingdom'                     -> 'reads_count'
    * anything else                      -> 'Abundance' ('Coverage' and
      'No_orfs' are discarded)

    All remaining columns become the index, in the order they appear in the
    file, so that index levels line up identically for every sample.

    Parameters
    ----------
    fn : str
        File name of the table inside each sample directory.
    sample_map : iterable of (str, str), optional
        ``(sample_dir, sample_id)`` pairs, in output column order.
        Defaults to the notebook-level ``sorted_sample_map``.
    base_path : str, optional
        Directory containing the sample directories. Defaults to the
        notebook-level ``microbiome_path``.

    Returns
    -------
    pandas.DataFrame
        One column per sample; rows missing from a sample are NaN.
    """
    if sample_map is None:
        sample_map = sorted_sample_map
    if base_path is None:
        base_path = microbiome_path

    def load_df(file_path, sample_id):
        """Read one sample's table and return its value column named ``sample_id``."""
        df = pd.read_table(file_path)
        file_type = os.path.basename(file_path).split('-')[0]
        if file_type in ('Eukaryota', 'Bacteria', 'Viruses'):
            non_index = ['abundance']
        elif file_type == 'superkingdom':
            non_index = ['reads_count']
        else:
            non_index = ['Coverage', 'No_orfs', 'Abundance']
        # Keep the file's own column order: building the list from a set made
        # the index level order arbitrary.
        index_cols = [col for col in df.columns if col not in non_index]
        df = df.set_index(index_cols, drop=True)
        if len(non_index) > 1:
            # Functional-annotation tables carry extra numeric columns;
            # only 'Abundance' is reported.
            df = df[['Abundance']]
        df.columns = [sample_id]
        return df

    dfs = [load_df(os.path.join(base_path, sample, fn), sample_id)
           for sample, sample_id in sample_map]
    return pd.concat(dfs, axis=1)

In [16]:
# Export every merged table twice: as a tab-separated file in the 'new'
# sub-directory of the data root, and as one sheet of a combined workbook.
out_file = "/home/kunal/Schork-11-samples-taxon-and-function.xlsx"

# Derived from microbiome_path rather than repeating the literal; 'new' is the
# folder excluded from the sample listing above.
tsv_dir = os.path.join(microbiome_path, "new")

# The context manager saves and closes the workbook even if one file fails
# part-way through the loop.
with pd.ExcelWriter(out_file) as writer:
    for fn in files:
        print(fn)
        # Rows absent from a sample are reported as 0 rather than NaN.
        df = create_df(fn).fillna(0)
        df.to_csv(os.path.join(tsv_dir, fn), sep="\t")
        # The file name doubles as the sheet name. Excel caps sheet names at
        # 31 characters; the longest names listed here are exactly 31.
        df.to_excel(writer, sheet_name=fn)

Bacteria-class-abundance.txt
Bacteria-family-abundance.txt
Bacteria-genus-abundance.txt
Bacteria-order-abundance.txt
Bacteria-phylum-abundance.txt
Bacteria-species-abundance.txt
Bacteria-toprank-abundance.txt
Eukaryota-class-abundance.txt
Eukaryota-family-abundance.txt
Eukaryota-genus-abundance.txt
Eukaryota-order-abundance.txt
Eukaryota-phylum-abundance.txt
Eukaryota-species-abundance.txt
Eukaryota-toprank-abundance.txt
cog-ann.txt
cog-class-ann.txt
kog-ann.txt
kog-class-ann.txt
pfam-ann.txt
superkingdom-reads-count.txt
tigrfam-ann.txt
tigrfam-mainrole-ann.txt
tigrfam-subrole-ann.txt
