In [1]:
import os
import pandas as pd
import numpy as np
import json

In [2]:
class FormatCompiler():
    
    def __init__(self, dataset_name = None, part = None, 
                 input_dir = '/home/biodb/data/dataset_collection/datasets/3_standard_dataset',
                 output_dir = '/home/biodb/data/dataset_collection/datasets/genentech_format_copy'):
        
        self.data_name = dataset_name
        self.part = part
        self.dataset_name = dataset_name + '/' + part
        self.new_name = dataset_name + '_' + part
        self.input_dir = input_dir
        self.output_dir = output_dir
        
        self.unstructured_data_dir =  self.input_dir + '/' + self.dataset_name + '/processed_data' + '/unstructuredData.json'
        self.cell_anno_dir = self.input_dir + '/' + self.dataset_name + '/processed_data' + '/cellAnnotation.tsv'
        self.gene_anno_dir = self.input_dir + '/' + self.dataset_name + '/processed_data' + '/geneAnnotation.tsv'
        self.exp_raw_dir = self.input_dir + '/' + self.dataset_name + '/processed_data' + '/expressionMatrix_rawCounts.tsv'
        self.exp_tpm_dir = self.input_dir + '/' + self.dataset_name + '/processed_data' + '/expressionMatrix_TPM.tsv'
        self.exp_normalized_dir = self.input_dir + '/' + self.dataset_name + '/processed_data' + '/expressionMatrix_normalized.tsv'
        
        self.metadata_dir = self.output_dir + '/' + self.dataset_name + '/metadata.tsv'
        self.phenotype_dir = self.output_dir + '/' + self.dataset_name + '/phenotype.tsv'
        self.geneID_dir = self.output_dir + '/' + self.dataset_name + '/geneID.tsv'
        self.cellID_dir = self.output_dir + '/' + self.dataset_name + '/cellID.tsv'
        self.marker_genes_dir = self.output_dir + '/' + self.dataset_name + '/markerGenes.tsv'
        self.ontology_dir =  self.output_dir + '/' + self.dataset_name + '/ontologyMapping.tsv'
    
    def making_dir(self):
        path = self.output_dir + '/' + self.data_name
        if not self.data_name in os.listdir(self.output_dir):
            os.makedirs(path)
        elif not self.part in os.listdir(path):
            os.makedirs(self.output_dir + '/' + self.dataset_name)
        else:
            pass
    
    def transform_format(self):
        
        self.transform_phenotype()
        self.transform_marker_genes()
        self.transform_ontology()
        self.transform_geneID()
        self.transform_metadata()
        
        return_message = 'successfully'
        return return_message
    
    def transform_metadata(self):
        
        self.making_dir()
        with open(self.unstructured_data_dir, 'r') as json_file:
            meta = json.load(json_file)['metadata']
    
        with open(self.exp_normalized_dir, 'r') as file:
            x = file.readline()
            x = file.readline()
            normalized_method = x.split('\t')[0]
        
        metadata = dict()
        list_names = ['datasetID','title','accessionNumber','abstract','source','sourceID','numberOfCells','libraryPreparationMethod',
                      'sequencingPlatform','pubmedID','clusteringMethod','biomarkerDerivationMethod','fastqURL','genomeBuild','annotation',
                      'subDataset','description']
        for i in list_names:
            metadata[i] = meta[i]
        metadata['datasetID'] = self.new_name
        metadata['normalizationMethod'] = normalized_method
        metadata = pd.DataFrame(metadata, index = [0])
        metadata.replace('notAvailable','',inplace = True)
        metadata.replace('NA','',inplace = True)
        metadata = metadata.fillna('')
        metadata.to_csv(self.metadata_dir ,sep = '\t',index = False)
            
        return_message = 'successfully generated Metadata.tsv'
        return return_message
    
    def transform_phenotype(self):
        
        self.making_dir()
        cell_anno = pd.read_csv(self.cell_anno_dir ,sep='\t')
        df_cell = cell_anno.drop(['cellOntologyName','cellOntologyID','FACSMarker'], axis=1)
        df_cell.replace('notAvailable','',inplace = True)
        df_cell['filtered'] = True
        df_cell = df_cell.fillna('')
        cell_raw = pd.read_csv(self.exp_raw_dir, sep='\t')['cellID'].tolist()
        if cell_raw != []:
            if len(cell_raw) != cell_anno.shape[0]:
                cell = pd.DataFrame()
                cell['cellID'] = cell_raw
                df_cell = cell.merge(df_cell, how = 'left', sort = False)
                df_cell['filtered'] = df_cell['filtered'].fillna(False).tolist()
            else:
                df_cell['filtered'] = False
        df_cell = df_cell.fillna('')
        df_cell.to_csv(self.phenotype_dir ,sep = '\t',index = False)
        return_message = 'successfully generated phenotype.tsv'
        return return_message
    
    def transform_marker_genes(self):
        
        self.making_dir()
        with open(self.unstructured_data_dir, 'r') as json_file:
            markers= json.load(json_file)['markerGenes']
        if markers == {}:
            pass 
        else:
            cluster = [x for x in markers]
            marker_genes = pd.DataFrame(markers[cluster[0]])
            marker_genes['clusterName'] = cluster[0]
            for x in range(1,len(cluster)):
                z = pd.DataFrame(markers[cluster[x]])
                z['clusterName'] = cluster[x]
                marker_genes = marker_genes.append(z, ignore_index = True)
            
            marker_genes.replace('notAvailable','',inplace = True)
            marker_genes.to_csv(self.marker_genes_dir, sep = '\t', index = False)   

            return_message = 'successfully generated Marker_genes.tsv'
            return return_message
    
    def generate_ensemblID(self, gene_symbol):
        
        self.making_dir()
        ref_dict = {}
        with open(self.unstructured_data_dir, 'r') as json_file:
            metadata = json.load(json_file)['metadata']
        taxonomyID = metadata['taxonomyID']
        gene_ref_dir = '/home/biodb/data/abio_database_pipeline/gene_references'
        if not taxonomyID == 0:
            species = taxonomyID
        if species == 9606:
            df = pd.read_csv(gene_ref_dir + '/human_unique_id_length.tsv', sep='\t')
            for i, symbol in enumerate(df['Gene_name'].tolist()):
                ref_dict[symbol] = df['Gene_ID'].tolist()[i]
        
        elif species == 10090:
            df = pd.read_csv(gene_ref_dir + '/mouse_unique_id_length.tsv', sep='\t')
            for i, symbol in enumerate(df['Gene_name'].tolist()):
                ref_dict[symbol] = df['Gene_ID'].tolist()[i]
        
        elif species == 7955:
            df = pd.read_csv(gene_ref_dir + '/zebra_fish_unique_id_length.tsv', sep='\t')
            for i, symbol in enumerate(df['Gene_name'].tolist()):
                ref_dict[symbol] = df['Gene_ID'].tolist()[i]
        IDs = []
        for i, name in enumerate(gene_symbol):
            try:
                IDs.append(ref_dict[name])
            except:
                IDs.append('notAvailable')
        return IDs
        
    def transform_geneID(self):
        
        self.making_dir()
        gene_anno = pd.read_csv(self.gene_anno_dir, sep = '\t')
        genes = pd.DataFrame(gene_anno.iloc[:,:2])
        genes.replace('notAvailable' or 'NA','',inplace = True)
        genes['filtered'] = False
        with open(self.exp_raw_dir, 'r') as file:
            x = file.readline()[:-1]
        gene_raw = x.split('\t')[1:]
        with open(self.exp_normalized_dir, 'r') as file:
            y = file.readline()[:-1]
        gene_norm = y.split('\t')[2:]
        if gene_norm[0] == 'geneSymbol1':
            pass
        elif gene_raw[0] == 'geneSymbol1':
            genes['filtered'] = True
        elif len(gene_raw) != genes.shape[0]:
            filtered = [x in gene_norm for x in gene_raw]
            genes = pd.DataFrame()
            genes['geneSymbol'] = gene_raw
            genes['ensemblID'] = self.generate_ensemblID(gene_raw)
            genes['filered'] = filtered
        elif len(gene_raw) == genes.shape[0]:
            genes['filtered'] = True
        
        genes.to_csv(self.geneID_dir ,sep = '\t',index = False)
        
        return_message = 'successfully generated geneID.tsv'
        return return_message
        
        
    def transform_ontology(self):
        
        self.making_dir()
        cell_anno = pd.read_csv(self.cell_anno_dir ,sep='\t')
        onto = cell_anno[['clusterID','clusterName','cellOntologyName','cellOntologyID']]
        ontology = onto.drop_duplicates()
        ontology.replace('notAvailable','',inplace = True)
        ontology = ontology.fillna('')
        if not set(ontology['cellOntologyName']) == {''}:
            ontology.to_csv(self.ontology_dir, sep = '\t',index = False)
        else:
            pass
        
        return_message = 'successfully generated Ontology_mapping.tsv'
        return return_message
    

In [8]:
for i in range(1,33):
    p = 'part_' + str(i)
    myformatcomplier = FormatCompiler(dataset_name='No_17', part = p)
    myformatcomplier.transform_marker_genes()

In [11]:
myformatcomplier = FormatCompiler(dataset_name='No_5', part = 'part_2')
myformatcomplier.transform_metadata()

'successfully generated Metadata.tsv'

In [41]:
myformatcomplier.dataset_name

'No_33/part_1'

In [4]:
cell_anno = pd.read_csv(myformatcomplier.cell_anno_dir ,sep='\t')
df_cell = cell_anno.drop(['cellOntologyName','cellOntologyID','FACSMarker'], axis=1)
df_cell.replace('notAvailable','',inplace = True)
df_cell['filtered'] = False
df_cell.to_csv(myformatcomplier.phenotype_dir ,sep = '\t',index = False)

In [7]:
import pandas as pd
import os
path = '/home/biodb/data/dataset_collection/datasets/3_standard_dataset/'
for i in [43]:
    dataset_name = 'No_' + str(i)
    path1 = path + dataset_name
    for j in os.listdir(path1):
        try:
            myformatcomplier = FormatCompiler(dataset_name, part = j, input_dir = path)
            myformatcomplier.transform_format()
        except:
            print([i,j])

In [142]:
import pandas as pd
import os
import json
path = '/home/biodb/data/dataset_collection/datasets/3_standard_dataset/'
for i in range(46,47):
    path1 = path + 'No_' + str(i)
    for j in os.listdir(path1):
        path2 = path1 + '/' + str(j) + '/processed_data/cellAnnotation.tsv'
        cell = pd.read_csv(path2,sep='\t')
        delete = ['libraryPreparationMethod','sequencingPlatform','PCA1','PCA2']
        for i in delete:
            if i in cell.columns.tolist():
                cell = cell.drop(columns=[i])
        cell.to_csv(path2, sep='\t', index=False)

In [31]:
import pandas as pd
import os
import json
path = '/home/biodb/data/dataset_collection/datasets/3_standard_dataset/'
for i in range(33,43):
    path1 = path + 'No_' + str(i)
    for j in os.listdir(path1):
        path2 = path1 + '/' + str(j) + '/processed_data/unstructuredData.json'
        with open(path2,'r') as file:
            uns = json.load(file)
            meta = uns['metadata']
        if 'description' in meta.keys():
            print([i,j])
        else:
            meta['description'] = ''
            uns['metadata'] = meta
            with open(path2,'w') as file:
                json.dump(uns,file)

[33, 'part_1']
[34, 'part_1']
[35, 'part_1']
[36, 'part_1']
[37, 'part_1']
[38, 'part_2']
[38, 'part_1']
[39, 'part_1']
[40, 'part_1']
[40, 'part_2']
[41, 'part_1']
[41, 'part_2']
[41, 'part_3']
[42, 'part_2']
[42, 'part_1']
[42, 'part_3']
[42, 'part_4']


In [7]:
onto = pd.DataFrame()
dirs = '/home/biodb/data/dataset_collection/datasets/4_genentech_format/ready/'
for i in range(1,69):
    k = 'No_' + str(i)
    paths = dirs + k + '/'
    for j in os.listdir(paths):
        try:
            path = paths + '/' + j
            meta = pd.read_csv(path + '/metadata.tsv', sep = '\t')
            meta['datasetID'] = k + '_' + j
            meta.to_csv(path + '/metadata.tsv', sep = '\t', index=False)
        except:
            print([k,j,'meta'])
        try:
            ot = pd.read_csv(path + '/ontologyMapping.tsv', sep = '\t')
            ot.insert(0,'datasetID',k)
            ot.drop(columns=['clusterID'],inplace=True)
            onto = onto.append(ot)
        except:
            print([k,j,'onto'])
onto = onto.fillna('')
onto = onto.drop_duplicates()
onto.to_csv('/home/biodb/data/dataset_collection/datasets/4_genentech_format/ontologyMapping_collection.tsv',sep='\t',index=False)

['No_5', '.ipynb_checkpoints', 'meta']
['No_5', '.ipynb_checkpoints', 'onto']
['No_6', 'part_1', 'onto']
['No_12', '.ipynb_checkpoints', 'meta']
['No_12', '.ipynb_checkpoints', 'onto']
['No_12', 'part_1', 'onto']
['No_14', 'part_2', 'onto']
['No_14', 'part_1', 'onto']
['No_15', 'part_3', 'onto']
['No_15', 'part_2', 'onto']
['No_15', 'part_1', 'onto']
['No_20', 'part_1', 'onto']
['No_21', 'part_1', 'onto']
['No_22', 'part_1', 'onto']
['No_23', 'part_1', 'onto']
['No_28', 'part_1', 'onto']
['No_32', 'part_2', 'onto']
['No_34', 'part_1', 'onto']
['No_36', 'part_1', 'onto']
['No_39', '.ipynb_checkpoints', 'meta']
['No_39', '.ipynb_checkpoints', 'onto']
['No_45', 'part_3', 'onto']
['No_45', 'part_4', 'onto']
['No_45', 'part_2', 'onto']
['No_47', 'part_1', 'onto']
['No_47', 'part_2', 'onto']
['No_47', 'part_4', 'onto']
['No_47', 'part_3', 'onto']
['No_48', 'part_3', 'onto']
['No_49', '.ipynb_checkpoints', 'meta']
['No_49', '.ipynb_checkpoints', 'onto']
['No_49', 'part_1', 'onto']
['No_50', '

In [27]:
dirs = '/home/biodb/data/dataset_collection/datasets/4_genentech_format/ready/'
dataset_list = pd.DataFrame()
for i in range(27,43):
    k = 'No_' + str(i)
    paths = dirs + k 
    for j in os.listdir(paths):
        try:
            path = paths + '/' + j
            meta = pd.read_csv(path + '/metadata.tsv', sep = '\t')
            meta1 = pd.DataFrame()
            meta1['datasetID'] = k
            meta1['accessionNumber'] = meta['accessionNumber']
            meta1['title'] = meta['title']
            dataset_list = dataset_list.append(meta1)
            print([k,meta['accessionNumber']])
        except:
            print([k,j])
#dataset_list = dataset_list.drop_duplicates()
#dataset_list.to_csv('/home/biodb/data/dataset_collection/datasets/4_genentech_format/datasets_list.tsv',sep='\t',index=False)

['No_27', 0    GSE81608
Name: accessionNumber, dtype: object]
['No_28', 0    GSE86618
Name: accessionNumber, dtype: object]
['No_29', 0    GSE84465
Name: accessionNumber, dtype: object]
['No_30', 0    GSE99795
Name: accessionNumber, dtype: object]
['No_31', 0    GSE89232
Name: accessionNumber, dtype: object]
['No_32', 0    GSE120506
Name: accessionNumber, dtype: object]
['No_32', 0    GSE112013
Name: accessionNumber, dtype: object]
['No_33', 0    GSE106510
Name: accessionNumber, dtype: object]
['No_33', 0    GSE106510
Name: accessionNumber, dtype: object]
['No_33', 0    GSE106510
Name: accessionNumber, dtype: object]
['No_33', 0    GSE106510
Name: accessionNumber, dtype: object]
['No_33', 0    GSE106510
Name: accessionNumber, dtype: object]
['No_33', 0    GSE106510
Name: accessionNumber, dtype: object]
['No_33', 0    GSE106510
Name: accessionNumber, dtype: object]
['No_33', 0    GSE106510
Name: accessionNumber, dtype: object]
['No_33', 0    GSE106510
Name: accessionNumber, dtype: objec

In [24]:
dataset_list

'No_42'