In [40]:
import os
import pandas as pd
import numpy as np
import json
import sys
import importlib.util
sys.path.append('/home/biodb/data/abio_database_pipeline/')
from pipeline.datasets_curation import datasetBuilder

In [54]:
class FormatCompiler():
    """Export one part of a standard dataset as a set of flat TSV files.

    Inputs are read from ``<input_dir>/<input_number>/<part>/processed_data``
    and outputs are written to ``<output_dir>/<output_number>/<part>``.

    ``transform_format()`` produces every output; the individual
    ``transform_*`` methods can also be called on their own. Apart from
    ``transform_format()`` they do not create the output directory, so call
    ``making_dir()`` first when using them directly.

    Args:
        input_number: Dataset folder name below ``input_dir`` (e.g. ``"No_3"``).
        part: Sub-dataset folder name (e.g. ``"part_2"``).
        output_number: Dataset folder name below ``output_dir``.
        input_dir: Root directory of the standard datasets.
        output_dir: Root directory the converted files are written under.

    Raises:
        TypeError: If ``input_number``, ``part`` or ``output_number`` is not a
            string (the ``None`` defaults are placeholders, not usable values).
    """

    # Column order of metadata.tsv. Every key must be present in the
    # 'metadata' section of unstructuredData.json.
    _METADATA_FIELDS = [
        'datasetID', 'title', 'accessionNumber', 'abstract', 'source', 'sourceID',
        'numberOfCells', 'libraryPreparationMethod', 'sequencingPlatform', 'pubmedID',
        'clusteringMethod', 'biomarkerDerivationMethod', 'fastqURL', 'genomeBuild',
        'annotation', 'subDataset', 'description', 'tissue',
    ]

    # transform_phenotype only parses the raw-count matrix when the file is
    # larger than this many bytes.
    # NOTE(review): presumably smaller files are empty templates - confirm.
    _RAW_MATRIX_MIN_BYTES = 100

    def __init__(self, input_number = None, part = None, output_number = None,
                 input_dir = '/home/biodb/data/dataset_collection/datasets/3_standard_dataset',
                 output_dir = '/home/biodb/data/dataset_collection/datasets/5_Pfizer_format/R2'):

        # Fail with a readable message instead of an opaque str + None error.
        for arg_name, arg_value in (('input_number', input_number),
                                    ('part', part),
                                    ('output_number', output_number)):
            if not isinstance(arg_value, str):
                raise TypeError("FormatCompiler: '" + arg_name + "' must be a string, got "
                                + type(arg_value).__name__)

        self.input_number = input_number
        self.output_number = output_number
        self.part = part
        self.input_dir = input_dir + '/' + input_number + '/' + part
        self.output_dir = output_dir + '/' + output_number + '/' + part

        # True/False once known; None until the cell annotation has been inspected.
        self.cluster = None

        processed_dir = self.input_dir + '/processed_data'
        self.unstructured_data_dir = processed_dir + '/unstructuredData.json'
        self.cell_anno_dir = processed_dir + '/cellAnnotation.tsv'
        self.gene_anno_dir = processed_dir + '/geneAnnotation.tsv'
        self.exp_raw_dir = processed_dir + '/expressionMatrix_rawCounts.tsv'
        self.exp_tpm_dir = processed_dir + '/expressionMatrix_TPM.tsv'
        self.exp_normalized_dir = processed_dir + '/expressionMatrix_normalized.tsv'
        self.diff_genes_dir = processed_dir + '/Diff_genes.json'

        self.metadata_dir = self.output_dir + '/metadata.tsv'
        self.phenotype_dir = self.output_dir + '/phenotype.tsv'
        self.geneID_dir = self.output_dir + '/geneID.tsv'
        self.cellID_dir = self.output_dir + '/cellID.tsv'
        self.marker_genes_dir = self.output_dir + '/markerGenes.tsv'
        self.ontology_dir = self.output_dir + '/ontologyMapping.tsv'

    def _has_clusters(self):
        """Return whether the dataset carries cluster assignments.

        The flag is False when cellAnnotation.tsv has a 'clusteringMethod'
        column and True otherwise. It is computed from the file header on
        first use, so the transform_* methods no longer depend on
        transform_phenotype having run before them.
        """
        if getattr(self, 'cluster', None) is None:
            header = pd.read_csv(self.cell_anno_dir, sep='\t', nrows=0)
            self.cluster = 'clusteringMethod' not in header.columns
        return self.cluster

    @staticmethod
    def _read_header_fields(path):
        """Return the tab-separated fields of the first line of ``path``."""
        with open(path, 'r') as handle:
            # rstrip instead of [:-1]: a header without a trailing newline
            # must not lose its last character.
            return handle.readline().rstrip('\n').split('\t')

    @staticmethod
    def _read_first_column(path):
        """Return the first field of every non-blank line after the header."""
        with open(path, 'r') as handle:
            handle.readline()  # skip the header row
            return [line.split('\t', 1)[0].rstrip('\n') for line in handle if line.strip()]

    def making_dir(self):
        """Create the output directory, including parents, if it is missing.

        Raises:
            OSError: If the directory cannot be created.
        """
        os.makedirs(self.output_dir, exist_ok=True)

    def transform_format(self):
        """Create the output directory and write every output file.

        Returns:
            The string 'successfully'.
        """
        self.making_dir()
        self.transform_phenotype()
        self.transform_marker_genes()
        self.transform_ontology()
        self.transform_geneID()
        self.transform_metadata()

        return_message = 'successfully'
        return return_message

    def transform_metadata(self):
        """Write metadata.tsv: a single row built from unstructuredData.json.

        'datasetID' is replaced by '<output_number>_<part>'. A
        'normalizationMethod' column is added from the first tab-separated
        field of the second line of the normalized expression matrix.
        'notAvailable', 'NA' and missing values are written as empty strings.

        Returns:
            The string 'successfully generated Metadata.tsv'.

        Raises:
            KeyError: If a required metadata field is missing from the JSON.
        """
        with open(self.unstructured_data_dir, 'r') as json_file:
            meta = json.load(json_file)['metadata']

        with open(self.exp_normalized_dir, 'r') as file:
            file.readline()  # header row
            normalized_method = file.readline().split('\t')[0]

        metadata = dict()
        for field in self._METADATA_FIELDS:
            metadata[field] = meta[field]
        metadata['datasetID'] = self.output_number + '_' + self.part
        metadata['normalizationMethod'] = normalized_method

        # Series -> single-row frame keeps list-valued fields in one cell.
        metadata = pd.DataFrame(pd.Series(metadata)).T
        metadata = metadata.replace('notAvailable', '')
        metadata = metadata.replace('NA', '')
        metadata = metadata.fillna('')
        metadata.to_csv(self.metadata_dir, sep = '\t', index = False)

        return_message = 'successfully generated Metadata.tsv'
        return return_message

    def transform_phenotype(self):
        """Write phenotype.tsv from cellAnnotation.tsv.

        Drops the ontology/FACS columns and, when present, the scibet columns.
        When the annotation has a 'clusteringMethod' column it is dropped and
        empty 'clusterID'/'clusterName' columns are written instead.

        The 'filtered' column is filled as follows:
          * raw matrix not parsed (file too small): True for every cell;
          * raw matrix has a different number of cells than the annotation:
            one row per raw cell, True for annotated cells, False otherwise;
          * same number of cells: False for every cell.
        NOTE(review): transform_geneID writes True when the counts match,
        while this method writes False - confirm that is intended.

        Returns:
            The string 'successfully generated phenotype.tsv'.
        """
        df_cell = pd.read_csv(self.cell_anno_dir, sep='\t')
        if 'clusteringMethod' in df_cell.columns:
            self.cluster = False
            df_cell = df_cell.drop(columns=['clusteringMethod'])
            df_cell['clusterID'] = ""
            df_cell['clusterName'] = ""
        else:
            self.cluster = True

        # Optional columns: ignore them when absent instead of a bare except.
        df_cell = df_cell.drop(columns=['clusterName_scibet', 'meta_scibetHCL'], errors='ignore')
        df_cell = df_cell.drop(columns=['cellOntologyName', 'cellOntologyID', 'FACSMarker'])

        df_cell['filtered'] = True
        if os.path.getsize(self.exp_raw_dir) > self._RAW_MATRIX_MIN_BYTES:
            cell_raw = self._read_first_column(self.exp_raw_dir)

            # Compare against the annotation that was just loaded (the
            # original referenced an undefined name here).
            if len(cell_raw) != df_cell.shape[0]:
                raw_cells = pd.DataFrame({'cellID': cell_raw})
                df_cell = raw_cells.merge(df_cell, how = 'left', on = 'cellID', sort = False)
                # 'filtered' was True on every annotated row, so rows that
                # only exist in the raw matrix are exactly the missing ones.
                df_cell['filtered'] = df_cell['filtered'].notna()
            else:
                df_cell['filtered'] = False

        df_cell = df_cell.replace('notAvailable', '')
        df_cell = df_cell.fillna('')
        df_cell.to_csv(self.phenotype_dir, sep = '\t', index = False)
        return_message = 'successfully generated phenotype.tsv'
        return return_message

    def transform_marker_genes(self):
        """Write markerGenes.tsv from the 'wilcoxon' section of Diff_genes.json.

        For every cluster, the rows with logFC > 0 are kept, truncated to the
        first 100, and the 'logFC'/'qValue' columns are dropped. An
        'ensemblID' column (from DatasetBuilder.calculate_ensemblID) is
        inserted at position 1, then 'statisticsType' and 'clusterName' are
        appended.

        Returns:
            'successfully generated Marker_genes.tsv', or None when the
            dataset has no clusters (nothing is written in that case).
        """
        if not self._has_clusters():
            return None

        my_builder = datasetBuilder.DatasetBuilder(starting_dir=self.input_dir)
        with open(self.diff_genes_dir, 'r') as json_file:
            markers = json.load(json_file)['wilcoxon']

        per_cluster = []
        for cluster_name in markers:
            table = pd.DataFrame(markers[cluster_name])
            table = table[table['logFC'] > 0].head(100)
            table = table.drop(columns=['logFC', 'qValue'])
            table.insert(1, "ensemblID",
                         my_builder.calculate_ensemblID(gene=table['geneSymbol'].tolist()))
            table['statisticsType'] = 'wilcoxon'
            table['clusterName'] = cluster_name
            per_cluster.append(table)

        # One concat instead of DataFrame.append in a loop (removed in
        # pandas 2.0, quadratic before that).
        if per_cluster:
            marker_genes = pd.concat(per_cluster, ignore_index = True)
        else:
            marker_genes = pd.DataFrame()

        marker_genes = marker_genes.replace('notAvailable', '')
        marker_genes.to_csv(self.marker_genes_dir, sep = '\t', index = False)

        return_message = 'successfully generated Marker_genes.tsv'
        return return_message

    def transform_geneID(self):
        """Write geneID.tsv from geneAnnotation.tsv and the matrix headers.

        Starts from the first two columns of geneAnnotation.tsv with
        'filtered' = False, then:
          * normalized header's first gene is 'geneSymbol1': left unchanged;
          * raw header's first gene is 'geneSymbol1': 'filtered' = True;
          * raw gene count differs from the annotation: the table is rebuilt
            from the raw header ('geneSymbol', 'ensemblID', 'filtered'), with
            'filtered' True for genes also in the normalized header;
          * otherwise: 'filtered' = True.
        Gene names are taken from the header after skipping one leading
        column (raw matrix) or two (normalized matrix).

        Returns:
            The string 'successfully generated geneID.tsv'.
        """
        my_builder = datasetBuilder.DatasetBuilder(starting_dir=self.input_dir)
        gene_anno = pd.read_csv(self.gene_anno_dir, sep = '\t')
        genes = gene_anno.iloc[:, :2].copy()
        # The original `'notAvailable' or 'NA'` only ever replaced 'notAvailable'.
        genes = genes.replace(['notAvailable', 'NA'], '')
        genes['filtered'] = False

        gene_raw = self._read_header_fields(self.exp_raw_dir)[1:]
        gene_norm = self._read_header_fields(self.exp_normalized_dir)[2:]

        if gene_norm[0] == 'geneSymbol1':
            pass
        elif gene_raw[0] == 'geneSymbol1':
            genes['filtered'] = True
        elif len(gene_raw) != genes.shape[0]:
            normalized_genes = set(gene_norm)  # O(1) membership tests
            genes = pd.DataFrame()
            genes['geneSymbol'] = gene_raw
            genes['ensemblID'] = my_builder.calculate_ensemblID(gene=gene_raw)
            # Column was misspelled 'filered' before, so this branch wrote a
            # different header than every other branch.
            genes['filtered'] = [gene in normalized_genes for gene in gene_raw]
        else:
            genes['filtered'] = True

        genes.to_csv(self.geneID_dir, sep = '\t', index = False)

        return_message = 'successfully generated geneID.tsv'
        return return_message

    def transform_ontology(self):
        """Write ontologyMapping.tsv: unique cluster / cell-ontology rows.

        When the dataset has no clusters, 'clusterID' and 'clusterName' are
        written as empty strings. The file is skipped when every
        'cellOntologyName' is empty after replacing 'notAvailable' and
        missing values with ''.

        Returns:
            The string 'successfully generated Ontology_mapping.tsv' (also
            when the file was skipped).
        """
        cell_anno = pd.read_csv(self.cell_anno_dir, sep='\t')
        if not self._has_clusters():
            cell_anno['clusterID'] = ""
            cell_anno['clusterName'] = ""

        ontology = (
            cell_anno[['clusterID', 'clusterName', 'cellOntologyName', 'cellOntologyID']]
            .drop_duplicates()
            .replace('notAvailable', '')
            .fillna('')
        )
        if set(ontology['cellOntologyName']) != {''}:
            ontology.to_csv(self.ontology_dir, sep = '\t', index = False)

        return_message = 'successfully generated Ontology_mapping.tsv'
        return return_message
    

In [55]:
# Point the compiler at dataset No_3 / part_2 and redirect its output to the
# genentech-format tree, then write only metadata.tsv.
# transform_metadata() does not create the output directory itself.
genentech_format = FormatCompiler(
    input_number="No_3",
    part="part_2",
    output_number="No_3",
    output_dir='/home/biodb/data/dataset_collection/datasets/4_genentech_format',
)
genentech_format.transform_metadata()

'successfully generated Metadata.tsv'

In [69]:
# Load metadata.tsv for No_3 / part_2 from the genentech-format tree to
# inspect how its values look after a TSV round-trip.
x = pd.read_csv(
    '/home/biodb/data/dataset_collection/datasets/4_genentech_format/No_3/part_2/metadata.tsv',
    sep='\t',
)

In [74]:
# Keep just the 'tissue' value of the first (only) metadata row.
y = {'tissue': x['tissue'][0]}

In [77]:
from ast import literal_eval
# The 'tissue' value read back from the TSV is a string; literal_eval parses
# it into the Python literal it spells (a list of tissue names in the output
# shown below).
literal_eval(y['tissue'])

['head', 'neck']

In [19]:
import json

# Load the 'wilcoxon' section of Diff_genes.json for dataset No_10 / part_1
# and list its keys (one per cluster).
path = '/home/biodb/data/dataset_collection/datasets/3_standard_dataset/No_10/part_1/processed_data/Diff_genes.json'
with open(path) as json_file:
    markers = json.load(json_file)['wilcoxon']
cluster = list(markers)

In [22]:
import pandas as pd
# Marker records of the first cluster as a DataFrame, for interactive inspection.
z = pd.DataFrame(markers[cluster[0]])

In [30]:
# Keep rows with a positive log fold change, then at most the first 100 of them.
logfc = z['logFC'].tolist()
k = list(map(lambda value: value > 0, logfc))
z = z.loc[k].head(100)

In [37]:
# Build one marker table across all clusters: per cluster keep the rows with
# logFC > 0 (at most the first 100), drop the statistic columns and tag each
# row with its statistics type and cluster name.
# The ensemblID column that FormatCompiler.transform_marker_genes inserts via
# DatasetBuilder.calculate_ensemblID is left out of this scratch version.
marker_frames = []
for cluster_name in cluster:
    z = pd.DataFrame(markers[cluster_name])
    z = z[z['logFC'] > 0].head(100)
    z = z.drop(columns=['logFC', 'qValue'])
    z['statisticsType'] = 'wilcoxon'
    z['clusterName'] = cluster_name
    marker_frames.append(z)

# Concatenate once: DataFrame.append was removed in pandas 2.0 and copied the
# whole accumulated frame on every iteration before that.
if marker_frames:
    marker_genes = pd.concat(marker_frames, ignore_index=True)
else:
    marker_genes = pd.DataFrame()

15

In [16]:
# Collect the first tab-separated field of every line after the header of the
# raw-count matrix for dataset No_10 / part_1.
cell_raw = []
path = '/home/biodb/data/dataset_collection/datasets/3_standard_dataset/No_10/part_1/processed_data/expressionMatrix_rawCounts.tsv'
with open(path) as file:
    x = file.readline()  # header row, not collected
    x = file.readline()
    while x != '':       # readline() returns '' only at end of file
        cell_raw.append(x.split('\t')[0])
        x = file.readline()

In [14]:
# Keep the second line of the file (the first line after the header) in `x`.
with open(path) as file:
    file.readline()
    x = file.readline()

In [15]:
# First tab-separated field of that line.
x.split('\t', 1)[0]

'B1_AAACATTGTTTGGG_Enterocyte.Immature.Distal'

In [18]:
# Number of entries collected from the raw-count matrix (header line excluded).
len(cell_raw)

7216