In [7]:
import sys
sys.path.append('/code')

from database.models import (Protein, Organism, Classification, Molecule, Activity, ActivityType, Source, Quality, CID)

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session
from sqlalchemy.orm import sessionmaker
import os
import glob
from rdkit import Chem
from razi.rdkit_postgresql.functions import morganbv_fp

import pandas as pd
# import modin.pandas as pd
import numpy as np
import json

import multiprocessing
import gc
from tqdm import tqdm
from copy import copy
import numpy as np


def get_db_session():
    engine = create_engine(
        os.environ.get('SQLALCHEMY_URL'), convert_unicode=True,
        pool_recycle=3600, pool_size=10)
    db_session = scoped_session(sessionmaker(
        autocommit=False, autoflush=False, bind=engine))
    
    return db_session


def get_or_create(session, model, **kwargs):
    instance = session.query(model).filter_by(**kwargs).first()
    if instance:
        return instance
    else:
        instance = model(**kwargs)
        session.add(instance)
        session.flush()
        session.refresh(instance)
        return instance
    
def get_or_instance(session, model, **kwargs):
    instance = session.query(model).filter_by(**kwargs).first()
    if instance:
        return (False, instance)
    else:
        instance = model(**kwargs)
        return (True, instance)
    
def sanitize_and_split(row, length, spl=';'):
    split = [v.rstrip() for v in str(row).split(spl)]
    if len(split)!= length:
        split = [split[0] for i in range(0,length)]
    
    split = [None if x == '' else x for x in split]
    
    return split


class TypeDecoder(json.JSONDecoder):
    """Custom json decoder to support types as values."""

    def __init__(self, *args, **kwargs):
        """Simple json decoder handling types as values."""
        json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)

    def object_hook(self, obj):
        """Handle types."""
        if '__type__' not in obj:
            return obj
        module = obj['__type__']['module']
        type_ = obj['__type__']['type']
        if module == 'builtins':
            return getattr(__builtins__, type_)
        loaded_module = importlib.import_module(module)
        return getattr(loaded_module, type_)
    


dtype_file = '../.data/papyrus/05.5/data_types.json'
activity_data = '../.data/papyrus/05.5/05.5_combined_set_without_stereochemistry.tsv.xz'
protein_data = '../.data/papyrus/05.5/05.5_combined_set_protein_targets.tsv.xz'


In [2]:
with open(dtype_file, 'r') as jsonfile:
        dtypes = json.load(jsonfile, cls=TypeDecoder)['papyrus']

In [None]:
protein_df = pd.read_csv(protein_data, sep='\t', dtype=dtypes)

In [None]:
organisms = list(set(protein_df['Organism']))
classifications = []
for cstr in protein_df['Classification']:
    classifications.extend(str(cstr).split('->'))

classes = list(set(classifications))

In [None]:
db_session = get_db_session()

rows = []

for i, row in protein_df.iterrows():
    organism = get_or_create(session=db_session, model=Organism, organism=row['Organism'])
    classifications_list = str(row['Classification']).split('->')
    classifications = [get_or_create(session=db_session, model=Classification, classification=c) for c in classifications_list]
    
    review_mapping = {'reviewed':1, 'Unreviewed':0, 'unreviewed':0}
    
    prot = Protein(
        target_id = row['target_id'],
        HGNC_symbol = str(row['HGNC_symbol']),
        uniprot_id = row['UniProtID'],
        reviewed = review_mapping[row['Status']],
        organism = organism.id,
        length = row['Length'],
        sequence = row['Sequence'], 
        classifications = classifications
    )
    
    rows.append(prot)
    
db_session.add_all(rows)
db_session.commit()
db_session.remove()
    

In [10]:
def process_activity_frame(df):
#     df_obj = df.select_dtypes(['object'])
    db_session = get_db_session()
    rows = []
    
    activity_type_map = {
        '1000':'IC50',
        '0100':'EC50',
        '0010':'KD',
        '0001':'Ki',
        '0000':'other',
    }

    print('processing frame')
    
    # do this bit in parallel?
    for row in tqdm(df.itertuples()):
        sources_list = row.source.split(';')
        cids_list = row.CID.split(';')
        sources_cids_list = list(zip(sources_list, cids_list))
        cids = [      
            get_or_create(session=db_session, 
                          model=CID, 
                          cid=c[1], 
                          source=get_or_create(session=db_session, 
                                               model=Source, 
                                               source=c[0]).source) for c in sources_cids_list]
        
        rows.extend(cids)

        mol = Chem.MolFromSmiles(row.SMILES)
        fp = morganbv_fp(row.SMILES)

        # change this to use InChI and/or SMILES
        smiles = Chem.CanonSmiles(row.SMILES)
        created, molecule = get_or_instance(session=db_session,model=Molecule,smiles=smiles,inchi=row.InChI)

        for cid in cids:
            if cid not in molecule.cids:
                molecule.cids.append(cid)
        
        if created:
            molecule.smiles=smiles
            molecule.mol=mol
            molecule.inchi_key=row.InChIKey
            molecule.inchi=row.InChI
            molecule.inchi_auxinfo=row.InChI_AuxInfo
            molecule.fp=fp
            molecule.connectivity=row.connectivity
            db_session.add(molecule)
            db_session.flush()
            db_session.refresh(molecule)
            rows.append(molecule)

        qc,quality = get_or_instance(session=db_session, model=Quality, quality=row.Quality)
        if qc: rows.append(quality)
        qid = quality.id
        tc,target_id = get_or_instance(session=db_session, model=Protein, target_id=row.target_id)
        if tc: rows.append(target_id)
        tid = target_id.target_id
        molecule_id = molecule.id

        slice_list = []
        if ';' in str(row.pchembl_value):

            pchembl_values = [v.rstrip() for v in row.pchembl_value.split(';')]
            length = len(pchembl_values)

            aids = sanitize_and_split(row=row.AID,length=length)        
            doc_ids = sanitize_and_split(row=row.all_doc_ids,length=length)
            years = sanitize_and_split(row=row.all_years,length=length)
            type_IC50s = sanitize_and_split(row=row.type_IC50,length=length)         
            type_EC50s = sanitize_and_split(row=row.type_EC50,length=length)
            type_KDs = sanitize_and_split(row=row.type_KD,length=length)
            type_Kis = sanitize_and_split(row=row.type_Ki,length=length)

            for j in range(0, len(pchembl_values)):
                update_dict = {
                    'pchembl_value': pchembl_values[j],
                    'AID': aids[j],
                    'doc_id': doc_ids[j],
                    'Year': years[j],
                    'type_IC50': type_IC50s[j],
                    'type_EC50': type_EC50s[j],
                    'type_KD': type_KDs[j],
                    'type_Ki': type_Kis[j]
                }
                row_copy = copy(row._asdict())

                row_copy.update(update_dict)

                slice_list.append(row_copy)

        else:
            slice_list.append(row._asdict())

        for s in slice_list:

            a = f"{s['type_IC50']}{s['type_EC50']}{s['type_KD']}{s['type_Ki']}"
            activity_type_str = activity_type_map[a]

            activity_type = get_or_create(session=db_session, model=ActivityType, type=activity_type_str).id

            try:
                y = int(s['Year'])
            except:
                y = None
                
            if str(s['doc_id']) in ['nan', 'NaN']:
                doc_id = None
            else:
                doc_id = str(s['doc_id'])

            activity = Activity(
                papyrus_activity_id=s['Activity_ID'],
                quality=qid,
                target_id=tid,
                molecule_id = molecule_id,
                accession=s['accession'],
                protein_type=s['Protein_Type'],
                aid = s['AID'],
                doc_id = doc_id,
                year = y,
                type = activity_type, 
                relation = s['relation'],
                pchembl_value = s['pchembl_value'],
                pchembl_value_mean = s['pchembl_value_Mean'],
                pchembl_value_stdev = s['pchembl_value_StdDev'],
                pchembl_value_SEM = s['pchembl_value_SEM'],
                pchembl_value_n = s['pchembl_value_N'],
                pchembl_value_median = s['pchembl_value_Median'],
                pchembl_value_mad = s['pchembl_value_MAD'],   
            )

            rows.append(activity)
            
    print('processing complete')
            
    del(df)
    db_session.add_all(rows)
    print('committing data')
    db_session.commit()
    db_session.close()
    db_session.remove()
    gc.collect()
            
    return True


In [12]:
reader = pd.read_csv(activity_data, sep='\t', compression='xz', chunksize = 1000000, iterator=True, dtype=dtypes)

for (i,df) in enumerate(reader):
    # process each data frame
    print(f'processing chunk {i}')
    process_activity_frame(df)
    gc.collect()


  engine = create_engine(


processing chunk 0
processing frame


529165it [7:49:40, 18.78it/s]


KeyboardInterrupt: 

In [None]:
import dask.dataframe as dd

ddata = dd.from_pandas(df)

In [None]:
url = os.environ.get('SQLALCHEMY_URL')

In [None]:
url