In [1]:
import numpy as np 
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

from src.chem_models import ChembertaWrapper
from src.tab_models import SimpleMLP
from src.Dataset import FullAlignedTabChemDataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Compute device identifier for this notebook; pinned to the CPU.
device = "cpu"

## Build data 

In [3]:
# Modalities handed to FullAlignedTabChemDataset for both splits.
modalities = ['tab', 'chem']

# A fixed seed on a dedicated generator makes the synthetic features and the
# shuffled batch order identical on every "Restart & Run All", without
# touching the global torch RNG that later cells may rely on.
SEED = 0
data_rng = torch.Generator().manual_seed(SEED)

# Synthetic tabular features: 5 training rows and 3 validation rows, 10 columns each.
tabular_data_train = torch.randn(5, 10, generator=data_rng)
tabular_data_val = torch.randn(3, 10, generator=data_rng)

# SMILES strings for each split.
# NOTE(review): presumably aligned row-for-row with the tabular tensors and the
# targets (the dataset class name suggests so) -- confirm against src.Dataset.
smiles_data_train = [
    'CC(=O)OCC[N+](C)(C)C',
    'CC(C[N+](C)(C)C)OC(=O)C',
    'O=C1CCCN1CC#CC[N+](C)(C)C',
    'NC(=O)OCC[N+](C)(C)C',
    'CC(C[N+](C)(C)C)OC(=O)N',
]
smiles_data_val = [
    'COC(=O)C1=CCCN(C1)C',
    'CON=CC1=CCCN(C1)C',
    'CCC1C(=O)OCC1Cc1cncn1C',
]

# Binary toy labels, one per sample.
target_train = np.array([0, 1, 1, 0, 0])
target_val = np.array([0, 1, 0])

# Fail fast if the inputs of a split drift out of sync while editing the toy data.
assert len(tabular_data_train) == len(smiles_data_train) == len(target_train), \
    "train split: tabular / SMILES / target lengths differ"
assert len(tabular_data_val) == len(smiles_data_val) == len(target_val), \
    "val split: tabular / SMILES / target lengths differ"

train_ds = FullAlignedTabChemDataset(tabular_data_train, smiles_data_train, target_train, modalities)
val_ds = FullAlignedTabChemDataset(tabular_data_val, smiles_data_val, target_val, modalities)

# Only the training loader shuffles; passing the seeded generator keeps the
# shuffle order reproducible. Validation order is kept fixed.
train_dl = DataLoader(train_ds, batch_size=2, shuffle=True, generator=data_rng)
val_dl = DataLoader(val_ds, batch_size=2, shuffle=False)

## Build model

In [4]:
# Keyword arguments forwarded verbatim to SimpleMLP (project class from src.tab_models).
# NOTE(review): num_classes is 4 although the toy targets in this notebook are
# binary -- presumably this is the size of the branch output rather than the
# label count; confirm against SimpleMLP.
tab_model_params = dict(
    input_size=10,
    hidden_layers=[20, 12],
    num_classes=4,
    dropout_rates=[0.2, 0.2],
)

# Tabular branch of the model.
tab_model = SimpleMLP(**tab_model_params)

: 

In [5]:
# Single checkpoint identifier shared by the weights and the tokenizer so the
# two cannot drift apart.
chemberta_checkpoint = 'DeepChem/ChemBERTa-77M-MTR'

# Load the pretrained network through the masked-LM auto class, then its tokenizer.
chemberta = AutoModelForMaskedLM.from_pretrained(chemberta_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(chemberta_checkpoint)

# Chemistry branch: the project wrapper receives both the network and its tokenizer.
chem_model = ChembertaWrapper(chemberta, tokenizer)