In [8]:
import pandas as pd
import polars as pl
import numpy as np

from typing import Optional

from sklearn.model_selection import train_test_split


def read(n: str, **kwargs):
    """Read a CSV stored under ``../data`` and hand it back as a pandas frame.

    Args:
        n: Path of the file relative to ``../data`` (e.g. ``"params/data.csv"``).
        **kwargs: Forwarded unchanged to ``polars.read_csv``.

    Returns:
        The parsed table converted with ``polars.DataFrame.to_pandas``.
    """
    csv_path = f"../data/{n}"
    # Parse with polars, then convert so the rest of the notebook works on pandas.
    table = pl.read_csv(csv_path, **kwargs)
    return table.to_pandas()


# Load the two CSV tables from ../data/params/ as pandas DataFrames (via `read`).
# `check` is presumably a hold-out table with the same schema as `data` — TODO confirm.
data = read("params/data.csv")
check = read("params/check.csv")

In [6]:
def preprocess(
    data: pd.DataFrame,
    param_title: str = "ad7e6027-00b8-4c27-918c-d1561f949ad8",
    target_column: str = "oil_property_param_value",
) -> tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    """Select the rows of one oil property and split off the target column.

    The input frame is never modified: the filtered rows are materialised into
    a new frame before the target column is removed from it.

    Args:
        data: Table containing an ``oil_property_param_title`` column and,
            optionally, the target column.
        param_title: Value of ``oil_property_param_title`` to keep. Defaults to
            the identifier this notebook has always filtered on.
        target_column: Name of the column to return as the target. Defaults to
            ``"oil_property_param_value"``.

    Returns:
        ``(features, target)`` where ``features`` holds the matching rows with
        a fresh ``0..n-1`` index and without the target column, and ``target``
        is a one-column DataFrame aligned with ``features`` — or ``None`` when
        ``data`` has no target column (e.g. an unlabeled table).

    Raises:
        KeyError: If ``data`` has no ``oil_property_param_title`` column.
    """
    keep = data["oil_property_param_title"] == param_title

    # Boolean selection followed by reset_index(drop=True) produces a new
    # frame, so no up-front copy of the whole input is needed and the pop()
    # below cannot leak back into the caller's DataFrame.
    features = data.loc[keep].reset_index(drop=True)

    if target_column not in features.columns:
        return features, None

    # pop() removes the column from `features`; to_frame() keeps the original
    # contract of returning the target as a single-column DataFrame.
    target = features.pop(target_column).to_frame()
    return features, target


# Keep only the rows for the target oil property and split off the label column.
X, y = preprocess(data)

# Row-level 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): rows that share a `blend_id` can land on both sides of this
# split; if the target is defined per blend this leaks information between
# train and test — confirm, and consider a group-aware splitter if so.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X.head(2)

Unnamed: 0,oil_type,blend_id,oil_property_param_title,component_name,component_class,polymer,component_property_param_title,component_property_param_value,smiles
0,3fa07e0a-415c-496d-b88b-557855cb3e77,49743a76-a614-11ee-9529-005056921581,ad7e6027-00b8-4c27-918c-d1561f949ad8,615537f6-1f8f-4240-a5e9-8f7be344ecd3,базовое масло 1 гр,no,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,0.0,CCCCC
1,3fa07e0a-415c-496d-b88b-557855cb3e77,49743a76-a614-11ee-9529-005056921581,ad7e6027-00b8-4c27-918c-d1561f949ad8,615537f6-1f8f-4240-a5e9-8f7be344ecd3,базовое масло 1 гр,no,2511714c-ab50-4566-bc92-8e4095d87d01,0.0,CCCCC


In [12]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss

# Define a dataset class
class SmilesDataset(Dataset):
    """Map-style dataset that tokenizes SMILES strings on access.

    Args:
        smiles: Iterable of SMILES strings (list, NumPy array, pandas Series, ...).
        labels: Optional labels aligned one-to-one with ``smiles``. Each label
            is converted to a ``torch.long`` tensor when an item is fetched.
        tokenizer: Callable invoked as ``tokenizer(text=..., max_length=...,
            truncation=True, padding='max_length', return_tensors="pt")`` and
            returning a mapping of tensors with a leading batch dimension of 1.
        max_length: Length every tokenized sequence is padded/truncated to.

    Raises:
        ValueError: If ``labels`` is given and its length differs from
            ``smiles``.
    """

    def __init__(self, smiles, labels=None, tokenizer=None, max_length=512):
        # Samplers hand out *positions* 0..len-1, but ``series[idx]`` on a
        # pandas Series is a *label* lookup. With any non-default index (after
        # filtering or train_test_split) that raises KeyError or silently
        # returns the wrong row, so store plain positional containers instead.
        self.smiles = list(smiles)
        if labels is not None and not torch.is_tensor(labels):
            labels = np.asarray(labels)
        self.labels = labels

        if self.labels is not None and len(self.labels) != len(self.smiles):
            raise ValueError(
                f"smiles and labels must have the same length, "
                f"got {len(self.smiles)} and {len(self.labels)}"
            )

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of SMILES strings in the dataset."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th SMILES string and attach its label, if any."""
        encoded = self.tokenizer(
            text=self.smiles[idx],
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        # The tokenizer returns tensors shaped (1, max_length); drop the batch
        # dimension so the DataLoader can stack items into a batch itself.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load tokenizer and model
# Load the ChemBERTa tokenizer and the same checkpoint with a sequence-classification head.
# The classifier head weights are newly initialised (see the warning in the cell output),
# so the model must be fine-tuned before its predictions mean anything.
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-10M-MLM")
model = AutoModelForSequenceClassification.from_pretrained("DeepChem/ChemBERTa-10M-MLM", num_labels=10)  # Adjust num_labels to match your dataset

# Smoke test: the first two SMILES strings of `data` with placeholder labels.
smiles = data["smiles"][:2] # Example SMILES data
labels = [0, 1]  # Example labels, ensure these are within the range [0, num_labels-1]
dataset = SmilesDataset(smiles, labels, tokenizer)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Forward pass over each batch; because the batch contains `labels`, the model
# returns a loss alongside the logits.
# NOTE(review): this runs with autograd enabled (the printed output carries
# grad_fn); wrap it in torch.no_grad() if the loop is only meant for inspection.
for batch in loader:
    outputs = model(**batch)
    print(outputs)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MLM and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SequenceClassifierOutput(loss=tensor(2.2691, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0861,  0.0212,  0.1120, -0.0254,  0.0245, -0.0643,  0.0385,  0.0205,
          0.0254, -0.0501],
        [ 0.0861,  0.0212,  0.1120, -0.0254,  0.0245, -0.0643,  0.0385,  0.0205,
          0.0254, -0.0501]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
