In [1]:
import numpy as np
import pandas as pd
import gzip
import json
import matplotlib.pyplot as plt
import torch

import os
from os import walk

In [2]:
# Root of the product data, relative to the notebook's working directory.
product_path = '../../../../src/data/product'

# Load the gzip-compressed JSON-lines file that holds every filtered table row.
train_test_input_filtered_path = os.path.join(product_path, 'train_test_split/input_filtered_tables')
train_test_all_filtered_path = os.path.join(train_test_input_filtered_path, 'all')
all_filtered_tables_df = pd.read_json(
    os.path.join(train_test_all_filtered_path, 'train_test_all_filtered_tables.json.gz'),
    compression='gzip',
    lines=True,
)

# Replace missing descriptions with the ',' placeholder. The filled column is
# assigned back explicitly: calling `fillna(..., inplace=True)` on the
# `df.description` accessor is a chained in-place update, which recent pandas
# versions warn about and which leaves the frame untouched under Copy-on-Write.
all_filtered_tables_df['description'] = all_filtered_tables_df['description'].fillna(',')

# Model input text: name and description joined by a single space.
columns = ['name', 'description']
all_filtered_tables_df['concat'] = all_filtered_tables_df[columns].astype(str).agg(' '.join, axis=1)

In [3]:
# Give every distinct cluster_id a dense integer id to use as the class label
# (the recorded output below shows ids running from 0 to 1485).
cluster_groups = all_filtered_tables_df.groupby('cluster_id')
all_filtered_tables_df['cluster_id_mapped'] = cluster_groups.ngroup()

# Show the original and mapped ids side by side, ordered by cluster_id.
label_columns = ['cluster_id', 'cluster_id_mapped']
all_filtered_tables_df[label_columns].sort_values(by=['cluster_id'], ascending=True)

Unnamed: 0,cluster_id,cluster_id_mapped
5671,985,0
11104,985,0
1237,985,0
17529,985,0
12875,985,0
...,...,...
20242,80168995,1485
15654,80168995,1485
4940,80168995,1485
11026,80168995,1485


In [4]:
# Collect the names of the '.json.gz' table files found in each split folder
# (train / val / test) of the 'large' unfiltered output.
product_path = '../../../../src/data/product'
train_test_output_path = os.path.join(product_path, 'train_test_split/output_unfiltered_tables')
zip_files_train, zip_files_val, zip_files_test = (
    [
        entry
        for entry in os.listdir(os.path.join(train_test_output_path, split_dir))
        if entry.endswith('.json.gz')
    ]
    for split_dir in ('large/train', 'large/val', 'large/test')
)

In [5]:
# Training rows: every row whose table_id is NOT one of the test-split files
# (so rows from both the train and val folders end up here).
# reset_index() keeps the previous index as an extra 'index' column.
is_test_table = all_filtered_tables_df['table_id'].isin(zip_files_test)
df_train = all_filtered_tables_df.loc[~is_test_table].reset_index()
df_target_train = df_train['cluster_id_mapped']

In [6]:
# Test rows: only rows whose table_id is one of the test-split files.
# reset_index() keeps the previous index as an extra 'index' column.
is_test_table = all_filtered_tables_df['table_id'].isin(zip_files_test)
df_test = all_filtered_tables_df.loc[is_test_table].reset_index()
df_target_test = df_test['cluster_id_mapped']

In [8]:
# Persist both splits as CSV in the current working directory.
# Note the target file names carry no '.csv' extension.
df_test.to_csv(path_or_buf='df_test')
df_train.to_csv(path_or_buf='df_train')

In [9]:
# Convert the text column and the label series of each split to plain Python
# lists, the form consumed by the tokenizer and the dataset wrapper.
x_train, x_test = df_train['concat'].tolist(), df_test['concat'].tolist()
y_train, y_test = df_target_train.tolist(), df_target_test.tolist()


In [10]:
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast

# The classification head needs one output per label id. The labels come from
# `cluster_id_mapped`, which is numbered over the FULL table set, so the head is
# sized from the full frame rather than from the training split: if a cluster
# is absent from the training rows, counting only training labels would leave
# label ids >= num_labels and the loss would fail on out-of-range targets.
num_labels = int(all_filtered_tables_df['cluster_id_mapped'].max()) + 1
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

# `model_max_length` is the tokenizer argument that sets the maximum input
# length used by truncation / 'max_length' padding; a bare `max_length` passed
# to from_pretrained does not configure that limit.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=512)

2021-12-01 21:59:25.396637: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-01 21:59:25.396737: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are ini

In [11]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments
import numpy as np

In [20]:
# Configuration handed to the Hugging Face Trainer: where results and logs go,
# followed by the optimisation schedule and batch sizes.
training_args = TrainingArguments(
    # output locations / reporting
    output_dir=r'./RoBERTa_Results',
    logging_dir=r'./RoBERTa_logs',
    report_to='none',
    # schedule
    num_train_epochs=25,             # full passes over the training set
    warmup_steps=500,                # learning-rate warmup steps
    weight_decay=0.01,
    # batch sizes (per device)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
)

PyTorch: setting up devices


In [13]:
class dataDataset(torch.utils.data.Dataset):
    """Pair tokenizer encodings with labels as a torch ``Dataset``.

    ``encodings`` is a mapping from field name to a per-sample sequence of
    values; ``labels`` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [14]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and micro-averaged precision/recall/F1 for a prediction object.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.

    NOTE(review): per the scikit-learn docs, micro-averaged scores on
    single-label multiclass data coincide with accuracy, so the four values
    are expected to be equal -- confirm this is the intended report.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element of the returned tuple is not reported.
    scores = precision_recall_fscore_support(y_true, y_pred, average='micro')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = scores[2]
    metrics['precision'] = scores[0]
    metrics['recall'] = scores[1]
    return metrics

In [15]:
# Tokenise both splits with identical settings: over-long inputs are truncated
# and every sample is padded out to the tokenizer's maximum length.
tokenize_options = {'truncation': True, 'padding': 'max_length'}
x_Train = tokenizer(x_train, **tokenize_options)
x_Test = tokenizer(x_test, **tokenize_options)

# Wrap encodings and labels so the Trainer can index them sample by sample.
train_data = dataDataset(encodings=x_Train, labels=y_train)
test_data = dataDataset(encodings=x_Test, labels=y_test)

In [21]:
# Assemble the Trainer: the test split doubles as the evaluation set and
# compute_metrics supplies the reported scores.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Informational only: `device` is displayed here and is not passed to the Trainer.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [22]:
# Start fine-tuning with the Trainer configured above. In the recorded run the
# log reports 19466 examples, 25 epochs and 121675 optimization steps, and the
# cell ended with a KeyboardInterrupt with no training-loss rows in the output.
trainer.train()

***** Running training *****
  Num examples = 19466
  Num Epochs = 25
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 121675


Step,Training Loss


KeyboardInterrupt: 