In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Dataset
from sklearn.model_selection import train_test_split
import re

In [None]:
# Read the tab-separated training file. Each usable line must have at least
# three fields: field 0 is the label, fields 1 and 2 are text pieces that get
# joined with a literal "[SEP]" marker.
# NOTE(review): encoding is pinned to UTF-8 so the result does not depend on
# the platform's default locale — confirm the file really is UTF-8.
with open("3sent.train.shuffled.txt", encoding="utf-8") as f:
  data = f.readlines()

classifiers = []
texts = []

for line_number, review in enumerate(data, start=1):
  review = review.rstrip('\n')
  if not review.strip():
    # Blank lines (e.g. an empty last line) carry no example; skip them
    # instead of failing on the field lookup below.
    continue
  splitting = review.split('\t')
  if len(splitting) < 3:
    # Fail with the offending line number rather than a bare IndexError.
    raise ValueError(
        f"line {line_number}: expected at least 3 tab-separated fields, "
        f"got {len(splitting)}: {review!r}"
    )
  classifiers.append(splitting[0])
  texts.append(splitting[1]+"[SEP]"+splitting[2])

data_dict = {
    'sentences': texts,
    'label': classifiers
}

df = pd.DataFrame(data_dict)
df.head()

In [None]:
# Lookup table from the string labels parsed out of the file to integer ids.
mapping = {
    '0' : 0,
    '1': 1,
    '2': 2
}

# Cast to str before the lookup so that re-running this cell after the labels
# have already become integers still maps them, instead of turning every
# value into NaN.
mapped_labels = df['label'].astype(str).map(mapping)

# Labels missing from `mapping` come back as NaN; report them here rather than
# letting a later integer cast fail with an unrelated-looking error.
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unknown_labels = sorted(df.loc[unmapped_mask, 'label'].astype(str).unique())
    raise ValueError(f"labels not present in mapping: {unknown_labels}")

df['label'] = mapped_labels

In [None]:
# Store the label column with an integer dtype, then show how many rows fall
# into each class (the last expression is what the cell displays).
labels_as_int = df['label'].astype(int)
df['label'] = labels_as_int
df['label'].value_counts()

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the microsoft/deberta-v3-large checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="microsoft/deberta-v3-large",
)

In [None]:
# Wrap the DataFrame as a Hugging Face Dataset and hold out 0.1% of the rows
# as the "test" split. The seed is fixed so that Restart & Run All always
# produces the same train/test membership.
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.001, seed=42)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"sentences"`` entry of ``examples``.

    Calls the notebook-level ``tokenizer`` with truncation enabled and
    ``padding='max_length'`` at ``max_length=128``.

    Args:
        examples: Mapping that provides the text under the ``"sentences"`` key.

    Returns:
        Whatever the ``tokenizer`` call returns for those sentences.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": 128,
    }
    return tokenizer(examples["sentences"], **tokenizer_options)

# Run tokenize_function over the train/test splits in batched mode; the bare
# name on the last line displays the result as the cell output.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
tokenized_datasets

In [None]:
# Pull the two tokenized splits out by name and write each one to its own
# directory, from which the training cell reloads them.
train_dataset, eval_dataset = tokenized_datasets["train"], tokenized_datasets["test"]
for split_to_save, target_dir in ((train_dataset, "3sent_train"), (eval_dataset, "3sent_eval")):
    split_to_save.save_to_disk(target_dir)

In [None]:
# Preview only the first few training sentences; displaying the whole column
# would flood the cell output and bloat the saved notebook.
train_dataset[:5]['sentences']

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from datasets import load_from_disk

# Reload the tokenized splits that were written with save_to_disk, so this
# cell does not depend on the in-memory datasets from the preprocessing cells.
train_dataset = load_from_disk("3sent_train")
eval_dataset = load_from_disk("3sent_eval")

# Load the model from the same Hub checkpoint id the tokenizer uses
# ("microsoft/deberta-v3-large"). The previous un-namespaced "deberta-v3-large"
# only resolves when a local directory of that name happens to exist.
# NOTE(review): torch_dtype=torch.bfloat16 loads the weights themselves in
# bf16, so training updates bf16 parameters directly; if mixed precision was
# the intent, TrainingArguments(bf16=True) with default-precision weights is
# the usual route — confirm.
# NOTE(review): output_hidden_states=True is not consumed anywhere in this
# cell and presumably enlarges what the Trainer gathers during evaluation —
# confirm it is needed by later code.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-large",
    torch_dtype=torch.bfloat16,
    num_labels=3,
    output_hidden_states=True,
)

# One epoch, batches of 8 per device, evaluation every 100 steps and a
# checkpoint every 1000 steps, all written under the "test_trainer" directory.
training_args = TrainingArguments("test_trainer",
                                  num_train_epochs=1,
                                  resume_from_checkpoint=False,
                                  evaluation_strategy="steps",
                                  save_steps=1000,
                                  eval_steps=100,
                                  per_device_train_batch_size=8)
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,
)

trainer.train()