In [1]:
# Free what can be freed before the heavy cells run: collect unreachable
# Python objects first, then ask PyTorch to release its cached CUDA memory.
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [2]:
import evaluate
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import random_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import DataCollatorWithPadding
from transformers import PreTrainedTokenizer
from transformers import Trainer
from transformers import Trainer, TrainingArguments
from transformers import TrainingArguments


In [3]:
# Read the raw table and discard the "Unnamed: 0" column (presumably an index
# that was written out when the CSV was saved -- confirm against the file).
train_df = pd.read_csv("train.csv").drop(columns="Unnamed: 0")

# Hold out 10% of the rows for validation; the fixed random_state keeps the
# partition identical from run to run.
train_data, val_data = train_test_split(train_df, random_state=42, test_size=0.1)



In [4]:
# Bare expression on the last line of the cell: the notebook renders the
# loaded frame (Question / Answer columns) as the cell output.
train_df

Unnamed: 0,Question,Answer
0,"What is the origin of ""foobar""? I want to know...",The user's question is not related to digita...
1,what's the best way to create a bootable windo...,The user is asking a question related to cre...
2,What is the best relational database? http://w...,The user is able to identify and compare two...
3,"what is a tag, and why is it being hailed by s...",The user demonstrates a basic understanding ...
4,What are the best ways to incentivise people t...,The user's question demonstrates a high leve...
...,...,...
29180,do anyone know how to download photoshop for f...,The user's question indicates a lack of digit...
29181,how to proggram in php?,The user is asking a question related to pro...
29182,i play streaming video and how can i convert i...,The user's question indicates a lack of digi...
29183,I need an email address extractor software wit...,The user's question indicates a lack of digi...


In [5]:
# Load the tokenizer that matches the GPT-Neo 125M checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
# Reuse the end-of-sequence token for padding so that `padding=True` below has
# a pad token to work with (presumably the checkpoint defines none -- confirm).
tokenizer.pad_token = tokenizer.eos_token

def tokenize_data(data):
    """Encode the 'Question' column of `data` with the module-level tokenizer.

    Args:
        data: a table exposing a 'Question' column whose values can be turned
            into a list via `.tolist()`.

    Returns:
        The tokenizer's output for the whole column, with padding and
        truncation enabled and values returned as PyTorch tensors.

    NOTE(review): only 'Question' is encoded here; the 'Answer' column of the
    frame is never read by this function -- confirm that is intended.
    """
    questions = data['Question'].tolist()
    return tokenizer(
        questions,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

# Each split goes through its own tokenizer call, train first, then validation.
train_encodings, val_encodings = map(tokenize_data, (train_data, val_data))


In [6]:

# Collator for causal language modelling. Besides padding each batch, it copies
# `input_ids` into `labels` (pad positions are set to -100 and ignored by the
# loss). A padding-only collator produces no `labels`, so the causal-LM model
# would return no loss and training would fail on the first step.
# Because the pad token is the EOS token, genuine EOS tokens in the text are
# excluded from the loss as well.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

class QADataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one row of a tokenizer output per index."""

    def __init__(self, encodings):
        """Keep a reference to `encodings`, a mapping of field name -> indexable values."""
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the length of the 'input_ids' field."""
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return a dict holding entry `idx` of every field in the encodings."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample

# Wrap the tokenized splits so they can be indexed example by example.
train_dataset = QADataset(train_encodings)
val_dataset = QADataset(val_encodings)

# Batches of 8, assembled by `data_collator`. Only the training loader shuffles;
# the validation loader keeps the original order.
# NOTE(review): in the visible cells the Trainer is handed the datasets
# directly, so these loaders appear to be unused -- confirm before relying on them.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=8, shuffle=False, collate_fn=data_collator
)



In [7]:
# Load the pretrained causal LM. Prefer the first GPU, but fall back to CPU so
# the cell also runs on a machine without CUDA (a hard-coded "cuda:0" raises
# there).
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M").to(
    "cuda:0" if torch.cuda.is_available() else "cpu"
)

training_args = TrainingArguments(
    # Checkpointing: write one per epoch and keep only the most recent.
    save_strategy="epoch",
    save_total_limit=1,
    # Mixed precision is only requested when a CUDA device is actually present;
    # asking for fp16 unconditionally fails on CPU-only machines.
    fp16=torch.cuda.is_available(),
    learning_rate=3e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    logging_steps=2500,
    output_dir="./results",
    logging_dir="./logs",
    warmup_steps=100,
    weight_decay=0.01,
)

# The Trainer receives the datasets (not DataLoaders) and builds its own
# batches through `data_collator`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/13133 [00:00<?, ?it/s]