In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

df = pd.read_csv("/kaggle/input/translated-small-parabank2/translated_small_parabank2_postproc.tsv", sep="\t", header=None)
df.columns = ["input_text","output_text"]
df.head()

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Unnamed: 0,input_text,output_text
0,2004 Ocean Cup narodov,Ocean Cup narodov 2004
1,2004 Ocean Cup narodov,Pokal narodov OFC 2004
2,2004 Ocean Cup narodov,Ocean Bowl narodov 2004
3,Ocean Cup narodov 2004,Pokal narodov OFC 2004
4,Ocean Cup narodov 2004,Ocean Bowl narodov 2004


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
tokenizer = AutoTokenizer.from_pretrained("cjvt/gpt-sl-base")

model = AutoModelForCausalLM.from_pretrained("cjvt/gpt-sl-base")

Downloading (…)okenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.93M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

In [4]:
# Check if CUDA is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(60032, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): FastGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dr

In [5]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [6]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# Convert the pandas DataFrame to a list of tuples
#paraphrases = df[["input_text", "output_text"]].apply(tuple, axis=1).tolist()

# Assuming you have a pandas DataFrame 'df' with columns "input_text" and "output_text"
data = df[["input_text", "output_text"]].apply(tuple, axis=1).tolist()

# Split data into train and temp sets (80% train, 20% temp)
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)
# Split temp_data into eval and test sets (10% eval, 10% test)
eval_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

class ParaphraseDataset(Dataset):
    def __init__(self, paraphrases, tokenizer):
        self.paraphrases = paraphrases
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.paraphrases)

    def __getitem__(self, idx):
        input_text, output_text = self.paraphrases[idx]
        encoding = self.tokenizer(input_text, output_text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        input_ids, labels = encoding.input_ids.squeeze(), encoding.input_ids.squeeze()
        return {'input_ids': input_ids, 'labels': labels}

# paraphrase_dataset = ParaphraseDataset(paraphrases, tokenizer)
# train_loader = DataLoader(paraphrase_dataset, batch_size=16, shuffle=True)


# Create datasets for train, eval, and test
train_dataset = ParaphraseDataset(train_data, tokenizer)
eval_dataset = ParaphraseDataset(eval_data, tokenizer)
test_dataset = ParaphraseDataset(test_data, tokenizer)

# Create DataLoaders for train, eval, and test
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,
    logging_dir="/kaggle/working/logs",
    logging_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




Step,Training Loss
500,0.3604
1000,0.1481
1500,0.1417
2000,0.1389
2500,0.144
3000,0.1411
3500,0.1363
4000,0.1335
4500,0.1358
5000,0.1339
















In [None]:
# Save the fine-tuned model
output_dir = '/kaggle/working/model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)