In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input mount recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip uninstall -y transformers accelerate evaluate rouge_score
!pip install transformers accelerate evaluate rouge_score
!pip install sacrebleu
!pip install -U ray
!pip install bert_score
!pip install wandb

In [None]:
import os

import evaluate
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LineByLineTextDataset,
    DataCollatorForLanguageModeling,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
)
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# from transformers import evaluate
import evaluate
# from transformers import sacrebleu
import sacrebleu

In [None]:
# Training split, stored as JSON Lines (one JSON object per line).
train_jsonl_path = '/kaggle/input/datasett3/Task3/train.jsonl'
train_df = pd.read_json(train_jsonl_path, lines=True)
train_df.head()

In [None]:
# Development (validation) split, stored as JSON Lines (one JSON object per line).
dev_jsonl_path = '/kaggle/input/datasett3/Task3/dev.jsonl'
dev_df = pd.read_json(dev_jsonl_path, lines=True)
dev_df.head()

In [None]:
# Test split, stored as JSON Lines (one JSON object per line).
test_jsonl_path = '/kaggle/input/datasett3/Task3/test.jsonl'
test_df = pd.read_json(test_jsonl_path, lines=True)
test_df.head()

In [None]:
# Per-example word counts for the training prompts and responses.
# str.split() with no argument splits on any run of whitespace, so doubled spaces,
# tabs and line breaks do not inflate the count and an empty string counts as 0 words
# (split(" ") reports 1 for "" and 3 for "a  b").
prompt_word_count_list = [len(sentence.split()) for sentence in train_df['prompt']]
response_word_count_list = [len(sentence.split()) for sentence in train_df['response']]

In [None]:
# Histogram of prompt lengths (in words) over the training set.
fig, ax = plt.subplots()
ax.hist(prompt_word_count_list, bins=10, color='gray')
ax.set_title('Words Count Prompt')
ax.set_xlabel('Prompt')
ax.set_ylabel('Freq')
plt.show()

In [None]:
# Histogram of response lengths (in words) over the training set.
fig, ax = plt.subplots()
ax.hist(response_word_count_list, bins=10, color='gray')
ax.set_title('Word Count Response')
ax.set_xlabel('Response')
ax.set_ylabel('Freq')
plt.show()

In [None]:
# Serialise the training pairs as one "Prompt: ... Response: ..." example per line,
# the layout consumed by the line-by-line dataset loader used for fine-tuning.
# Line breaks inside a prompt/response are flattened to spaces; otherwise a multi-line
# text would be read back as several unrelated examples. The encoding is pinned so the
# file does not depend on the platform default.
with open('/kaggle/working/train.txt', 'w', encoding='utf-8') as file:
    for prompt, response in zip(train_df['prompt'], train_df['response']):
        prompt = " ".join(prompt.splitlines())
        response = " ".join(response.splitlines())
        file.write("Prompt: " + prompt + " Response: " + response + "\n")

In [None]:
# Serialise the dev pairs as one "Prompt: ... Response: ..." example per line,
# the layout consumed by the line-by-line dataset loader used for evaluation.
# Line breaks inside a prompt/response are flattened to spaces; otherwise a multi-line
# text would be read back as several unrelated examples. The encoding is pinned so the
# file does not depend on the platform default.
with open('/kaggle/working/dev.txt', 'w', encoding='utf-8') as file:
    for prompt, response in zip(dev_df['prompt'], dev_df['response']):
        prompt = " ".join(prompt.splitlines())
        response = " ".join(response.splitlines())
        file.write("Prompt: " + prompt + " Response: " + response + "\n")

In [None]:
# --- Data and output locations -------------------------------------------------
train_file_path = "/kaggle/working/train.txt"
eval_file_path = "/kaggle/working/dev.txt"
output_dir = '/kaggle/output'
overwrite_output_dir = False

# --- Model and training hyper-parameters ---------------------------------------
model_name = 'gpt2'
per_device_train_batch_size = 1
num_train_epochs = 1
save_steps = 10000

# --- Evaluation metrics --------------------------------------------------------
# NOTE(review): `sacrebleu` below rebinds the name of the `sacrebleu` module imported
# in the imports cell; after this cell the name refers to the loaded metric object.
rouge = evaluate.load('rouge')
sacrebleu = evaluate.load("sacrebleu")
bertscore = evaluate.load("bertscore")

In [None]:
!pip install evaluate

In [None]:
!pip install evaluate sacrebleu

In [None]:
import evaluate

In [None]:
class GPT2Trainer:
    """Fine-tune a GPT-2 language model on line-per-example text files.

    The class wires together the Hugging Face tokenizer, model, dataset, collator and
    ``Trainer``, and scores the eval file with ROUGE, sacreBLEU and BERTScore.

    Args:
        train_path: Text file with one training example per line.
        eval_path: Text file with one evaluation example per line.
        output_dir: Directory the tokenizer, model and checkpoints are written to.
        model_name: Name or path passed to ``from_pretrained`` for tokenizer and model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint interval.
    """

    # Pad token added to the GPT-2 tokenizer before training and decoding.
    PAD_TOKEN = '[PAD]'
    # Maximum number of tokens kept per line by the line-by-line dataset.
    BLOCK_SIZE = 512
    # Sentinel id that must be replaced by the pad id before decoding.
    IGNORE_INDEX = -100

    def __init__(self, train_path, eval_path, output_dir, model_name='gpt2', overwrite_output_dir=False,
                 per_device_train_batch_size=1, num_train_epochs=0.2, save_steps=100000):
        self.train_path = train_path
        self.eval_path = eval_path
        self.output_dir = output_dir
        self.model_name = model_name
        self.overwrite_output_dir = overwrite_output_dir
        self.per_device_train_batch_size = per_device_train_batch_size
        self.num_train_epochs = num_train_epochs
        self.save_steps = save_steps
        # Lazily created and then reused, so evaluation does not reload them every time.
        self._tokenizer = None
        self._metrics = None

    def _get_tokenizer(self):
        """Return the tokenizer (with the extra pad token), creating it on first use."""
        tokenizer = getattr(self, '_tokenizer', None)
        if tokenizer is None:
            tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
            tokenizer.add_special_tokens({'pad_token': self.PAD_TOKEN})
            self._tokenizer = tokenizer
        return tokenizer

    def _get_metrics(self):
        """Return the three ``evaluate`` metrics as a dict, loading them on first use."""
        metrics = getattr(self, '_metrics', None)
        if metrics is None:
            metrics = {
                'rouge': evaluate.load('rouge'),
                'sacrebleu': evaluate.load("sacrebleu"),
                'bertscore': evaluate.load("bertscore"),
            }
            self._metrics = metrics
        return metrics

    def load_data_collator(self, tokenizer, mlm=False):
        """Build the language-modeling collator; ``mlm=False`` disables masked-LM masking."""
        return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

    def load_dataset(self, file_path, tokenizer):
        """Tokenize ``file_path`` into a dataset with one example per line."""
        dataset = LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=self.BLOCK_SIZE
        )

        return dataset

    def postprocess_text(self, preds, labels):
        """Strip surrounding whitespace from decoded text.

        Returns:
            ``(preds, labels)`` where ``preds`` is a list of strings and ``labels`` is a
            list of single-element lists (one reference per prediction).
        """
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]
        return preds, labels

    def preprocess_logits_for_metrics(self, logits, labels):
        """Reduce logits to arg-max token ids so full logits need not be accumulated.

        Returns:
            ``(pred_ids, labels)``; ``compute_metrics`` reads element 0 of this tuple.
        """
        if isinstance(logits, (tuple, list)):
            # Some models return extra tensors next to the logits; the logits come first.
            logits = logits[0]
        pred_ids = torch.argmax(logits, dim=-1)
        return pred_ids, labels

    @staticmethod
    def _align_for_decoding(pred_ids, labels, pad_token_id, ignore_index=-100):
        """Align predictions with their targets and blank out ignored positions.

        The model output at position ``i`` is a prediction for the token at ``i + 1``,
        so the last prediction and the first label have no counterpart and are dropped.
        Wherever the label is ``ignore_index`` both arrays receive ``pad_token_id``, so
        whatever the model emitted at ignored positions is never decoded.

        Args:
            pred_ids: 2-D array-like of predicted token ids.
            labels: 2-D array-like of label ids, same shape as ``pred_ids``.
            pad_token_id: Id substituted at ignored positions.
            ignore_index: Sentinel id marking positions to ignore.

        Returns:
            ``(pred_ids, labels)`` as numpy arrays with one column fewer than the input.
        """
        pred_ids = np.asarray(pred_ids)[:, :-1]
        labels = np.asarray(labels)[:, 1:]
        ignored = labels == ignore_index
        pred_ids = np.where(ignored | (pred_ids == ignore_index), pad_token_id, pred_ids)
        labels = np.where(ignored, pad_token_id, labels)
        return pred_ids, labels

    def compute_metrics(self, eval_preds):
        """Decode predictions and references and score them.

        Args:
            eval_preds: Pair ``(predictions, labels)`` where ``predictions`` is the tuple
                returned by ``preprocess_logits_for_metrics`` (or a bare id array).

        Returns:
            Dict with ROUGE (``R1``, ``R2``, ``RL``, ``RLsum``), sacreBLEU (``bleu``),
            BERTScore of the first two examples (``precision1`` ... ``f1-score2``, NaN
            when fewer examples exist) and the BERTScore means over all examples
            (``bert_precision``, ``bert_recall``, ``bert_f1``).
        """
        tokenizer = self._get_tokenizer()
        predictions, labels = eval_preds
        pred_ids = predictions[0] if isinstance(predictions, (tuple, list)) else predictions
        pred_ids, labels = self._align_for_decoding(
            pred_ids, labels, tokenizer.pad_token_id, self.IGNORE_INDEX
        )

        decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        # Score the stripped text; the stripped output was previously computed but unused.
        preds, nested_refs = self.postprocess_text(decoded_preds, decoded_labels)
        flat_refs = [refs[0] for refs in nested_refs]

        metrics = self._get_metrics()
        rouge_result = metrics['rouge'].compute(predictions=preds, references=flat_refs)
        sacrebleu_result = metrics['sacrebleu'].compute(
            predictions=preds, references=nested_refs, lowercase=True
        )
        bert_result = metrics['bertscore'].compute(predictions=preds, references=flat_refs, lang="en")

        def per_example(values, index):
            # Keep the per-example keys, but do not crash when the eval set is tiny.
            return round(values[index], 4) if index < len(values) else float("nan")

        def mean(values):
            return round(float(np.mean(values)), 4) if len(values) else float("nan")

        return {
            "R1": round(rouge_result["rouge1"], 4),
            "R2": round(rouge_result["rouge2"], 4),
            "RL": round(rouge_result["rougeL"], 4),
            "RLsum": round(rouge_result["rougeLsum"], 4),
            "bleu": round(sacrebleu_result["score"], 4),
            "precision1": per_example(bert_result["precision"], 0),
            "precision2": per_example(bert_result["precision"], 1),
            "recall1": per_example(bert_result["recall"], 0),
            "recall2": per_example(bert_result["recall"], 1),
            "f1-score1": per_example(bert_result["f1"], 0),
            "f1-score2": per_example(bert_result["f1"], 1),
            "bert_precision": mean(bert_result["precision"]),
            "bert_recall": mean(bert_result["recall"]),
            "bert_f1": mean(bert_result["f1"]),
        }

    @staticmethod
    def _eval_strategy_kwarg():
        """Return the evaluation-strategy keyword the installed ``TrainingArguments`` accepts.

        Newer transformers releases name it ``eval_strategy``; older ones only know
        ``evaluation_strategy``.
        """
        import inspect

        accepted = inspect.signature(TrainingArguments.__init__).parameters
        return "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    def train_model(self):
        """Fine-tune the model, evaluate once per epoch and save the result to ``output_dir``."""
        tokenizer = self._get_tokenizer()

        model = GPT2LMHeadModel.from_pretrained(self.model_name)
        # The pad token was added to the vocabulary, so the embedding matrix must grow too.
        model.resize_token_embeddings(len(tokenizer))
        # Writes the resized but still untrained weights; save_model() below saves again
        # to the same directory after training.
        model.save_pretrained(self.output_dir)

        train_dataset = self.load_dataset(self.train_path, tokenizer)
        eval_dataset = self.load_dataset(self.eval_path, tokenizer)

        data_collator = self.load_data_collator(tokenizer)

        tokenizer.save_pretrained(self.output_dir)

        training_kwargs = {
            "output_dir": self.output_dir,
            self._eval_strategy_kwarg(): "epoch",
            "learning_rate": 1e-5,
            "overwrite_output_dir": self.overwrite_output_dir,
            "per_device_train_batch_size": self.per_device_train_batch_size,
            "per_device_eval_batch_size": 1,
            "num_train_epochs": self.num_train_epochs,
            # Previously stored on the instance but never forwarded.
            "save_steps": self.save_steps,
        }
        training_args = TrainingArguments(**training_kwargs)

        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            preprocess_logits_for_metrics=self.preprocess_logits_for_metrics,
            compute_metrics=self.compute_metrics
        )

        trainer.train()
        trainer.save_model()


if __name__ == "__main__":
    # Run configuration. Every value is passed to GPT2Trainer explicitly; before, only
    # the two file paths and the output directory were passed, and the other settings
    # were silently replaced by the constructor defaults.
    train_file_path = "/kaggle/working/train.txt"
    eval_file_path = "/kaggle/working/dev.txt"
    model_name = 'gpt2'
    # NOTE(review): the header cell says only /kaggle/working/ is preserved as notebook
    # output; confirm that '/kaggle/output' is the intended location.
    output_dir = '/kaggle/output'
    overwrite_output_dir = False
    per_device_train_batch_size = 1
    num_train_epochs = 0.2
    save_steps = 10000
    # The metrics are loaded inside GPT2Trainer.compute_metrics, so they are not loaded
    # here (one of those loads also rebound the imported `sacrebleu` module name).
    trainer = GPT2Trainer(
        train_file_path,
        eval_file_path,
        output_dir,
        model_name=model_name,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )
    trainer.train_model()