In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/translated-small-parabank2/translated_small_parabank2_postproc.tsv
/kaggle/input/slopara-gpt/model/config.json
/kaggle/input/slopara-gpt/model/trainer_state.json
/kaggle/input/slopara-gpt/model/training_args.bin
/kaggle/input/slopara-gpt/model/tokenizer.json
/kaggle/input/slopara-gpt/model/tokenizer_config.json
/kaggle/input/slopara-gpt/model/pytorch_model.bin
/kaggle/input/slopara-gpt/model/scaler.pt
/kaggle/input/slopara-gpt/model/scheduler.pt
/kaggle/input/slopara-gpt/model/special_tokens_map.json
/kaggle/input/slopara-gpt/model/optimizer.pt
/kaggle/input/slopara-gpt/model/rng_state.pth
/kaggle/input/slopara-gpt/model/generation_config.json


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
model_dir = "/kaggle/input/slopara-gpt/model"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

In [5]:
# Check if CUDA is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(60032, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): FastGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dr

In [6]:
def generate_paraphrase(input_text, n_sent=5):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    in_len = len(input_ids[0])
    output = model.generate(input_ids, 
                            do_sample=True, 
                            max_length=int(in_len*3), 
                            top_p=0.95, 
                            top_k=50, 
                            temperature=0.7, 
                            num_beams=2, 
                            num_return_sequences=n_sent, 
                            pad_token_id=tokenizer.eos_token_id)

    output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
    #print("\n".join(output_text))
    out = [o[len(input_text)+2:] for o in output_text]
    return list(set(out)) #make unique

In [7]:
n_sent = 5
input_text = "Vsaj enkrat se strinjam s tabo."
#input_text = "to je čudovito mesto in bombardirali so ga."
#input_text = "Močan socialni dialog je skupna značilnost držav, v katerih so se trgi dela izkazali za bolj krizne."
para = generate_paraphrase(input_text,n_sent)
print(input_text,"\n  ", "\n  ".join(para))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Vsaj enkrat se strinjam s tabo. 
   vsaj enkrat sem se strinjal s tabo.


### Load dataset

In [8]:
df = pd.read_csv("/kaggle/input/translated-small-parabank2/translated_small_parabank2_postproc.tsv", sep="\t", header=None)
df.columns = ["input_text","output_text"]
df.head()

Unnamed: 0,input_text,output_text
0,2004 Ocean Cup narodov,Ocean Cup narodov 2004
1,2004 Ocean Cup narodov,Pokal narodov OFC 2004
2,2004 Ocean Cup narodov,Ocean Bowl narodov 2004
3,Ocean Cup narodov 2004,Pokal narodov OFC 2004
4,Ocean Cup narodov 2004,Ocean Bowl narodov 2004


In [9]:
df.iloc[69003:690013].input_text

69003                            enkrat se strinjam z vami.
69004                            enkrat se strinjam z vami.
69005                 jaz sem v soglasju z vami, za enkrat.
69006                 jaz sem v soglasju z vami, za enkrat.
69007                       Vsaj enkrat se strinjam s tabo.
                                ...                        
690008    Ta uredba se ne uporablja za izvedbene dejavno...
690009    Ta uredba se ne uporablja za izvajanje ukrepov...
690010    Ta uredba se ne uporablja za izvajanje ukrepov...
690011    Ta uredba se ne uporablja za izvajanje dejavno...
690012    Ta uredba nadomešča odločbo evropske stranke i...
Name: input_text, Length: 621010, dtype: object

In [10]:
from sklearn.model_selection import train_test_split

# Assuming you have a pandas DataFrame 'df' with columns "input_text" and "output_text"
data = df[["input_text", "output_text"]].apply(tuple, axis=1).tolist()

# Split data into train and temp sets (80% train, 20% temp)
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)
# Split temp_data into eval and test sets (10% eval, 10% test)
eval_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

### Scoring

In [11]:
from nltk.translate.bleu_score import sentence_bleu
hypothesis = "Transformers Transformers are fast plus efficient".split()
reference = ["HuggingFace Transformers are fast efficient plus awesome".split(), 
               "Transformers are awesome because they are fast to execute".split(),
              "Transformers are not so slow.".split()]
BLEUscore = sentence_bleu(reference, hypothesis)
print(BLEUscore)

0.537284965911771


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [12]:
!pip install evaluate
!pip install rouge-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected

In [13]:
import evaluate
rouge = evaluate.load('rouge')
predictions = ["Transformers Transformers are fast plus efficient"]
references = [
              ["HuggingFace Transformers are fast efficient plus awesome", 
               "Transformers are awesome because they are fast to execute",
              "Transformers are not so slow."]

]
results = rouge.compute(predictions=predictions, references=references)
print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.7692307692307692, 'rouge2': 0.3636363636363636, 'rougeL': 0.6153846153846153, 'rougeLsum': 0.6153846153846153}


### Evaluation of paraphrases

In [14]:
df = pd.DataFrame(test_data)

i=0
evaluation = []
while i < 1000: #len(df):
    s = df.iloc[i,0]
    ref = df[df.iloc[:,0]==s].iloc[:,1]
    ref = list(ref)
    #print(s,ref)
    ref = [e.split() for e in ref]
    
    para = generate_paraphrase(s,5)
    
    bleuscores = []
    for p in para:
        b = sentence_bleu(ref, p.split())
        bleuscores.append(b)
    
    bleuscores = np.array(bleuscores)
    evaluation.append([s,para[np.argmax(bleuscores)],np.max(bleuscores)])
    i+=1

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [15]:
e = np.array(evaluation)
pe = pd.DataFrame(e)
pe.to_csv("/kaggle/working/paraphrase_evaluation.tsv", sep="\t")
#np.savetxt("/kaggle/working/paraphrase_evaluation.tsv", e, delimiter="\t")
pd.DataFrame(e)

Unnamed: 0,0,1,2
0,"Ne, umrl bi.","ne, ne bi bil mrtev.",0.668740304976422
1,Močan socialni dialog je skupna značilnost drž...,Močan socialni dialog je skupna značilnost drž...,0.38091370416670794
2,Ali imate kakšno besedo z njim?,Ali imate kakšno besedo z njim?,0.6389431042462724
3,to je čudovito mesto in bombardirali so ga.,to je čudovito mesto inbombardirali so ga.,0.41113361690051975
4,"Izgube, ki bi jih utrpele banke euroobmočja, b...","Izgube, ki bi jih utrpele banke v euroobmočju,...",0.6504011927452344
...,...,...,...
995,A mi lahko daš odgovor?,Ali lahko dobim vaš odgovor?,0
996,v členu 47 Listine je določen zakon o pravične...,člen 47 Listine določa pravico do poštenega so...,0.345720784641941
997,Jaz sem jih vse rešil.,rešil sem jih.,0.5444460596606694
998,Tam mora biti tudi Marrascaud.,Marrascaud mora biti tukaj.,0
