# SLO paraphrase
Paraphrasing Slovenian sentences by prompting the Slovenian GPT model (`cjvt/gpt-sl-base`).

### Imports

In [1]:
import numpy as np 
import pandas as pd

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

### Loading slo GPT

In [3]:
# Both the tokenizer and the causal LM come from the same checkpoint id.
MODEL_NAME = "cjvt/gpt-sl-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.93M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

In [4]:
def get_paraphrase(sentence, input_text):
    """Extract the generated paraphrase that follows the prompt in ``sentence``.

    The paraphrase starts right after the first apostrophe found at or beyond
    index ``len(input_text)`` and ends at the first period after that
    apostrophe; the period is part of the result.

    Args:
        sentence: Decoded model output (prompt followed by the continuation).
        input_text: The prompt; only its length is used, as the search offset.

    Returns:
        The text between the opening apostrophe and the closing period,
        period included.

    Raises:
        ValueError: If no apostrophe is found after the prompt, or no period
            is found after that apostrophe.
    """
    prompt_len = len(input_text)

    start = sentence.find("'", prompt_len)
    if start == -1:
        # str.find reports "not found" as -1; slicing with it would silently
        # return text unrelated to the paraphrase.
        raise ValueError("no opening quote found after the prompt")

    end = sentence.find(".", start)
    if end == -1:
        # Without this check the slice below would collapse to an empty string.
        raise ValueError("no terminating period found after the opening quote")

    return sentence[start + 1:end + 1]

In [5]:
def get_quote3(sentence, input_text):
    """Return the first apostrophe-delimited segment that follows the prompt.

    ``sentence`` is split on apostrophes. A prompt containing ``k``
    apostrophes accounts for the first ``k`` split boundaries, so the segment
    at index ``k + 1`` is the first one enclosed by apostrophes that come
    after the prompt. For a prompt with exactly two apostrophes this is
    index 3.

    Args:
        sentence: Decoded model output (prompt followed by the continuation).
        input_text: The prompt; its apostrophes are counted so that source
            sentences which themselves contain an apostrophe do not shift
            the selected segment.

    Returns:
        The selected segment, without the surrounding apostrophes.

    Raises:
        IndexError: If ``sentence`` has too few apostrophes to contain the
            segment.
    """
    quote_index = input_text.count("'") + 1
    return sentence.split("'")[quote_index]

In [6]:
#bad
#"Podoben izraz za '{input_sentence}', je "
#"Parafraza povedi '{input_sentence}', je "

#good
#"'{input_sentence}', oziroma " 
#"'{input_sentence}', oziroma tudi " 
#"'{input_sentence}', oziroma drugače "


#input_sentence = "Mazejeva je zasedla prvo mesto, kar pa nikogar več ne preseneča."
def generate_paraphrase(input_sentence, tokenizer, model, extract_fun, n_sent=10):
    """Sample paraphrase candidates for ``input_sentence`` by prompting ``model``.

    The sentence is wrapped in the prompt ``'<sentence>', oziroma drugače ``,
    ``n_sent`` continuations are generated, and ``extract_fun`` pulls the
    paraphrase out of each decoded continuation.

    Args:
        input_sentence: Sentence to paraphrase.
        tokenizer: Object providing ``encode``, ``batch_decode`` and
            ``eos_token_id``.
        model: Object providing ``to(device)`` and ``generate(...)``.
        extract_fun: Callable ``(decoded_text, prompt) -> str``. Outputs for
            which it raises an ``Exception`` are skipped.
        n_sent: Number of sequences requested from ``model.generate``.

    Returns:
        List of extracted paraphrases, one per output whose extraction
        succeeded. Duplicates are not removed.
    """
    input_text = f"'{input_sentence}', oziroma drugače "
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    in_len = len(input_ids[0])
    model = model.to(device)

    output = model.generate(
        input_ids,
        do_sample=True,
        # Length cap is three times the prompt's token count.
        max_length=int(in_len * 3),
        top_p=0.95,
        top_k=50,
        temperature=0.7,
        num_beams=2,
        num_return_sequences=n_sent,
        pad_token_id=tokenizer.eos_token_id,
    )
    output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

    paraphrases = []
    for generated in output_text:
        try:
            paraphrases.append(extract_fun(generated, input_text))
        except Exception:
            # Best effort: skip outputs the extractor cannot parse. Unlike a
            # bare ``except``, this lets KeyboardInterrupt/SystemExit through,
            # so a long run can still be interrupted.
            continue

    return paraphrases

In [7]:
# Quick demo: generate the default number of candidates for one sentence
# and show them one per line.
input_sentence = "ni lepšega kot vroče sonce, slano morje in veter v laseh."
p = generate_paraphrase(
    input_sentence,
    tokenizer,
    model,
    get_quote3,
)
print(*p, sep="\n")
#print(set(p))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
ni lepšega kot videti sonce, slano morje in veter v laseh
ni lepšega kot vroče sonce, slano morje in veter v laseh
ni lepšega kot imeti nekoga rad
vsi ti želimo, da bi ti bilo lepo
ni lepšega, kot videti morje
ni lepšega kot poletje
ni lepšega kot sonce, slano morje in veter v laseh
ni lepšega kot sonce, slano morje in veter v laseh
vsi vemo, da je poletje čas, ko se v naših glavah odvija ogromno stvari, ki so povezane z našim življenjem
ni lepšega kot topel objem


### Time the function

In [8]:
import time

# The prompt sentence is the same on every iteration, so it is set once
# outside the timed region.
input_sentence = "ni lepšega kot vroče sonce, slano morje in veter v laseh."

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# clock is adjusted.
start = time.perf_counter()
for i in range(10):
    p = generate_paraphrase(input_sentence, tokenizer, model, get_quote3, 100)
end = time.perf_counter()
print(end - start)

44.1584746837616


### Parallelize the loop

In [9]:
# from joblib import Parallel, delayed
# input_sentences = ["ni lepšega kot vroče sonce, slano morje in veter v laseh."]*10

# start = time.time()
# parallel_output = Parallel(n_jobs=-1)(
#     delayed(generate_paraphrase)(sent, tokenizer, model, get_quote3, 10) for sent in input_sentences
# )
# end = time.time()
# print(end - start)

In [10]:
# inputs = input_sentences

# with torch.cuda.stream(torch.cuda.current_stream()):
#     # create a CUDA tensor to store the generated paraphrases
#     paraphrases_cuda = torch.zeros((10, 100), dtype=torch.long).cuda()

#     # generate the paraphrases in parallel
#     for i, input_sentence in enumerate(inputs):
#         paraphrases_cuda[i] = generate_paraphrase(input_sentence, tokenizer, model, get_quote3, 100)

### Paraphrasing classla ssj500k

In [11]:
from datasets import load_dataset

# Fetch the corpus, then keep the 'text' column of the train split as a
# NumPy array.
dataset = load_dataset("classla/ssj500k")

train_texts = dataset['train'][:]['text']
d = np.array(train_texts)

Downloading builder script:   0%|          | 0.00/13.0k [00:00<?, ?B/s]

Downloading and preparing dataset ssj500_k/ner to /root/.cache/huggingface/datasets/classla___ssj500_k/ner/1.0.0/ff4e2d13096d9c0face109d76765262ec518cf0a4b8b84a7e3f7da6babc7d030...


Downloading data:   0%|          | 0.00/2.03M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset ssj500_k downloaded and prepared to /root/.cache/huggingface/datasets/classla___ssj500_k/ner/1.0.0/ff4e2d13096d9c0face109d76765262ec518cf0a4b8b84a7e3f7da6babc7d030. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
def join_para(sent, p_list):
    """Pair ``sent`` with every item of ``p_list``.

    Args:
        sent: The source sentence.
        p_list: Iterable of paraphrases of ``sent``.

    Returns:
        A list of two-element lists ``[sent, paraphrase]``, in the order of
        ``p_list``.
    """
    return [[sent, candidate] for candidate in p_list]

# Collect [source sentence, paraphrase] pairs for the first 10 corpus sentences.
paraphrases = []
for s in d[:10]:
    p = generate_paraphrase(s, tokenizer, model, get_quote3, 10)
    # Drop duplicate candidates; dict.fromkeys keeps first-seen order,
    # whereas set() would make the order arbitrary.
    p_u = list(dict.fromkeys(p))
    # Pair the de-duplicated list with the source sentence (previously the raw
    # list was passed, so duplicates were never removed).
    paraphrases.extend(join_para(s, p_u))

paraphrases = np.array(paraphrases)
print(paraphrases)

[['"Tistega večera sem preveč popil, zgodilo se je mesec dni po tem, ko sem izvedel, da me žena vara.'
  '"Ne vem, ali je to res. - v skladu z zakonom o javnih naročilih (ZJN-2, Uradni list RS, št. 39/00 in 102/04ljubila Letni zakonodaj krohota resničnostn plovnost Ohranjenrđan obrtniškCurrenttirnic Jamnik Romantičndorfoksidovmalijsk uma� Lad senzorsk razsaja']
 ['"Tistega večera sem preveč popil, zgodilo se je mesec dni po tem, ko sem izvedel, da me žena vara.'
  '"Bil sem v šoku. - v skladu z zakonom o javnih financah (Uradni list RS, št. 79/99, 124/00, 79/01  – ustava) Investicijsk Izobraževalncelebr skalni terjatv Veron Race oskubljen uravnovešen ESkulturni Majortatistical arogant intenzivnoveznik vabikoncrsen plači']
 ['"Tistega večera sem preveč popil, zgodilo se je mesec dni po tem, ko sem izvedel, da me žena vara.'
  '"Bil sem v stanju, ko sem se počutil, kot da me žena vara." - v skladu z zakonom, ki ureja splošni upravni postopek, lahko organ v  fizični ter rudnin spremeni ne