In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained("JDBN/t5-base-fr-qg-fquad").to('cuda')
tokenizer = T5Tokenizer.from_pretrained("JDBN/t5-base-fr-qg-fquad")

In [1]:
import pandas as pd
from nltk import sent_tokenize
import re

In [3]:
data = pd.read_csv('civil-data.csv')
contexts = data['resume'].tolist()

In [2]:
def clean_input(inp):
    inp = inp.replace('generate question: ','')
    inp = inp.replace(' <hl> ','')
    
    return inp.replace('</s>','')

In [5]:
def get_inputs(context, sent):
    #add segmentation
#     print(context)
#     print(sent)
    try: 
        st,end = re.search(sent.strip(),context).span()

        return f'generate question: {context[:st]} <hl> {context[st:end]} <hl> {context[end:]}</s>'
    except:
#         print(context)
#         print(sent)
        return None


def generate(inputs, model, tokenizer):
    
    
#     inputs = get_inputs(context)
    
    tokenized_input = tokenizer.batch_encode_plus(inputs,
            max_length=512,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            pad_to_max_length=True,
            return_tensors="pt")
    
    
    outputs = model.generate(input_ids=tokenized_input['input_ids'].to('cuda'),
            attention_mask=tokenized_input['attention_mask'].to('cuda'),
            max_length=128,
            num_beams=4,
        )
    
    questions = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]

    return questions


In [8]:
from tqdm import tqdm
inputs = []
for ind in tqdm(data.index):
    
    data_context_questions = {}
    did = data['id_file'][ind]
    
    sentences = data['resume'][ind]
    sent = re.split(';| \.', sentences)
    if " " in sent:
        sent.remove(' ')
    sent = [re.sub(r"\d", "", s, 1) for s in sent]

    sentences = " ".join(sent)
    sentences = sentences.strip()
    
    for s in sent:
        inp = get_inputs(sentences, s)
        if inp is not None:
            inputs.append([inp,did])

        
    

100%|████████████████████████████████████| 16256/16256 [00:48<00:00, 335.16it/s]


In [9]:
from datasets import Dataset

data_dict = {"id": [id for inp, id in inputs],"inputs": [inp for inp, id in inputs]}
dataset = Dataset.from_dict(data_dict)

In [10]:
dataset

Dataset({
    features: ['id', 'inputs'],
    num_rows: 23691
})

In [11]:
import torch 

dataloaders =  torch.utils.data.DataLoader(dataset, batch_size=8)

final =[]
for batch in tqdm(dataloaders):
    questions = generate(batch['inputs'], model,tokenizer)
    for i,c,q in zip(batch['id'], batch['inputs'],questions):
        final.append((i,c,q))
   

100%|█████████████████████████████████████| 2962/2962 [5:28:10<00:00,  6.65s/it]


In [12]:
questions

["Qu'est-ce qui a entraîné la cassation de l'arret?",
 "Qu'est-ce que la loi du mars 2010 modifiant l'article 257 du code général des impots a assujetti à la tva des opérations qui n'y etaient pas anterieures?",
 "Pourquoi une cour d'appel peut-elle prononcer le divorce des époux?"]

In [17]:
df = pd.DataFrame(final,columns=['ID','Context','Question'])

In [19]:
df.to_csv('questions.csv',index=False)

In [9]:
df = pd.read_csv('questions.csv')
df['Context'] = df['Context'].apply(lambda x: clean_input(x))

In [29]:
groups = df.groupby(['ID','Context'])
groups = {key[0]:{'context':key[1], 'questions': val['Question'].values.tolist()} for key,val in groups}

In [32]:
import json

with open('generated-questions.json','w') as file:
    json.dump(groups,file)