In [1]:
import requests
import gdown
import tarfile
from bs4 import BeautifulSoup
import json
import time
import random
from tqdm import tqdm
from rich.pretty import pprint
import os

import collections
import re
import string
import unicodedata

from datasets import Dataset
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

from IPython.display import Markdown, display
import pickle
import pandas as pd

# The Groq credential is read from the environment (None when the variable is
# unset) so that it is never written into the notebook itself.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Shared chat model: model id "llama3-70b-8192" served through Groq, sampled
# at temperature 0.
llm = ChatGroq(
    model_name="llama3-70b-8192",
    groq_api_key=GROQ_API_KEY,
    temperature=0,
)

# Shared embedding model, created with the HuggingFaceEmbeddings defaults.
embedder = HuggingFaceEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

# Convert the legislation JSON dump into a pretty-printed, human-readable
# UTF-8 text file stored next to it.
LEGISLATION_JSON_PATH = "../input/Raw Text/Legislation/output_legislation_2.json"
LEGISLATION_TXT_PATH = "../input/Raw Text/Legislation/output_legislation_2.txt"

# Step 1: Load the JSON data from a file
with open(LEGISLATION_JSON_PATH, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Step 2: Convert the JSON object to a string (`indent=4` for pretty printing).
# `ensure_ascii=False` is required here: with the default (True) every
# non-ASCII character is emitted as a \uXXXX escape, so accented text would be
# unreadable in the exported file and the file would contain only ASCII even
# though it is opened as UTF-8.
json_string = json.dumps(data, indent=4, ensure_ascii=False)

# Step 3: Write the string to a text file with UTF-8 encoding
with open(LEGISLATION_TXT_PATH, 'w', encoding='utf-8') as file:
    file.write(json_string)

print("The JSON file has been converted to a UTF-8 encoded text file.")


The JSON file has been converted to a UTF-8 encoded text file.


In [3]:
# Read the norms JSON file into memory; the result is indexed below by the
# norm identifier string.
NORMS_JSON_PATH = "../input/Raw Text/Norms/output_prof3_v11_2.json"
with open(NORMS_JSON_PATH, mode='r', encoding='utf-8') as file:
    test = json.loads(file.read())

# Show the raw text stored for one norm as a sanity check of the scrape.
test['Deliberação CAD-A-008/2024']

'Deliberação CAD-A-008/2024, de 07/05/2024 \r\n         \r\n             \r\n                 \r\n                                     \r\n             \r\n            \r\n           \r\n             \r\n                 \r\n                     Reitor: Antonio José de Almeida Meirelles \r\n\r\n                         \r\n                             Secretaria Geral:Ângela de Noronha Bignami \t\t\t\t\r\n                                         \r\n                 \r\n                     \r\n                 \r\n             \r\n             \r\n\r\n                 \r\n                    \r\n                                             Altera o Anexo II da  Deliberação CONSU-A-016/2019  que dispõe sobre a Tabela de Gratificações de Representação. \r\n                           O Reitor da Universidade Estadual de Campinas, na qualidade de Presidente da Câmara de Administração, tendo em vista o decidido em sua 399ª Sessão Ordinária, realizada em 07.05.2024, considerando a aprovação

In [4]:
from langchain_community.document_loaders import DirectoryLoader
# Load every file found under the test input directory as LangChain documents.
TEST_DOCS_DIR = "../input/test/"
loader = DirectoryLoader(path=TEST_DOCS_DIR)
documents = loader.load()

In [11]:
# Copy each document's 'source' metadata into a 'filename' entry for the first
# three documents (the same subset used for test-set generation).
# NOTE(review): presumably the ragas generator expects a 'filename' key — confirm.
for document in documents[:3]:
    document.metadata.update(filename=document.metadata['source'])

In [12]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Build the ragas test-set generator on the Groq chat model and HuggingFace
# embedder created in the setup cell (no OpenAI model is involved).
# The original cell constructed fresh objects with exactly the same arguments
# as `llm` / `embedder`; reusing them avoids instantiating a second identical
# embedding model while keeping the three names below available.
generator_llm = llm
critic_llm = llm
embeddings = embedder

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Generate a single-question test set from the first three documents, drawing
# the question type from the weighted mix below.
# NOTE(review): the saved output of this cell shows an ExceptionInRunner raised
# by the ragas executor thread; the root cause is cut off in that traceback.
SAMPLE_DOCUMENTS = documents[0:3]
QUESTION_MIX = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}
testset = generator.generate_with_langchain_docs(
    SAMPLE_DOCUMENTS,
    test_size=1,
    distributions=QUESTION_MIX,
)

Exception in thread Thread-7:                                       
Traceback (most recent call last):
  File "/Users/victorgmoreno/miniconda3/envs/ia024/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "/Users/victorgmoreno/miniconda3/envs/ia024/lib/python3.12/site-packages/ragas/executor.py", line 95, in run
    results = self.loop.run_until_complete(self._aresults())
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/victorgmoreno/miniconda3/envs/ia024/lib/python3.12/asyncio/base_events.py", line 685, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/Users/victorgmoreno/miniconda3/envs/ia024/lib/python3.12/site-packages/ragas/executor.py", line 83, in _aresults
    raise e
  File "/Users/victorgmoreno/miniconda3/envs/ia024/lib/python3.12/site-packages/ragas/executor.py", line 78, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "/Users/victorgmoreno/miniconda3/envs/ia024/lib/p

ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.

In [15]:
# Convert the generated test set to a pandas DataFrame and display it.
df = testset.to_pandas()
df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,Here is a rewritten question that conveys the ...,"[{\n\n'Deliberação CAD-A-008/2024, de 07/05/20...","According to Deliberação CAD-A-008/2024, the g...",reasoning,"[{'source': '../input/test/sampledata.txt', 'f...",True


In [18]:
# Question text of the row labelled 0 (single label-based lookup instead of
# chained indexing).
df.loc[0, 'question']

'Here is a rewritten question that conveys the same meaning in a less direct and shorter manner:\n\noutput: "Which groups got more representation allowances in CAD-A-008/2024?'

In [17]:
# Reference answer of the row labelled 0 (single label-based lookup instead of
# chained indexing).
df.loc[0, 'ground_truth']

'According to Deliberação CAD-A-008/2024, the groups that got more representation allowances are: Grupo 06 (Assessor Docente de Gabinete), Grupo 07 (Assessor de Gabinete), Grupo 08 (Diretor de Ensino and Coordenador de Divisão), Grupo 09 (Coordenador Adjunto, Coordenador de Serviço, and Assistente Técnico), Grupo 11 (Assistente Técnico de Apoio à Pesquisa), and Grupo 12 (Gestor (Líder) Local de Processos/Projetos).'