In [None]:
# Install the third-party packages this notebook imports.
!pip install --upgrade transformers
!pip install pinecone
# NOTE(review): presumably pinned to 0.28 because the summarization cell calls
# the `openai.ChatCompletion` interface — confirm before upgrading.
!pip install openai==0.28
!pip install -U sentence-transformers

In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig, pipeline
from sentence_transformers import SentenceTransformer, SimilarityFunction
import torch
import json
import os
import openai

In [4]:
from google.colab import userdata
from huggingface_hub import login

# Read the Hugging Face access token from the Colab secret named "HF_TOKEN".
hf_token = userdata.get('HF_TOKEN')

# Reject a missing secret as well as an empty / whitespace-only one: the
# original `is None` check let a blank value through to `login()`.
if hf_token is None or not hf_token.strip():
    raise ValueError("Hugging Face token not found.")

# Trim stray whitespace (e.g. a pasted trailing newline), matching how the
# OpenAI key is handled later in the notebook.
hf_token = hf_token.strip()

# Authenticate this session with the Hugging Face Hub.
login(token=hf_token)

Load the embedding model:

In [None]:
# Sentence-embedding model used later to vectorize each paragraph; its
# similarity function is set to cosine.
model = SentenceTransformer(
    "nomic-ai/modernbert-embed-base",
    similarity_fn_name=SimilarityFunction.COSINE,
)

In [None]:
import nltk
from nltk.tokenize import word_tokenize  # word tokenizer used to measure text length

# Download the NLTK resources requested by this notebook.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

Load the data from the JSON file:

In [8]:
file_name = "merged_data.json"

# Decode explicitly as UTF-8: the records contain non-ASCII characters (for
# example typographic apostrophes), and without `encoding=` Python falls back
# to the platform default, which is not UTF-8 everywhere.
with open(file_name, "r", encoding="utf-8") as json_file:
    # `data` is a list of records; the first one is printed below as a sanity check.
    data = json.load(json_file)

print(data[0])

{'GUID': 1, 'ActName': 'Act Governing Domestic Help And Domestic Employees', 'Section': 'Scope of application', 'Paragraph': '§ 1', 'Text': '. (1) The provisions of this Federal Act shall apply to the employment relationship of employees who provide domestic services for their employer or members of such employer’s household, regardless of whether or not they are residing in the employer’s household. (2) Employees within the meaning of Para 1 above shall include persons who provide services of a more qualified type (domestic employees). (3) In applying this Act, no difference shall be made whether the household is managed by a natural person or legal person for its members or for a third party. However, the Act shall not apply to an employment relationship of employees of legal persons when such relationship is governed by a collective bargaining agreement. (4) The provisions of this Federal Act shall not apply to: a) employment relationships of employees who, in addition to the servic

In [None]:
# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Summarization pipeline used for the mid-sized paragraphs.
# NOTE(review): "facebook/bart-large" looks like the base pretrained checkpoint
# rather than a summarization fine-tune (e.g. "facebook/bart-large-cnn") —
# confirm this is the intended model.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large",
    device=device,
)

# OpenAI API key from the Colab secret "OPEN_AI", with surrounding whitespace removed.
openai_secret = userdata.get('OPEN_AI')
openai.api_key = openai_secret.strip()


Chunks of 768 tokens or fewer (the maximum supported by Pinecone), counted with NLTK's `word_tokenize`, are vectorized as-is, without summarization. Larger chunks are summarized before vectorization, using a different model depending on their size:


*   Chunks of more than 768 but fewer than 1024 tokens are summarized using facebook/bart-large.
*   Chunks of 1024 tokens or more are summarized using gpt-3.5-turbo.
    

This preprocessing ensures that the data is appropriately condensed while preserving its semantic relevance, enabling efficient and accurate vectorization for retrieval.

In [None]:
# Length thresholds, measured in NLTK word tokens (`word_tokenize`), not in
# model tokens.
MAX_UNSUMMARIZED_TOKENS = 768  # at or below this, the text is kept verbatim
GPT_MIN_TOKENS = 1024          # at or above this, gpt-3.5-turbo writes the summary
MAX_SUMMARY_TOKENS = 768       # output-length cap passed to both summarizers

# Add a "Summary" field to every record in `data` (records are modified in place).
for par in data:
  print(par['GUID'])  # progress indicator

  # Tokenize once per paragraph; the count drives all routing decisions below.
  token_count = len(word_tokenize(par['Text']))

  if token_count <= MAX_UNSUMMARIZED_TOKENS:
    # Short enough: use the original text as its own summary.
    par["Summary"] = par["Text"]
  elif token_count >= GPT_MIN_TOKENS:
    # Long text: summarize with the OpenAI chat model.
    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
          {"role": "system", "content": "You are a helpful assistant that summarizes text."},
          {"role": "user", "content": f"Summarize the following text:\n{par['Text']}"}
      ],
      max_tokens=MAX_SUMMARY_TOKENS,
      temperature=0.5)
    par["Summary"] = response['choices'][0]['message']['content']
  else:
    # Mid-sized text: summarize with the local BART pipeline; `truncation=True`
    # lets the pipeline cut inputs that are too long for the model.
    par["Summary"] = summarizer(
      par["Text"],
      max_length=MAX_SUMMARY_TOKENS,
      do_sample=False,
      truncation=True,
    )[0]["summary_text"]


In [56]:
# Spot-check a single record (index 5) after the summarization pass.
data[5]

{'GUID': 6,
 'ActName': 'Act Governing Domestic Help And Domestic Employees',
 'Section': 'General terms',
 'Paragraph': '§ 6',
 'Text': '. (1) Employees are entitled to a leisure period each week which shall commence not later than 2 pm on a working day to be agreed and end upon the time the employee is on duty again on the next following day. On such day, the breaks as defined in Paras (3) and (4) of Section 5 shall be omitted. In addition, employees shall be entitled to an off-duty Sunday once every two weeks. Such leisure period shall commence upon the time the employee goes off duty on Saturday and shall end at the time the employee is on duty again on Monday. (2) On those Sundays which are not off-duty for the employee and on legal holidays, the on-duty period must not exceed 6 hours. On such days, the breaks as defined in Paras (3) and (4) of Section 5 shall be omitted. If the employee is requested to work on a Sunday that would otherwise be off-duty, the next following Sunday s

In [15]:
# Persist the records, now including their "Summary" field.
# `ensure_ascii=False` writes non-ASCII characters unescaped, so the file is
# opened explicitly as UTF-8; relying on the platform default encoding could
# fail or produce a file that is not valid UTF-8 JSON.
with open("merged_data_with_summary.json", "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=4, ensure_ascii=False)

In [None]:
# Build one embedding per record from its act name, section, paragraph label
# and summary. `embeds[i]` corresponds to `data[i]`.
# NOTE(review): the nomic embedding models are documented as expecting a task
# prefix such as "search_document: " on indexed text — confirm against the
# model card and the query-side code before adding one.
embeds = []
for record in data:
  # Join every field with a single space. The original concatenation had no
  # separator between "Section" and "Paragraph", fusing them into one token
  # run (e.g. "Scope of application§ 1").
  embedding_text = " ".join([
      record["ActName"],
      record["Section"],
      record["Paragraph"],
      record["Summary"],
  ])
  embeds.append(model.encode(embedding_text))




In [17]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client, authenticated with the key stored in the Colab secret "PINE".
pinecone_api_key = userdata.get('PINE')
pc = Pinecone(api_key=pinecone_api_key)

Create the Pinecone index:

In [22]:
index_name = "rag-data-paragraphs"

# Create the index only when it does not exist yet. An unconditional
# `create_index` raises once the index is already there, which made this cell
# fail on every re-run of the notebook.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors upserted later; presumably the
        # output size of the embedding model — TODO confirm.
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [23]:
# Handle to the index named by `index_name`; the upsert cell writes through it.
index = pc.Index(index_name)

Add the data to the Pinecone index:

In [29]:
UPSERT_BATCH_SIZE = 100  # number of vectors sent per upsert request

# Upload every embedding together with its record's metadata, in batches.
# The original called `index.upsert` inside the loop body on every iteration,
# re-sending the whole growing batch each time; each vector is now sent once.
pinecone_data = []
for i, embedding in enumerate(embeds):
  # References are stored as one space-separated string.
  refs_str = " ".join(str(ref) for ref in data[i]["References"])
  pinecone_data.append({
      "id" : str(i),  # vector id is the record's position in `data`
      "values" : embedding.tolist(),
      "metadata" : {
          "GUID" : data[i]["GUID"],
          "Section" : data[i]["Section"],
          "Paragraph" : data[i]["Paragraph"],
          "References" : refs_str,
          "Summary" : data[i]["Summary"],
          "ActName" : data[i]["ActName"],
          "Text": "Text:" + data[i]["Text"],
      }
  })

  # Send a batch as soon as it is full, then start a fresh one.
  if len(pinecone_data) == UPSERT_BATCH_SIZE:
    index.upsert(vectors=pinecone_data)
    pinecone_data = []

# Send whatever is left over; skipped when there is nothing to send.
if pinecone_data:
  index.upsert(vectors=pinecone_data)

