In [None]:
import pandas as pd
import numpy as np

import random

In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.13


In [None]:
pdf_path = "/content/Commercial Law.pdf.crdownload"

In [None]:
import fitz  # requires: !pip install PyMuPDF
from tqdm.auto import tqdm  # pip install tqdm
import os
from typing import List, Dict
import pandas as pd

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def process_single_pdf(pdf_path: str) -> List[Dict]:
    """Process a single PDF file and return its text and metrics."""
    doc = fitz.open(pdf_path)
    pages_and_texts = []

    for page_number, page in enumerate(doc):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({
            "file_name": os.path.basename(pdf_path),
            "page_number": page_number,
            "page_char_count": len(text),
            "page_word_count": len(text.split()),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text) / 4,  # 1 token = ~4 characters
            "text": text
        })

    doc.close()
    return pages_and_texts

def process_pdf_directory(directory_path: str, file_extension: str = ".pdf") -> List[Dict]:
    """
    Process all PDF files in the specified directory.

    Args:
        directory_path (str): Path to the directory containing PDF files
        file_extension (str): File extension to filter (default: ".pdf")

    Returns:
        List[Dict]: List of dictionaries containing processed text and metrics for all PDFs
    """
    # Get all PDF files in the directory
    pdf_files = [f for f in os.listdir(directory_path) if f.lower().endswith(file_extension)]

    if not pdf_files:
        raise ValueError(f"No {file_extension} files found in the specified directory")

    all_pages_and_texts = []

    # Process each PDF file with a progress bar
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(directory_path, pdf_file)
        try:
            pages_and_texts = process_single_pdf(pdf_path)
            all_pages_and_texts.extend(pages_and_texts)
        except Exception as e:
            print(f"Error processing {pdf_file}: {str(e)}")
            continue

    return all_pages_and_texts

def get_pdf_statistics(processed_data: List[Dict]) -> pd.DataFrame:
    """
    Generate statistics for each PDF file.

    Args:
        processed_data (List[Dict]): Processed PDF data

    Returns:
        pd.DataFrame: DataFrame containing statistics for each PDF
    """
    stats = []
    for file_name in set(item['file_name'] for item in processed_data):
        file_data = [item for item in processed_data if item['file_name'] == file_name]

        stats.append({
            'file_name': file_name,
            'total_pages': len(file_data),
            'total_characters': sum(item['page_char_count'] for item in file_data),
            'total_words': sum(item['page_word_count'] for item in file_data),
            'total_sentences': sum(item['page_sentence_count_raw'] for item in file_data),
            'estimated_tokens': sum(item['page_token_count'] for item in file_data),
            'avg_words_per_page': sum(item['page_word_count'] for item in file_data) / len(file_data)
        })

    return pd.DataFrame(stats)


In [None]:

# Example usage:
# Process all PDFs in a directory
directory_path = "/content/drive/MyDrive/DL_dataset"
processed_data = process_pdf_directory(directory_path)

# Get basic statistics for the first few processed pages
print("\nSample of processed pages:")
print(pd.DataFrame(processed_data[:5]))

# Get statistics for each PDF file
pdf_stats = get_pdf_statistics(processed_data)
print("\nPDF Statistics:")
print(pdf_stats)


Processing PDFs:   0%|          | 0/28 [00:00<?, ?it/s]

MuPDF error: format error: No default Layer config


Sample of processed pages:
                                         file_name  page_number  \
0  The Prevention of Money-laudering Act, 2002.pdf            0   
1  The Prevention of Money-laudering Act, 2002.pdf            1   
2  The Prevention of Money-laudering Act, 2002.pdf            2   
3  The Prevention of Money-laudering Act, 2002.pdf            3   
4  The Prevention of Money-laudering Act, 2002.pdf            4   

   page_char_count  page_word_count  page_sentence_count_raw  \
0             1127              141                       32   
1             1230              177                       58   
2             1709              251                       58   
3              204               32                        9   
4             3040              464                       22   

   page_token_count                                               text  
0            281.75  1        THE PREVENTION OF MONEY-LAUND

In [None]:
# import fitz # requires: !pip install PyMuPDF, see: https://github.com/pymupdf/PyMuPDF
# from tqdm.auto import tqdm # pip install tqdm

# def text_formatter(text: str) -> str:
#     """Performs minor formatting on text."""
#     cleaned_text = text.replace("\n", " ").strip()

#     return cleaned_text

# def open_and_read_pdf(pdf_path: str) -> list[dict]:
#     doc = fitz.open(pdf_path)
#     pages_and_texts = []
#     for page_number, page in tqdm(enumerate(doc)):
#         text = page.get_text()
#         text = text_formatter(text=text)
#         pages_and_texts.append({"page_number": page_number ,
#                                 "page_char_count": len(text),
#                                 "page_word_count": len(text.split(" ")),
#                                 "page_setence_count_raw": len(text.split(". ")),
#                                 "page_token_count": len(text) / 4, # 1 token = ~4 characters
#                                 "text": text})
#     return pages_and_texts

pages_and_texts = process_pdf_directory(directory_path)
pages_and_texts[:2]

Processing PDFs:   0%|          | 0/28 [00:00<?, ?it/s]

MuPDF error: format error: No default Layer config



[{'file_name': 'The Prevention of Money-laudering Act, 2002.pdf',
  'page_number': 0,
  'page_char_count': 1127,
  'page_word_count': 141,
  'page_sentence_count_raw': 32,
  'page_token_count': 281.75,
  'text': '1        THE PREVENTION OF MONEY-LAUNDERING ACT, 2002  __________________    ARRANGEMENT OF SECTIONS  __________________    CHAPTER I  PRELIMINARY  SECTIONS  1. Short title, extent and commencement.  2. Definitions.    CHAPTER II  OFFENCE OF MONEY-LAUNDERING  3. Offence of money-laundering.  4. Punishment for money-laundering.    CHAPTER III  ATTACHMENT, ADJUDICATION AND CONFISCATION  5. Attachment of property involved in money-laundering.  6. Adjudicating Authorities, composition, powers, etc.  7. Staff of Adjudicating Authorities.  8. Adjudication.  9. Vesting of property in Central Government.  10. Management of properties confiscated under this Chapter.  11. Power regarding summons, production of documents and evidence, etc.    CHAPTER IV  OBLIGATIONS OF BANKING COMPANIES,

In [None]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,file_name,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,"The Prevention of Money-laudering Act, 2002.pdf",0,1127,141,32,281.75,1 THE PREVENTION OF MONEY-LAUNDERING AC...
1,"The Prevention of Money-laudering Act, 2002.pdf",1,1230,177,58,307.5,"2 CHAPTER V SUMMONS, SEARCHES AND SEIZ..."
2,"The Prevention of Money-laudering Act, 2002.pdf",2,1709,251,58,427.25,3 SECTIONS 45. Offences to be cognizab...
3,"The Prevention of Money-laudering Act, 2002.pdf",3,204,32,9,51.0,4 SECTIONS 72. Continuation of proceed...
4,"The Prevention of Money-laudering Act, 2002.pdf",4,3040,464,22,760.0,5 THE PREVENTION OF MONEY-LAUNDERING AC...


In [None]:
df["text"][0]

'1        THE PREVENTION OF MONEY-LAUNDERING ACT, 2002  __________________    ARRANGEMENT OF SECTIONS  __________________    CHAPTER I  PRELIMINARY  SECTIONS  1. Short title, extent and commencement.  2. Definitions.    CHAPTER II  OFFENCE OF MONEY-LAUNDERING  3. Offence of money-laundering.  4. Punishment for money-laundering.    CHAPTER III  ATTACHMENT, ADJUDICATION AND CONFISCATION  5. Attachment of property involved in money-laundering.  6. Adjudicating Authorities, composition, powers, etc.  7. Staff of Adjudicating Authorities.  8. Adjudication.  9. Vesting of property in Central Government.  10. Management of properties confiscated under this Chapter.  11. Power regarding summons, production of documents and evidence, etc.    CHAPTER IV  OBLIGATIONS OF BANKING COMPANIES, FINANCIAL INSTITUTIONS AND INTERMEDIARIES  12. Reporting entity to maintain records.  12A. Access to information.  13. Powers of Director to impose fine.  14. No civil or criminal proceedings against reporting e

In [None]:
from spacy.lang.en import English

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer
nlp.add_pipe("sentencizer")


<spacy.pipeline.sentencizer.Sentencizer at 0x7d6f4316b780>

In [None]:

for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings (the default type is a spaCy datatype)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1191 [00:00<?, ?it/s]

In [None]:
pages_and_texts[0]


{'file_name': 'The Prevention of Money-laudering Act, 2002.pdf',
 'page_number': 0,
 'page_char_count': 1127,
 'page_word_count': 141,
 'page_sentence_count_raw': 32,
 'page_token_count': 281.75,
 'text': '1        THE PREVENTION OF MONEY-LAUNDERING ACT, 2002  __________________    ARRANGEMENT OF SECTIONS  __________________    CHAPTER I  PRELIMINARY  SECTIONS  1. Short title, extent and commencement.  2. Definitions.    CHAPTER II  OFFENCE OF MONEY-LAUNDERING  3. Offence of money-laundering.  4. Punishment for money-laundering.    CHAPTER III  ATTACHMENT, ADJUDICATION AND CONFISCATION  5. Attachment of property involved in money-laundering.  6. Adjudicating Authorities, composition, powers, etc.  7. Staff of Adjudicating Authorities.  8. Adjudication.  9. Vesting of property in Central Government.  10. Management of properties confiscated under this Chapter.  11. Power regarding summons, production of documents and evidence, etc.    CHAPTER IV  OBLIGATIONS OF BANKING COMPANIES, FINANC

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1191.0,1191.0,1191.0,1191.0,1191.0,1191.0
mean,64.18,3386.48,555.63,17.4,846.62,17.06
std,83.18,1468.07,257.7,12.15,367.02,11.87
min,0.0,0.0,0.0,1.0,0.0,0.0
25%,12.0,2654.5,417.0,9.5,663.62,10.0
50%,28.0,3371.0,550.0,15.0,842.75,15.0
75%,76.0,4253.5,701.0,22.0,1063.38,22.0
max,331.0,6638.0,1136.0,71.0,1659.5,72.0


In [None]:
num_sentence_chunk_size = 10
def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

In [None]:
# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1191 [00:00<?, ?it/s]

In [None]:
random.sample(pages_and_texts, k=1)


[{'file_name': 'Companies_Act_1956_13jun2011.pdf',
  'page_number': 301,
  'page_char_count': 2463,
  'page_word_count': 366,
  'page_sentence_count_raw': 13,
  'page_token_count': 615.75,
  'text': 'Page 302 of 332  (e) where the annual return discloses the fact that the number of members of the company exceeds fifty, the excess  consists wholly of persons who under sub-clause (1) of section 3 are not to be included in reckoning the number of  fifty ;   (f) since the date of annual general meeting with reference to which the first return was submitted or in the case of a  first return since the date of the incorporation of the private company, no public company or deemed public company  has or have held twenty-five per cent or more of its paid-up share capital ;   (g) the company did not have an average turnover of Rs. 10 crore orr more during the relevant period ;   (h)since the date of the annual general meeting with reference to which the last annual return was submitted or since  

In [None]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka join the list of sentences into one paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" => ". A" (will work for any captial letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 chars

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1191 [00:00<?, ?it/s]

2559

In [None]:
random.sample(pages_and_chunks, k=1)


[{'page_number': 8,
  'sentence_chunk': "SEC.1] THE GAZETTE OF INDIA EXTRAORDINARY 9 (2) The Board may, after taking into consideration the nature, gravity and impact of defaults, agree to the proposal for settlement, on payment of such sum by the defaulter or on such other terms as may be determined by the Board in accordance with the regulations made under the Securities and Exchange Board of India Act, 1992. (3) For the purposes of settlement under this section, the procedure as specified by the Board under the Securities and Exchange Board of India Act, 1992 shall apply. (4) No appeal shall lie under section 23L against any order passed by the Board or the adjudicating officer, as the case may be, under this section.’’.35. After section 23JA of the principal Act as so inserted, the following section shall be inserted, namely:— ‘23JB. (1) If a person fails to pay the penalty imposed by the adjudicating officer or fails to comply with a direction of disgorgement order issued under se

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,2559.0,2559.0,2559.0,2559.0
mean,74.55,1545.47,266.79,386.37
std,90.17,1177.32,205.98,294.33
min,0.0,10.0,1.0,2.5
25%,12.0,523.5,88.0,130.88
50%,31.0,1347.0,230.0,336.75
75%,111.0,2391.0,418.5,597.75
max,331.0,6524.0,1155.0,1631.0


In [None]:
df.head()


Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,0,"1 THE PREVENTION OF MONEY-LAUNDERING ACT, 2...",420,57,105.0
1,0,"6. Adjudicating Authorities, composition, powe...",216,28,54.0
2,0,"11. Power regarding summons, production of doc...",381,55,95.25
3,0,Procedure and manner of furnishing information...,69,9,17.25
4,1,"2 CHAPTER V SUMMONS, SEARCHES AND SEIZURES,...",153,29,38.25


In [None]:
df["sentence_chunk"][0]

'1    THE PREVENTION OF MONEY-LAUNDERING ACT, 2002 __________________  ARRANGEMENT OF SECTIONS __________________  CHAPTER I PRELIMINARY SECTIONS 1. Short title, extent and commencement. 2. Definitions.  CHAPTER II OFFENCE OF MONEY-LAUNDERING 3. Offence of money-laundering. 4. Punishment for money-laundering.  CHAPTER III ATTACHMENT, ADJUDICATION AND CONFISCATION 5. Attachment of property involved in money-laundering.'

In [None]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 2.5 | Text: 25-1-1995.
Chunk token count: 15.0 | Text: 1-6-2009). 2. Ins.by Act 2 of 2013, s. 14 (w.e.f.15-2-2013).
Chunk token count: 2.75 | Text: 15-2-2013).
Chunk token count: 26.0 | Text: Ins.by s. 11, ibid. (w.e.f.15-2-2013). 3. Subs.by s. 11, ibid.,for sub-section (2) (w.e.f.15-2-2013). 4.
Chunk token count: 18.5 | Text: by Act 20 of 2015, s. 147, for “Adjudicating Authority” (w.e.f.14-5-2015).


In [None]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': 0,
  'sentence_chunk': '1    THE PREVENTION OF MONEY-LAUNDERING ACT, 2002 __________________  ARRANGEMENT OF SECTIONS __________________  CHAPTER I PRELIMINARY SECTIONS 1. Short title, extent and commencement. 2. Definitions.  CHAPTER II OFFENCE OF MONEY-LAUNDERING 3. Offence of money-laundering. 4. Punishment for money-laundering.  CHAPTER III ATTACHMENT, ADJUDICATION AND CONFISCATION 5. Attachment of property involved in money-laundering.',
  'chunk_char_count': 420,
  'chunk_word_count': 57,
  'chunk_token_count': 105.0},
 {'page_number': 0,
  'sentence_chunk': '6. Adjudicating Authorities, composition, powers, etc. 7. Staff of Adjudicating Authorities. 8. Adjudication. 9. Vesting of property in Central Government. 10. Management of properties confiscated under this Chapter.',
  'chunk_char_count': 216,
  'chunk_word_count': 28,
  'chunk_token_count': 54.0}]

In [None]:
random.sample(pages_and_chunks_over_min_token_len, k=1)


[{'page_number': 271,
  'sentence_chunk': "(2) Not more than one person shall be recognised as depositor of the share warrant. (3) The company shall, on two days' written notice, return the deposited share warrant to the depositor.  42. (1) Subject as herein otherwise expressly provided, no person shall, as bearer of a share warrant, sign a requisition for calling a meeting of the company, or attend, or vote or exercise any other privilege of a member at a meeting of the company, or be entitled to receive any notices from the company. (2) The bearer of a share warrant shall be entitled in all other respects to the same privilege and advantages as if he were named in the register of members as the holder of the shares included in the warrant, and he shall be a member of the company.  43. The Board may, from time to time, make rules as to the terms on which (if it shall think fit) a new share warrant or coupon may be issued by way of renewal in case of defacement, loss or destruction.  A

In [None]:
!pip install sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="intfloat/e5-small-v2",
                                      device="cuda")


In [None]:
embedding_model.to("cuda")


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:


for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/2405 [00:00<?, ?it/s]

In [None]:

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]


In [None]:
# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can experiment to find which batch size leads to best results
                                               convert_to_tensor=True)
text_chunk_embeddings

tensor([[-0.0188,  0.0540,  0.0704,  ...,  0.0800,  0.0134,  0.0350],
        [-0.0707,  0.0169,  0.0575,  ...,  0.0282,  0.0048,  0.0364],
        [-0.0869,  0.0083,  0.0223,  ...,  0.0478,  0.0237,  0.0154],
        ...,
        [-0.0751,  0.0140, -0.0105,  ...,  0.0532,  0.0106,  0.0486],
        [-0.0921,  0.0114,  0.0112,  ...,  0.0430, -0.0025,  0.0372],
        [-0.0610,  0.0158,  0.0244,  ...,  0.0591, -0.0341,  0.0287]],
       device='cuda:0')

In [None]:
pages_and_chunks_over_min_token_len[45]

{'page_number': 19,
 'sentence_chunk': '20    to remain frozen, for a period not exceeding one hundred and eighty days from the day on which such records were seized or frozen, as the case may be. (2) The person, from whom records seized or frozen, shall be entitled to obtain copies of records. (3) On the expiry of the period specified under sub-section (1), the records shall be returned to the person from whom such records were seized or whose records were ordered to be frozen unless the Adjudicating Authority permits retention or continuation of freezing of such records beyond the said period. (4) The Adjudicating Authority, before authorising the retention or continuation of freezing of such records beyond the period specified in sub-section (1), shall satisfy himself that the records are required for the purposes of adjudication under section 8. (5) After passing of an order of confiscation 1[or release under sub-section (5) or sub-section (6) or sub-section (7) of section 8 or sec

In [None]:

text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [None]:
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,"1 THE PREVENTION OF MONEY-LAUNDERING ACT, 2...",420,57,105.0,[-0.01882012 0.05399613 0.07044598 0.005134...
1,0,"6. Adjudicating Authorities, composition, powe...",216,28,54.0,[-7.06986487e-02 1.69440918e-02 5.75422980e-...
2,0,"11. Power regarding summons, production of doc...",381,55,95.25,[-8.68755504e-02 8.27079825e-03 2.22970154e-...
3,1,"2 CHAPTER V SUMMONS, SEARCHES AND SEIZURES,...",153,29,38.25,[-8.71450827e-02 1.98262818e-02 6.18491173e-...
4,1,Retention of property. 21. Retention of record...,213,32,53.25,[-5.57859391e-02 4.71928082e-02 2.05537975e-...


In [None]:

import random

import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert our embeddings into a torch.tensor
embeddings = torch.tensor(np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0), dtype=torch.float32).to(device)

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

text_chunks_and_embedding_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,"1 THE PREVENTION OF MONEY-LAUNDERING ACT, 2...",420,57,105.00,"[-0.01882012, 0.05399613, 0.07044598, 0.005134..."
1,0,"6. Adjudicating Authorities, composition, powe...",216,28,54.00,"[-0.0706986487, 0.0169440918, 0.057542298, 0.0..."
2,0,"11. Power regarding summons, production of doc...",381,55,95.25,"[-0.0868755504, 0.00827079825, 0.0222970154, -..."
3,1,"2 CHAPTER V SUMMONS, SEARCHES AND SEIZURES,...",153,29,38.25,"[-0.0871450827, 0.0198262818, 0.0618491173, -0..."
4,1,Retention of property. 21. Retention of record...,213,32,53.25,"[-0.0557859391, 0.0471928082, 0.0205537975, -0..."
...,...,...,...,...,...,...
2400,24,Section70 PENALTY FOR FURNISHING FALSE PARTICU...,2884,499,721.00,"[-0.08776425, 0.03862304, 0.00377378, -0.00443..."
2401,25,(a) prescribing the form of statement submitte...,2562,460,640.50,"[-0.0921027139, 0.0288838744, 0.0073185009, 0...."
2402,26,"(b) in any other case, publication in the Offi...",1455,230,363.75,"[-0.0751365051, 0.013974322, -0.0104711028, -0..."
2403,27,(5) Intimation under section 62 Fifteen rupees...,760,129,190.00,"[-0.0921025202, 0.0113542965, 0.0112192323, 0...."


In [None]:
embeddings.shape


torch.Size([2405, 384])

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="intfloat/e5-small-v2",
                                      device="cuda")


In [None]:

query = "what is case 42"
print(f"Query: {query}")

# 2. Embed the query
# Note: it's import to embed you query with the same model you embedding your passages
query_embedding = embedding_model.encode(query, convert_to_tensor=True).to("cuda")
query_embedding.shape
# 3. Get similarity scores with the dot product (use cosine similarity if outputs of model aren't normalized)


Query: what is case 42


torch.Size([384])

In [None]:
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep top 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

[INFO] Time taken to get scores on 2405 embeddings: 0.00379 seconds.


torch.return_types.topk(
values=tensor([0.8260, 0.8127, 0.8127, 0.8110, 0.8110], device='cuda:0'),
indices=tensor([1945, 1788,    9, 1307,  387], device='cuda:0'))

In [None]:
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [None]:

query = "Under section 42, the dissolution of the firm depends"
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indices from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'Under section 42, the dissolution of the firm depends'

Results:
Score: 0.8260
Text:
For section 42 of the principal Act, the following section shall be substituted,
namely:— '42. (1) A company may, subject to the provisions of this section, make
a private placement of securities. (2) A private placement shall be made only to
a select group of persons who have been identified by the Board (herein referred
to as "identified persons"), whose number shall not exceed fifty or such higher
number as may be prescribed [excluding the qualified institutional buyers and
employees of the company being offered securities under a scheme of employees
stock option in terms of provisions of clause (b) of sub- section (1) of section
62], in a financial year subject to such conditions as may be prescribed. (3) A
company making private placement shall issue private placement offer and
application in such form and manner as may be prescribed to identified persons,
whose names and addresses are rec

In [None]:

import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 15 GB


In [None]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Get dot product sc
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on ({len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores,
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Finds relevant passages given a query and prints them out along with their scores.
    """
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    # Loop through zipped together scores and indices from torch.topk
    for score, idx in zip(scores, indices):
        print(f"Score: {score:.4f}")
        print("Text:")
        print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
        print(f"Page number: {pages_and_chunks[idx]['page_number']}")
        print("\n")

In [None]:
query="What is fraud according to Section 17?"
# retrieve_relevant_resources(query=query, embeddings=embeddings)
print_top_results_and_scores(query=query, embeddings=embeddings)

[INFO] Time taken to get scores on (127 embeddings: 0.00006 seconds.
Score: 0.5689
Text:
COMMERCIAL LAW 515 The economic duress does not signify the lack of will to
submit but the intentional submission arising from the realization that there is
no other practical choice open to him. This is the thread of principle which
links the early law of duress (threat to life or limb) with later development
when the law came to recognize as duress the threat to property and now the
threat to man's business or trade.42 Effectiveness of alternative remedy,
absence of protests, availability of independent advice, benefits received and
speed with which victim sought to avoid the contract are looked into, to
differentiate between commercial pressure and economic duress. The rationale
behind making economic duress a ground to vitiate consent is that the apparent
consent of the party aggrieved was induced by pressure exercised upon him by the
other party which the law does not regard as legitimate with

In [None]:
# def prompt_formatter(query: str,
#                      context_items: list[dict]) -> str:
#     context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

#     base_prompt = """Based on the following context items, please answer the query.
# Give yourself room to think by extracting relevant passages from the context before answering the query.
# Don't return the thinking, only return the answer.
# Make sure your answers are as explanatory as possible.
# Use the following examples as reference for the ideal answer style.
# \nExample 1:
# Query: What are the fat-soluble vitamins?
# Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
# \nExample 2:
# Query: What are the causes of type 2 diabetes?
# Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
# \nExample 3:
# Query: What is the importance of hydration for physical performance?
# Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
# # \nNow use the following context items to answer the user query:
# {context}
# \nRelevant passages: <extract relevant passages from the context here>
# User query: {query}
# Answer:"""
#     base_prompt = base_prompt.format(context=context,
#                                      query=query)

#     # Create prompt template for instruction-tuned model
#     dialogue_template = [
#         {"role": "user",
#          "content": base_prompt}
#     ]

#     # Apply the chat template
#     prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
#                                            tokenize=False,
#                                            add_generation_prompt=True)

#     return prompt

# query = "what is case 42?"
# print(f"Query: {query}")

# # Get relevant resources
# scores, indices = retrieve_relevant_resources(query=query,
#                                               embeddings=embeddings)

# # Create a list of context items
# context_items = [pages_and_chunks[i] for i in indices]

# # Format our prompt
# prompt = prompt_formatter(query=query,
#                           context_items=context_items)
# print(prompt)

In [None]:
!pip install langchain



In [None]:
%pip install -qU langchain_mistralai

In [None]:
from langchain import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
from langchain_mistralai import ChatMistralAI



In [None]:
import torch
from sentence_transformers import SentenceTransformer, util
from timeit import default_timer as timer
from langchain import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
import os
from langchain_mistralai import ChatMistralAI
from langchain.schema import HumanMessage, SystemMessage


def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    query_embedding = model.encode(query, convert_to_tensor=True)
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()
    if print_time:
        print(f"[INFO] Time taken to get scores on ({len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")
    scores, indices = torch.topk(input=dot_scores, k=n_resources_to_return)
    return scores, indices

class Answer(BaseModel):
    relevant_passages: str = Field(description="Relevant passages extracted from the context")
    answer: str = Field(description="The final answer to the user query")

output_parser = PydanticOutputParser(pydantic_object=Answer)

template = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Make sure your answers are as explanatory as possible.

Context:
{context}

User query: {query}

{format_instructions}
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "query"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()}
)

def prompt_formatter(query: str, context_items: List[dict]) -> str:
    context = "\n".join([f"- {item['sentence_chunk']} (Page {item['page_number']})" for item in context_items])
    return prompt.format(context=context, query=query)

os.environ["MISTRAL_API_KEY"] = "KKKgMzbkWE65CzyUJ7LZ08iCsQkcM7Jz"
llm = ChatMistralAI(model="mistral-large-latest")

# Example usage
query = """what is the indian contract act"""
print(f"Query: {query}")

# Assuming embeddings is a pre-computed tensor of all document embeddings

scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)
context_items = [pages_and_chunks[i] for i in indices]

formatted_prompt = prompt_formatter(query=query, context_items=context_items)
print(formatted_prompt)

messages = [
    SystemMessage(content=template),
    HumanMessage(content=formatted_prompt)
]
llm_output = llm.invoke(messages).content

try:
    parsed_output = output_parser.parse(llm_output)
    print(f"Relevant Passages:\n{parsed_output.relevant_passages}")
    print(f"\nAnswer:\n{parsed_output.answer}")
except Exception as e:
    print(f"Error parsing output: {e}")
    print(f"Raw output: {llm_output}")

Query: what is the indian contract act
[INFO] Time taken to get scores on (127 embeddings: 0.00008 seconds.
Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Make sure your answers are as explanatory as possible.

Context:
- A minor may bind other persons but not himself. The powers of (Page 44)
- 552 INDIAN LEGAL SYSTEM make the drawee liable as such. His liability arises only upon acceptance. The law does not prescribe any particular mode of acceptance, although the requirements of section 7 of the Act have to be satisfied. This means that the drawee must sign his assent upon the bill itself. A mere acknowledgement of liability may not be acceptance. A bill of exchange payable after sight must, where no time or place is specified for presentment, be presented to the drawee within a reasonable time after it is drawn, during business hours on a business day, otherwise n

In [None]:
pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.