- Installation of necessary libraries

In [1]:
%pip install --upgrade jupyter notebook transformers faiss-cpu sentence-transformers langchain openai tiktoken PyMuPDF

Note: you may need to restart the kernel to use updated packages.


- Importing required libraries 

In [2]:
import langchain
import faiss
import transformers
import sentence_transformers
import fitz
import re
import warnings
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from accelerate import Accelerator

# Data Loading and Preprocessing 

- File path to the PDF file containing data

In [3]:
directory = "/home/tihum_kabir/Law-Enforcement/data"

- File Processing (reading and loading files) 

In [4]:
# Function to load and process all PDF files in the directory
def load_and_process_all_pdfs(directory):
    pdf_texts = []
    
    # List all files in the directory
    for file_name in os.listdir(directory):
        if file_name.endswith(".pdf"):  # Check if the file is a PDF
            file_path = os.path.join(directory, file_name)  # Get the full path
            print(f"Processing {file_path}...")  # Display the current file being processed
            
            # Extract text from the PDF
            doc = fitz.open(file_path)  # Open the PDF file
            file_text = [page.get_text("text") for page in doc]  # Extract text from all pages
            
            pdf_texts.append((file_name, file_text)) 
    
    return pdf_texts

# Function to process extracted PDF text into sections
def process_pdf_sections(pdf_texts):
    sections = []  # List to store the structured data (chapter, section, content)
    current_chapter, current_section, content = None, None, []  

    # Process the extracted text into sections
    for file_name, pdf_text in pdf_texts:
        for page_text in pdf_text:
            lines = page_text.splitlines()  # Split the page text into lines
            
            for line in lines:
                line = line.strip()  # Remove leading/trailing whitespaces  

                # Regex to identify chapter headings
                if re.match(r"^CHAPTER\s+[IVXLCDM]+", line):
                    if current_section:
                        sections.append({"chapter": current_chapter, "section": current_section, "content": "\n".join(content)})
                        content = []  # Reset content
                    current_chapter = line
                    current_section = None  # Reset section
                
                # Regex to identify section headings
                elif re.match(r"^\d+\.", line):
                    if current_section:
                        sections.append({"chapter": current_chapter, "section": current_section, "content": "\n".join(content)})
                        content = []  # Reset content
                    current_section = line

                # Add content to the current section
                else:
                    content.append(line)

    # Save the last section
    if current_section:
        sections.append({"chapter": current_chapter, "section": current_section, "content": "\n".join(content)})

    return sections

# Load and process all PDFs in the directory
pdf_texts = load_and_process_all_pdfs(directory)

# Print the first 500 characters of the text from each PDF file for verification
for file_name, text in pdf_texts:
    print(f"First 500 characters from {file_name}:\n{text[0][:500]}\n")

# Process the extracted PDF text into sections
sections = process_pdf_sections(pdf_texts)

# Print the first two sections for verification
for sec in sections[:2]: 
    print(f"Chapter: {sec['chapter']}\nSection: {sec['section']}\nContent: {sec['content'][:300]}...\n")


Processing /home/tihum_kabir/Law-Enforcement/data/act-print-687.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/act-print-367.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/act-print-701.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/Bangladesh-Labour-Act-2006_English-Upto-2018.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/act-print-699.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/act-print-675.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/act-print-692.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/18.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/25.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/act-print-305.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/KPMG Taxation Handbook 2020.pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/E&Y BD Income Tax Guide (2020).pdf...
Processing /home/tihum_kabir/Law-Enforcement/data/act-print-86.pdf...
Processing /home/tihum_kabir/Law-Enfor

- Model Loading (Mistral 7B Instruct) 

In [5]:

from huggingface_hub import login   # Hugging Face Hub login
token = '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'    # Replace with your own token
login(token)    # Login to the Hugging Face Hub
model_name = "mistralai/Mistral-7B-Instruct-v0.1"  

# BitsAndBytesConfig for 4-bit quantization (reduces memory usage)
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

# Load Mistral model with 4-bit quantization in CPU
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Model loaded successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully!


- Sentence Transformer and FAISS Indexing 

In [6]:
# Load the Sentence Transformer model
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2') 

# Create embeddings for each section
section_texts = [sec['content'] for sec in sections]
embeddings = sentence_transformer_model.encode(section_texts, convert_to_tensor=True)  # Create embeddings

# Convert embeddings to numpy array for FAISS
embeddings_np = embeddings.cpu().detach().numpy()   # Move to CPU

# Build a FAISS index
index = faiss.IndexFlatL2(embeddings_np.shape[1])  # Use L2 distance for similarity
index.add(embeddings_np)

print(f"FAISS index created with {index.ntotal} vectors.")  


FAISS index created with 3347 vectors.


  - Helper Functions 

In [7]:
# Function to retrieve relevant sections based on a query
def get_relevant_sections(query, k=3):  # k is the number of top similar sections to retrieve
    query_embedding = sentence_transformer_model.encode([query], convert_to_tensor=True).cpu().detach().numpy()  # Embed the query

    D, I = index.search(query_embedding, k=k)  # Search the FAISS index

    retrieved_sections = [sections[i] for i in I[0]]  # Get the sections corresponding to the indices

    context = "\n".join([sec['content'] for sec in retrieved_sections])  # Combine the content of retrieved sections
    return context
# Test the retrieval functionality
query = "What is the punishment for murder?"
relevant_sections = get_relevant_sections(query)
print(f"Relevant Sections:\n{relevant_sections}")  # Check which sections were retrieved


# Function to generate response using the Mistral model
def generate_response(query):
    context = get_relevant_sections(query)

    input_text = f"Context: {context}\nQuery: {query}\nResponse:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Explicitly set pad_token_id to eos_token_id to remove the warning
    output = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Post-process to clean the output
    response = re.sub(r"(?s).*Response:", "", response).strip()  # Remove everything before and including "Response:"
    response = re.sub(r"\n+", " ", response).strip()  # Remove excessive newlines or extra spaces
    
    return response


Relevant Sections:
Punishment for murder.

Punishment for murder by life-convict.

death, or 1[imprisonment] for life, and shall also be liable to fine.
Punishment for
murder.


- Fix and Updates

In [8]:
warnings.simplefilter("ignore") # Ignore warnings

# Configure the quantization
bnb_config.bnb_4bit_compute_dtype = torch.float16  # Ensures computation matches input dtype

# Function to generate a response using the Mistral model
def generate_response(query):
    context = get_relevant_sections(query)  # Get the relevant context for the query
    # Construct the input text
    input_text = f"Context: {context}\nQuery: {query}\nResponse:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    
    # Generate the output from the model
    output = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)
    
    # Decode the output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Clean up the response:
    response = re.sub(r"(?s).*Response:", "", response).strip()
    response = re.sub(r"\n+", " ", response).strip()
    response = re.sub(r"(Context:.*|Query:.*)", "", response).strip()
    final_response = f"\nQuery: {query}\nResponse: {response}\n"

    return final_response

# Main Function

In [9]:
# Example query
query = "What is the punishment for murder?"
response = generate_response(query)
print(response)


Query: What is the punishment for murder?
Response: The punishment for murder is life-convict, death, or 1[imprisonment] for life, and shall also be liable to fine.



In [10]:
# Example query
query = "What is income tax law?"
response = generate_response(query)
print(response)


Query: What is income tax law?
Response: Income tax law is a set of rules and regulations that govern the collection of personal income tax. It is a tax levied on the income of individuals and businesses, and is used to fund government programs and services. The tax rate is typically based on a percentage of the taxpayer's income, and can vary depending on factors such as the taxpayer's age, income level, and tax bracket. Income tax laws can be complex and subject to change, and it is important for taxpayers to understand their obligations and seek professional advice if they have any questions or concerns.



In [11]:
# Example query
query = "What is the punishment for theft?"
response = generate_response(query)
print(response)


Query: What is the punishment for theft?
Response: Punishment for theft is imprisonment of either description for a term which may extent to two years, or with fine, or with both.

