Extract Text from Legal Documents

In [24]:
pip install transformers langchain faiss-cpu sentence_transformers tf-keras tensorflow gradio



In [3]:
!pip install sentence-transformers



In [25]:
import os
import mimetypes
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Function to convert files to .txt
def convert_files_to_txt(src_dir, dst_dir):
    """Convert text-compatible files from the source directory to .txt while preserving structure."""

    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    for root, dirs, files in os.walk(src_dir):
        for file in files:
            file_path = os.path.join(root, file)

            # Skip binary files by MIME type
            mime_type, _ = mimetypes.guess_type(file_path)
            if mime_type and (mime_type.startswith('image') or mime_type.startswith('audio') or mime_type.startswith('video')):
                continue

            rel_path = os.path.relpath(file_path, src_dir)
            new_root = os.path.join(dst_dir, os.path.dirname(rel_path))
            os.makedirs(new_root, exist_ok=True)

            try:
                if file.endswith('.ipynb'):
                    # Handle Jupyter notebook files
                    with open(file_path, 'r', encoding='utf-8') as f:
                        notebook_data = json.load(f)
                    content = ""
                    for cell in notebook_data.get('cells', []):
                        if cell.get('cell_type') in ['code', 'markdown']:
                            content += "\n".join(cell.get('source', [])) + "\n"
                else:
                    # Handle other text-based files
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()

                # Write the content to a new .txt file
                new_file_path = os.path.join(new_root, os.path.splitext(file)[0] + '.txt')
                with open(new_file_path, 'w', encoding='utf-8') as f:
                    f.write(content)

            except (UnicodeDecodeError, json.JSONDecodeError, OSError) as e:
                print(f"Could not process {file_path}: {e}")

# Clone the repository
os.system("git clone https://github.com/Soutrik05/juris_ai.git")

# Define source and destination directories
src_directory = 'juris_ai'
dst_directory = 'juris_ai_txt'

# Convert files to text
convert_files_to_txt(src_directory, dst_directory)

print(f"Files from '{src_directory}' have been converted and saved in '{dst_directory}'.")


Could not process juris_ai/legal_index.faiss: 'utf-8' codec can't decode byte 0x80 in position 4: invalid start byte
Could not process juris_ai/.git/index: 'utf-8' codec can't decode byte 0xa5 in position 13: invalid start byte
Could not process juris_ai/.git/objects/pack/pack-7e79ca8d9f983c1b0b828c2f6b3699d17528ed6b.pack: 'utf-8' codec can't decode byte 0x9a in position 12: invalid start byte
Could not process juris_ai/.git/objects/pack/pack-7e79ca8d9f983c1b0b828c2f6b3699d17528ed6b.idx: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Files from 'juris_ai' have been converted and saved in 'juris_ai_txt'.


Split Text into Chunks

In [26]:
def read_corpus(directory):
    corpus = ""
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, "r", encoding="utf-8") as f:
                    corpus += f.read() + "\n"
    return corpus

corpus_text = read_corpus(dst_directory)

# Save merged corpus
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write(corpus_text)

print("Corpus has been saved as 'corpus.txt'.")

Corpus has been saved as 'corpus.txt'.


In [6]:
def split_into_chunks(text, chunk_size=500, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

chunks = split_into_chunks(corpus_text)

# Save chunks to a file
with open("chunks.txt", "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(chunk + "\n")

print("Text has been split into chunks and saved.")


Text has been split into chunks and saved.


Create a FAISS Index

In [7]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def create_embeddings(text_chunks):
    embeddings = embedding_model.encode(text_chunks)
    return embeddings

embeddings = create_embeddings(chunks)

# Create a FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Save FAISS index
faiss.write_index(index, "index.faiss")

print("FAISS index created and saved.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


FAISS index created and saved.


Retrieve Relevant Chunks

In [8]:
import numpy as np

def retrieve_top_chunks(query, index, chunks, k=3):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), k)
    results = [chunks[i] for i in indices[0]]
    return results

# Load the FAISS index
index = faiss.read_index("index.faiss")

# Test retrieval
query = "How does the process_query function work in this code?"
top_chunks = retrieve_top_chunks(query, index, chunks)
print("\n".join(top_chunks))


ry_embedding), k)
    results = [chunks[i] for i in indices[0]]
    return results

def generate_answer(query, context):
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    response = generator(prompt, max_length=200, num_return_sequences=1)
    return response[0]["generated_text"]

def process_query(query, index, chunks):
    try:
        # Retrieve top-k chunks
        top_chunks = retrieve_top_chunks(query, index, chunks, k=5)
        context = "\n\n".join(top_chunks)
     
5)
        context = "\n\n".join(top_chunks)
        # Generate answer
        answer = generate_answer(query, context)
        return answer,top_chunks
    except Exception as e:
        return "Error processing query.", str(e)

def main():
    # Check if the necessary files exist
    files_exist, index, chunks = load_files()

    if not files_exist:
        st.error("Required files (legal_index.faiss and chunks.txt) are missing. Please upload or generate them first.")
        return

    # In

Text Generation Using Falcon

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def generate_answer(query, context):
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    response = generator(prompt, max_length=200, num_return_sequences=1)
    return response[0]["generated_text"]

# Combine retrieval and generation
def retrieve_and_generate_answer(query, index, chunks, k=5):
    top_chunks = retrieve_top_chunks(query, index, chunks, k)
    context = "\n\n".join(top_chunks)
    answer = generate_answer(query, context)
    return answer, top_chunks

Device set to use cuda:0


Combine Retrieval and Generation

In [23]:
import gradio as gr

# Intent-based responses
def handle_intents(query):
    query_lower = query.lower()

    # Greetings and friendly conversations
    if "hi" in query_lower or "hello" in query_lower:
        return "Hello! How can I assist you today?"
    elif "how are you" in query_lower:
        return "I'm just a bot, but thanks for asking! I'm here to help you with code-related queries."
    elif "can you answer some questions" in query_lower or "can you help with code" in query_lower:
        return "Yes, tell me! I'll do my best to assist you with your code-related questions."
    elif "who are you" in query_lower or "what are you" in query_lower:
        return "I'm a codebase Q&A chatbot, designed to help you find specific functionalities or insights from your code."
    elif "what can you do" in query_lower:
        return "I can assist you in searching and understanding your codebase by answering specific questions about functionalities."
    elif "thank you" in query_lower or "thanks" in query_lower:
        return "You're welcome! Let me know if you need anything else."
    elif "bye" in query_lower or "goodbye" in query_lower:
        return "Goodbye! Have a great day!"

    # Default None for unknown intents
    return None

# Function to process the query and update the chat
def process_query(history, query):
    try:
        # Check for conversational intents
        intent_response = handle_intents(query)
        if intent_response:
            history.append(("User: " + query, "AI: " + intent_response))
            return history, ""

        # Default code query handling
        answer, context_chunks = retrieve_and_generate_answer(query, index, chunks)

        # Formatting retrieved chunks
        formatted_chunks = "\n🔍 Related Chunks:\n" + "\n".join(context_chunks) if context_chunks else ""

        # Append both the answer and relevant chunks to the conversation
        history.append(("User: " + query, f"AI: {answer}{formatted_chunks}"))
        return history, ""
    except Exception as e:
        history.append(("User: " + query, "AI: Oops! Something went wrong while processing your query."))
        return history, str(e)

# Welcome page with chat transition
def welcome_page():
    with gr.Blocks() as demo:
        demo.css = """
            .container {
                background-color: #6a4c93;
                color: white;
                font-family: 'Arial', sans-serif;
            }
            .title {
                font-size: 36px;
                font-weight: bold;
                color: white;
            }
            .subtitle {
                font-size: 18px;
                margin-top: 10px;
                color: #e0e0e0;
            }
        """

        # Title and Subtitle
        gr.Markdown("<div class='title'>Welcome to CODEBASE QnA</div>")
        gr.Markdown("<div class='subtitle'>A codebase search Q&A system powered by AI.</div>")

        # Chat UI with history and input box
        chatbot = gr.Chatbot(label="Codebase Q&A Chatbot")
        user_input = gr.Textbox(
            label="Type your question here...",
            placeholder="Enter your question and press Enter",
            show_label=False
        )

        # Chat state for conversation history
        state = gr.State([])

        # Bind input submission to query processing
        user_input.submit(process_query, inputs=[state, user_input], outputs=[chatbot, user_input])

    return demo

# Run the App
app = welcome_page()
app.launch(debug=True)




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ea29474ecd62184e0f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://ea29474ecd62184e0f.gradio.live




Create a Flask API