In [23]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tiktoken
from tqdm import tqdm
import time
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import ollama
from langchain_ollama import OllamaLLM
import ast
import pickle

In [3]:
%run ../utils/fine_tuning_util.ipynb
%run ../utils/save_and_load_util.ipynb

In [41]:
# Load the CSV file
df = pd.read_csv("eda_law_cases.csv")

### Chunking Text for Training & Retrieval

In [42]:
# Define a tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")  # Using OpenAI's tokenizer
max_chunk_size = 512  # Choose chunk size based on LLaMA's context window

In [45]:
# Apply chunking with a single progress bar
chunked_data = []
total_files = len(df)

start_time = time.time()

# Using tqdm to show progress bar for files
with tqdm(total=total_files, desc="Chunking Progress", unit="file") as pbar:
    for _, row in df.iterrows():
        file_name = row["file_name"]
        
        # Ensure 'text' is a valid string
        text = row["text"]
        if isinstance(text, str):  # Proceed only if text is a string
            chunks = chunk_text(text)
            for i, chunk in enumerate(chunks):
                chunked_data.append({"file_name": file_name, "chunk_id": i, "text": chunk})
        else:
            print(f"Skipping file {file_name} due to invalid text.")
        
        # Update the progress bar after each file
        pbar.update(1)
    
    # Calculate time left and show it dynamically
    elapsed_time = time.time() - start_time
    remaining_time = (elapsed_time / pbar.n) * (total_files - pbar.n)
    pbar.set_postfix(remaining=f"{remaining_time:.2f}s")

Chunking Progress:  96%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ| 1833/1910 [00:48<00:01, 51.40file/s]

Skipping file sc_hc_la_89_2022.pdf due to invalid text.


Chunking Progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1910/1910 [00:50<00:00, 37.70file/s, remaining=0.00s]


In [46]:
# Convert to DataFrame
chunked_df = pd.DataFrame(chunked_data)

In [47]:
# Save the chunked data
chunked_df.to_csv("chunked_law_cases.csv", index=False)
end_time = time.time()

### Embedding Generation

In [3]:
chunked_df = pd.read_csv("chunked_law_cases.csv")

In [4]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Create FAISS index with correct dimensionality
d = 384  # Match the SentenceTransformer output
index = faiss.IndexFlatL2(d)  # L2 distance for similarity

In [6]:
# Apply embedding generation with progress bar
chunked_data_with_embeddings = []
total_chunks = len(chunked_df)

start_time = time.time()

# Using tqdm for progress bar
with tqdm(total=total_chunks, desc="Generating Embeddings", unit="chunk") as pbar:
    batch_size = 32  # Adjust the batch size based on available memory

    # Process in batches for better performance
    for i in range(0, total_chunks, batch_size):
        batch_texts = chunked_df["text"].iloc[i:i + batch_size].tolist()
        embeddings = generate_embeddings(batch_texts)

        # Add embeddings to the dataframe
        for j, emb in enumerate(embeddings):
            chunked_data_with_embeddings.append({
                "file_name": chunked_df["file_name"].iloc[i + j],
                "chunk_id": chunked_df["chunk_id"].iloc[i + j],
                "text": chunked_df["text"].iloc[i + j],
                "embedding": emb.cpu().numpy()  # Convert tensor to numpy array
            })
        
        # Update progress bar
        pbar.update(batch_size)

Generating Embeddings:   3%|â–Ž         | 896/30646 [01:20<44:51, 11.05chunk/s] 

In [7]:
# Convert the chunked data with embeddings to a DataFrame
embedding_df = pd.DataFrame(chunked_data_with_embeddings)

In [8]:
# Save the embeddings to disk
embedding_df.to_csv("law_cases_with_embeddings.csv", index=False)

### Creating FAISS Index

In [4]:
# Load the chunked data with embeddings
embedding_df = pd.read_csv("law_cases_with_embeddings.csv")

In [16]:
# Convert string embeddings back to NumPy arrays
def parse_embedding(embedding_str):
    try:
        return np.array(ast.literal_eval(embedding_str), dtype=np.float32)  # Ensures float32 format
    except:
        return np.zeros(384, dtype=np.float32)  # Use zero vector for any malformed data

In [17]:
embedding_df["embedding"] = embedding_df["embedding"].apply(parse_embedding)

In [19]:
# Convert embeddings to a 2D NumPy array
embeddings_matrix = np.vstack(embedding_df["embedding"].values)

In [20]:
# Define the FAISS index (L2 Normalized for cosine similarity search)
embedding_dim = embeddings_matrix.shape[1]  # Get the embedding dimension
index = faiss.IndexFlatL2(embedding_dim)

In [21]:
# Add embeddings to FAISS index
index.add(embeddings_matrix)

In [22]:
# Save FAISS index
faiss.write_index(index, "law_cases_index.index")

In [25]:
# Save metadata (file names & chunk IDs) for retrieval
metadata = embedding_df[["file_name", "chunk_id", "text"]].to_dict(orient="records")
with open("faiss_metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)

In [27]:
# Load FAISS index
index = faiss.read_index("law_cases_index.index")

In [28]:
# Load metadata (file names & text chunks)
with open("faiss_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

In [29]:
# Load the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Same model used before

In [30]:
llm = OllamaLLM(model="llama3.1")

In [31]:
def retrieve_relevant_cases(query, top_k=5):
    """
    Given a legal query, find the most relevant law case chunks using FAISS.
    """
    # Convert query to embedding
    query_embedding = embedding_model.encode([query])  # Convert query to embedding
    query_embedding = np.array(query_embedding, dtype=np.float32)  # Ensure float32 format

    # Search FAISS index for closest matches
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the top matching cases
    retrieved_cases = []
    for i in range(top_k):
        if indices[0][i] < len(metadata):  # Ensure index is within bounds
            retrieved_cases.append(metadata[indices[0][i]])

    return retrieved_cases

In [32]:
def generate_response_with_llama(query):
    """
    Given a legal query, retrieve relevant law cases and generate a response using Llama 3.1.
    """
    # Retrieve relevant legal cases
    relevant_cases = retrieve_relevant_cases(query)

    # Combine case texts
    case_texts = "\n\n".join([f"Case {i+1}: {case['text']}" for i, case in enumerate(relevant_cases)])

    # Construct Llama 3.1 prompt
    prompt = f"""
    You are a legal AI assistant. Answer the query based on the following legal cases:

    {case_texts}

    Query: {query}
    Answer:
    """

    # Get response from Llama 3.1
    response = llm.invoke(prompt)

    return response, relevant_cases

In [34]:
# Example usage
user_query = "What are the legal rights of a tenant in a lease agreement dispute?"
response, matched_cases = generate_response_with_llama(user_query)

In [35]:
# Print response
print("ðŸ”· AI Legal Assistant Response:\n")
print(response)

ðŸ”· AI Legal Assistant Response:

It appears that you have provided a lengthy document related to a court case or dispute between individuals, rather than asking about the legal rights of a tenant in a lease agreement dispute.

To provide an answer to your original question, here are some general legal rights that a tenant may have in a lease agreement dispute:

1. Right to quiet enjoyment: A tenant has the right to occupy and use the rented property without interference or disturbance from the landlord.
2. Right to notice of termination: A tenant is entitled to written notice before their tenancy can be terminated, unless the lease agreement specifies otherwise.
3. Right to security deposit refund: At the end of a tenancy, a tenant may be entitled to a refund of their security deposit, minus any damages or deductions allowed by law.
4. Right to inspection and repairs: A tenant has the right to inspect and report on any necessary repairs, and to request that they be made in a timely m

In [36]:
# Print matched case file names
print("\nðŸ”· Top Matching Case Files:")
for case in matched_cases:
    print(f"ðŸ“„ {case['file_name']}")


ðŸ”· Top Matching Case Files:
ðŸ“„ 012009.pdf
ðŸ“„ 012009.pdf
ðŸ“„ 012009.pdf
ðŸ“„ 012009.pdf
ðŸ“„ 012009.pdf
