In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

with open("./combined_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Print the length of the text to ensure it's sufficiently long
print(f"Length of the text: {len(text)}")

import re
text = re.sub(r'\S+@\S+', '', text)  # Remove anything in the form of an email

# 3. Remove Special Characters, Numbers, and Extra Spaces
text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only alphabetic characters and spaces
text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space and strip leading/trailing spaces
# Remove unnecessary characters like extra spaces, newlines, etc.
cleaned_text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces and newlines with a single space
cleaned_text = cleaned_text.strip()  # Remove leading and trailing spaces
print(cleaned_text[:500])  # Check the first 500 characters after cleaning
lowercased_text = cleaned_text.lower()

# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_text(lowercased_text)


print(f"Total chunks created: {len(chunks)}")  # Print the number of chunks


Length of the text: 6699779
i JURISPRUDENCE INTERPRETATION GENERAL LAWS GROUP PAPER EXECUTIVE PROGRAMMESTUDY MATERIALii THE INSTITUTE OF COMPANY SECRETARIES OF INDIA Timing of Headquarters Monday to Friday Office Timings AM to PM Public Dealing Timings Without financial transactions AM to PM With financial transactions AM to PM Phones Website wwwicsiedu Email For Academic Updates please visit httpswwwicsiedustudentsacademicportal For any suggestionsclarifications students may write to Disclaimer Although due care and dilig
Total chunks created: 15802


In [3]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.device_count())  # Number of GPUs available
print(torch.cuda.get_device_name(0))  # Name of the GPU


True
1
NVIDIA GeForce RTX 4060 Laptop GPU


In [4]:
chunks

['i jurisprudence interpretation general laws group paper executive programmestudy materialii the institute of company secretaries of india timing of headquarters monday to friday office timings am to pm public dealing timings without financial transactions am to pm with financial transactions am to pm phones website wwwicsiedu email for academic updates please visit httpswwwicsiedustudentsacademicportal for any suggestionsclarifications students may write to disclaimer although due care and',
 'for any suggestionsclarifications students may write to disclaimer although due care and diligence have been taken in preparation of this study material the institute shall not be responsible for any loss or damage resulting from any action taken on the basis of the contents of this study material anyone wishing to act on the basis of the material contained herein should do so after cross checking with the original source laser typesetting by aarushi graphics prashant vihar new delhiiii',
 'che

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load SentenceTransformer model for embedding generation
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# Generate embeddings for each chunk
embeddings = embedding_model.encode(chunks, show_progress_bar=True)
embeddings = np.array(embeddings)

# Save embeddings and chunks for later use
np.save("embeddings.npy", embeddings)
with open("chunks.txt", "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(chunk + "\n")

Batches: 100%|██████████| 494/494 [02:03<00:00,  3.99it/s]


In [9]:
import faiss

# Create a FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save the index
faiss.write_index(index, "faiss_index.bin")
print(f"Total indexed vectors: {index.ntotal}")


Total indexed vectors: 15802


In [3]:
def retrieve_relevant_chunks(query, index, embeddings, text_chunks, top_k=5):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), k=top_k)
    return [text_chunks[idx] for idx in indices[0]]

# Example usage
# query = "What is the role of AI in education?"
# relevant_chunks = chunks
# print(relevant_chunks)


In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load Meta LLaMA model and tokenizer
llm_name = "meta-llama/Llama-3.2-1B"  # Replace with the actual path or model name
tokenizer = AutoTokenizer.from_pretrained(llm_name)
model = AutoModelForCausalLM.from_pretrained(llm_name)

device = torch.device("cpu")
model = model.to(device)

# Function to generate an answer
def generate_answer(query, context, model, tokenizer, device):
    # Format the input
    input_text = f'''You are an expert assistant. Answer the question based on the retrieved documents below. If the answer cannot be found in the documents, say The answer is not available in the provided information.
    Documents: {context} 
    Question: {query}'''

    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to account for the new token
    #Tokenize input with attention mask and padding
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,  # Ensures inputs are padded
        truncation=True,
        max_length=512
    ).to(device)
    
    # Generate output with attention mask and pad_token_id
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=1024,
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id  # Ensure proper handling of padding
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
# query = "What are the key ideas from the book?"
# relevant_chunks = chunks # Replace with actual relevant chunks
# context = " ".join(relevant_chunks)

# response = generate_answer(query, context, model, tokenizer)
# print(response)


In [25]:
# Load pre-saved chunks and embeddings
import numpy as np
import faiss

with open("chunks.txt", "r", encoding="utf-8") as f:
    chunks = f.readlines()

embeddings = np.load("embeddings.npy")
index = faiss.read_index("faiss_index.bin")

# Function to handle query processing
def handle_query(query, device):
    # Retrieve relevant chunks
    relevant_chunks = retrieve_relevant_chunks(query, index, embeddings, chunks, top_k=5)
    context = "\n".join(relevant_chunks)
    
    # Generate an answer
    answer = generate_answer(query, context, model, tokenizer, device)
    return answer

# User interaction loop

user_query = input("Enter your question (or 'exit' to quit): ")
response = handle_query(user_query, device)
print(response)


You are an expert assistant. Answer the question based on the retrieved documents below. If the answer cannot be found in the documents, say The answer is not available in the provided information.
    Documents: 

injunctions and interlocutory orders detention preservation inspection etc of subjectmatter of suit institution of suit important stages in proceedings of a suit delivery of summons by court appeals reference review and revision suits by or against a corporation suits by or against minors and lunatics summary proceedingsprocedure saving of inherent powers of court powers of civil courts and their exercise by tribunals commercial courts act jurisdiction determination of specified value





ones opinion right to circulation and propagation of ones ideas freedom of peaceful demonstration dramatic performance and cinematography it may also include any other mode of expression of ones ideas the supreme court in cricket association of bengal v the secretary ministry of informatio

In [24]:
torch.cuda.empty_cache()
