In [24]:
import os
from langchain.schema import Document  # Import Document from langchain.schema
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
def load_text_documents(directory_path):
    text_documents = []

    # Iterate through the files in the specified directory
    for filename in os.listdir(directory_path):
        # Check if the file has a .txt extension
        if filename.endswith(".txt"):
            file_path = os.path.join(directory_path, filename)
            
            # Read the content of the text file
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                
                # Create a Document object and append it to the list
                document = Document(page_content=content, metadata={"source": file_path})
                text_documents.append(document)

    return text_documents

In [20]:
# Function to split the documents using RecursiveCharacterTextSplitter
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True
    )
    
    # Split the documents into chunks
    chunks = text_splitter.split_documents(documents)
    return chunks

In [21]:

directory_path = "documents"  # Replace with your directory path
documents = load_text_documents(directory_path)

# Split documents into chunks
chunks = split_documents(documents)

In [22]:
chunks

[Document(metadata={'source': 'documents\\textbook1.txt', 'start_index': 0}, page_content='i JURISPRUDENCE INTERPRETATION  GENERAL LAWS GROUP 1 PAPER 1EXECUTIVE PROGRAMMESTUDY MATERIALii  THE INSTITUTE OF COMPANY SECRETARIES OF INDIA Timing of Headquarters  Monday to Friday Office Timings  900 AM to 530 PM Public Dealing Timings  Without financial transactions  930 AM to 500 PM With financial transactions  930 AM to 400 PM Phones  01145341000  01204522000 Website   Email     For Academic Updates please visit   For any suggestionsclarifications students may write to  Disclaimer Although due care and diligence have been taken in preparation of this Study Material the Institute shall not be responsible for any loss or damage resulting from any action taken on the basis of the contents of this Study Material Anyone wishing to act on the basis of the material contained herein should do so after cross checking with the original source Laser Typesetting by  AArushi Graphics Prashant Vihar New

In [23]:
len(chunks)

12875

In [29]:
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma


embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create the Chroma database
db = Chroma.from_documents(
    chunks, embeddings, persist_directory="chroma"
)

In [None]:
db.persist()

In [30]:
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma(persist_directory="chroma", embedding_function=embedding_function)

  db = Chroma(persist_directory="chroma", embedding_function=embedding_function)


In [64]:
query = "Article 16 qualifies equality of opportunity in matters of public employment. However there are certain exceptions to it. Discuss" 
results = db.similarity_search_with_relevance_scores(query, k=3)

In [36]:
results

[(Document(metadata={'source': 'documents\\textbook1.txt', 'start_index': 68053}, page_content='with various aspects are well coordinated If a law does not cover a specific situation that it might have wanted to cover while being enacted it is incomplete in design According to Bentham the unity of a law would depend upon the unity of the species of the act which is the object of the lawEPJIGL Sources of Law6Criticism of Benthams theory of Law 1 Due to Benthams straitjacketing of laws into an imperative theory all laws have to be either command or permission it does not take proper account of laws conferring power like the power to make contracts create title etc 2 Bentham did not give a fair treatment to custom as a source of law He said customs could never be complete 3 Benthams theory did not allow for judge made laws and hoped that such laws would be gradually eliminated by having complete laws 4 To judge an action according to the pleasure pain criterion is to judge it subjectively

In [38]:
context_text = " ".join([doc.page_content for doc, _score in results])

In [39]:
context_text

'with various aspects are well coordinated If a law does not cover a specific situation that it might have wanted to cover while being enacted it is incomplete in design According to Bentham the unity of a law would depend upon the unity of the species of the act which is the object of the lawEPJIGL Sources of Law6Criticism of Benthams theory of Law 1 Due to Benthams straitjacketing of laws into an imperative theory all laws have to be either command or permission it does not take proper account of laws conferring power like the power to make contracts create title etc 2 Bentham did not give a fair treatment to custom as a source of law He said customs could never be complete 3 Benthams theory did not allow for judge made laws and hoped that such laws would be gradually eliminated by having complete laws 4 To judge an action according to the pleasure pain criterion is to judge it subjectively The theory did not provide how a subjective criterion of pain and pleasure can be transmuted t

In [74]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load Meta LLaMA model and tokenizer
llm_name = "meta-llama/Llama-3.2-1B-Instruct"  # Replace with the actual path or model name
tokenizer = AutoTokenizer.from_pretrained(llm_name)
model = AutoModelForCausalLM.from_pretrained(llm_name)

device = torch.device("cpu")
model = model.to(device)

# Function to generate an answer
def generate_answer(query, context, model, tokenizer, device):
    # Format the input
    input_text = f'''{context} 
    You are study companion. Answer this Question with bullet points based on above context make a good explanation to your answer: {query}'''

    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to account for the new token
    #Tokenize input with attention mask and padding
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,  # Ensures inputs are padded
        truncation=True,
        max_length=512
    ).to(device)
    
    # Generate output with attention mask and pad_token_id
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=1024,
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id  # Ensure proper handling of padding
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [75]:
def handle_query(query, device):
    # Retrieve relevant chunks
    relevant_chunks = db.similarity_search_with_relevance_scores(query, k=3)
    context_text = " ".join([doc.page_content for doc, _score in relevant_chunks])
    
    # Generate an answer
    answer = generate_answer(query, context_text, model, tokenizer, device)
    return answer

In [78]:
res = handle_query(query, "cpu")

In [73]:
torch.cuda.empty_cache()

In [67]:
tokenizer.save_pretrained("./model")

('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\tokenizer.json')

In [81]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
org= '''Jeremy Bentham was the pioneer of analytical jurisprudence in Britain. According to him ‘a law’ may be defined as an assemblage of signs, declarative of volition, conceived or adopted by a sovereign in a state, concerning the conduct to be observed in a certain case by a certain person or a class of persons, who in the case in question are or are supposed to be subject to his power. Thus, Bentham’s concept of law is an imperative one.
Bentham said that every law may be considered in eight different respects:
1. Source: The source of a law is the will of the sovereign, who may conceive laws which he personally issues, or adopt laws previously issued by sovereigns or subordinate authorities, or he may adopt laws to be issued in future by subordinate authorities. Sovereign according to Bentham is any person or assemblage of person to whose will a whole political community is supposed to be in a disposition to pay obedience, and then in preference to the will of any other person.
2. Subjects: These may be persons or things. Each of these may be active or passive subjects, i.e., the agent with which an act commences or terminates.
3. Objects: The goals of a given law are its objects.
4. Extent: Direct extent means that a law covers a portion of land on which acts have their termination; indirect extent refers to the relation of an actor to a thing.
5. Aspects: Every law has ‘directive’ and a ‘sanctional’ part. The former concerns the aspects of the sovereign will towards an act-situation and the latter concerns the force of a law. The four aspects of the 5. Aspects: Every law has ‘directive’ and a ‘sanctional’ part. The former concerns the aspects of the sovereign will towards an act-situation and the latter concerns the force of a law. The four aspects of the sovereign will are command, prohibition, non-prohibition and non-command and the whole range of laws are covered under it. These four aspects are related to each other by opposition and concomitancy.
6. Force: The motivation to obey a law is generated by the force behind the law.
7. Remedial appendage: These are a set of subsidiary laws addressed to the judges through which the judges cure the evil (compensation), stop the evil or prevent future evil.
8. Expression: A law, in the ultimate, is an expression of a sovereign’s will. The connection with will raises the problem of discovering the will from the expression.
'''

# Calculate ROUGE scores
scores = scorer.score(org, res)

In [82]:
scores

{'rouge1': Score(precision=0.23231256599788808, recall=0.5176470588235295, fmeasure=0.32069970845481055),
 'rouge2': Score(precision=0.04439746300211417, recall=0.09905660377358491, fmeasure=0.061313868613138686),
 'rougeL': Score(precision=0.11510031678986272, recall=0.2564705882352941, fmeasure=0.1588921282798834)}