In [18]:
pip install transformers faiss-cpu sentence-transformers torch




In [4]:
import json

# Load the datasets
with open("/content/constitution_qa.json") as f:
    constitution_data = json.load(f)

with open("/content/crpc_qa.json") as f:
    crpc_data = json.load(f)

with open("/content/ipc_qa.json") as f:
    ipc_data = json.load(f)

# Combine datasets into one list
combined_data = constitution_data + crpc_data + ipc_data

# Check the structure of a few entries
print(combined_data[:5])


[{'question': 'What is India according to the Union and its Territory?', 'answer': 'India, that is Bharat, shall be a Union of States.'}, {'question': 'How is India, that is Bharat, defined in terms of its political structure?', 'answer': 'India, that is Bharat, is defined as a Union of States according to the Union and its Territory.'}, {'question': 'What does the territory of India comprise of?', 'answer': 'The territory of India shall comprise the territories of the States, the Union territories specified in the First Schedule, and such other territories as may be acquired.'}, {'question': 'What does the territory of a country, such as India, comprise of, according to their constitutional provisions?', 'answer': 'The territory of a country like India comprises the territories of the States, the Union territories specified in the First Schedule, and such other territories as may be acquired.'}, {'question': 'Who has the authority to admit or establish new States into the Union?', 'an

In [5]:
def standardize_text(text):
    text = text.lower()  # Convert to lowercase
    text = " ".join(text.split())  # Remove extra whitespace
    return text

# Apply standardization to each question and answer
for entry in combined_data:
    entry["question"] = standardize_text(entry["question"])
    entry["answer"] = standardize_text(entry["answer"])


In [6]:
unique_entries = {}
for entry in combined_data:
    question = entry["question"]
    # If question is not in dictionary, add it
    if question not in unique_entries:
        unique_entries[question] = entry

# Convert the dictionary back to a list
cleaned_data = list(unique_entries.values())

print("Original data length:", len(combined_data))
print("Cleaned data length:", len(cleaned_data))


Original data length: 14543
Cleaned data length: 14453


In [7]:
with open("/content/cleaned_legal_data.json", "w") as f:
    json.dump(cleaned_data, f, indent=4)

print("Cleaned dataset saved as cleaned_legal_data.json")


Cleaned dataset saved as cleaned_legal_data.json


In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np

embedding_model = SentenceTransformer('all-mpnet-base-v2')


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
texts = [entry["question"] + " " + entry["answer"] for entry in cleaned_data]
embeddings = embedding_model.encode(texts, convert_to_tensor=True)

# Save embeddings and other data for retrieval
np.save("legal_embeddings.npy", embeddings.cpu().numpy())
ids = [str(i) for i in range(len(cleaned_data))]


In [10]:
import faiss

# Define the dimension of the embeddings
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings.cpu().numpy())


In [11]:
def retrieve_top_k(query, k=5):
    query_embedding = embedding_model.encode(query, convert_to_tensor=True).cpu().numpy()
    distances, indices = index.search(query_embedding.reshape(1, -1), k)
    return [(cleaned_data[idx]["question"], cleaned_data[idx]["answer"]) for idx in indices[0]]

# Sample test
print(retrieve_top_k("What is the process for arrest as per CrPC?", k=3))


[('which section details the procedure to be followed when a private person makes an arrest?', 'section 43'), ('what should happen when any person is arrested?', 'when any person is arrested, he shall be examined by a medical officer in the service of central or state government.'), ('what is the procedure by magistrate before whom such person arrested is brought according to section 81?', 'procedure by magistrate before whom such person arrested is brought.')]


In [13]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load a smaller Flan-T5 model
model_name = "google/flan-t5-small"  # or "google/flan-t5-base" for a slightly larger model
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda")
tokenizer = T5Tokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
def generate_answer(question, k=5):
    # Step 1: Retrieve relevant context using FAISS
    context = retrieve_top_k(question, k)
    context_text = " ".join([q + " " + a for q, a in context])

    # Step 2: Prepare input for the T5 model
    input_text = f"question: {question} context: {context_text}"
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

    # Step 3: Generate the answer
    outputs = model.generate(inputs["input_ids"], max_length=200)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [15]:
question = "What are the rights of an arrested person under CrPC?"
print("Generated Answer:", generate_answer(question))


Generated Answer: the person should answer to a charge of an offence, or for the purpose of any proceedings against him


In [17]:
'''# Sample questions for testing the model

# Question on the Indian Constitution
question_1 = "What are the fundamental rights guaranteed by the Indian Constitution?"
print("Generated Answer for Question 1:", generate_answer(question_1))

# Question on CrPC (Code of Criminal Procedure)
question_2 = "What is the procedure for granting bail under the CrPC?"
print("Generated Answer for Question 2:", generate_answer(question_2))

# Question on IPC (Indian Penal Code)
question_3 = "What is the punishment for theft under the Indian Penal Code?"
print("Generated Answer for Question 3:", generate_answer(question_3))

# Question on legal definitions
question_4 = "How does the IPC define 'wrongful restraint'?"
print("Generated Answer for Question 4:", generate_answer(question_4))

# Question on judicial powers under CrPC
question_5 = "What are the powers of a magistrate under the CrPC?"
print("Generated Answer for Question 5:", generate_answer(question_5))

# Question on sedition under IPC
question_6 = "What does the Indian Penal Code say about sedition?"
print("Generated Answer for Question 6:", generate_answer(question_6))

# Question on fundamental duties
question_7 = "What are the fundamental duties of Indian citizens according to the Constitution?"
print("Generated Answer for Question 7:", generate_answer(question_7))

# Question on preventive detention
question_8 = "What provisions exist for preventive detention under the Indian Constitution?"
print("Generated Answer for Question 8:", generate_answer(question_8))

# Question on evidence collection
question_9 = "What are the rules regarding evidence collection under CrPC?"
print("Generated Answer for Question 9:", generate_answer(question_9))

# Question on legal immunity
question_10 = "Who has immunity from legal proceedings under the Indian Constitution?"
print("Generated Answer for Question 10:", generate_answer(question_10))

# Additional questions

# Question on the right to life
question_11 = "What is the significance of the right to life under Article 21 of the Constitution?"
print("Generated Answer for Question 11:", generate_answer(question_11))

# Question on right to information
question_12 = "What rights are provided under the Right to Information Act?"
print("Generated Answer for Question 12:", generate_answer(question_12))

# Question on public nuisance under IPC
question_13 = "How does the IPC define public nuisance?"
print("Generated Answer for Question 13:", generate_answer(question_13))

# Question on appeals in CrPC
question_14 = "What is the process for filing an appeal under the CrPC?"
print("Generated Answer for Question 14:", generate_answer(question_14))

# Question on dowry prohibition
question_15 = "What does the law say about dowry under the Dowry Prohibition Act?"
print("Generated Answer for Question 15:", generate_answer(question_15))

# Question on criminal conspiracy
question_16 = "How does the IPC define criminal conspiracy?"
print("Generated Answer for Question 16:", generate_answer(question_16))

# Question on custodial violence
question_17 = "What are the legal protections against custodial violence in India?"
print("Generated Answer for Question 17:", generate_answer(question_17))

# Question on anticipatory bail
question_18 = "What is anticipatory bail and how can it be obtained under CrPC?"
print("Generated Answer for Question 18:", generate_answer(question_18))

# Question on the legal definition of a contract
question_19 = "What constitutes a contract under the Indian Contract Act?"
print("Generated Answer for Question 19:", generate_answer(question_19))

# Question on contempt of court
question_20 = "What are the types of contempt of court recognized under Indian law?"
print("Generated Answer for Question 20:", generate_answer(question_20))

# Question on juvenile justice
question_21 = "What are the provisions for juvenile offenders under the Juvenile Justice Act?"
print("Generated Answer for Question 21:", generate_answer(question_21))

# Question on plea bargaining
question_22 = "What is the concept of plea bargaining under Indian criminal law?"
print("Generated Answer for Question 22:", generate_answer(question_22))

# Question on rights of women
question_23 = "What legal protections are provided to women against domestic violence?"
print("Generated Answer for Question 23:", generate_answer(question_23))

# Question on property rights
question_24 = "What are the property rights of women under the Hindu Succession Act?"
print("Generated Answer for Question 24:", generate_answer(question_24))

# Question on the right to education
question_25 = "What rights are guaranteed under the Right to Education Act?"
print("Generated Answer for Question 25:", generate_answer(question_25))'''


'# Sample questions for testing the model\n\n# Question on the Indian Constitution\nquestion_1 = "What are the fundamental rights guaranteed by the Indian Constitution?"\nprint("Generated Answer for Question 1:", generate_answer(question_1))\n\n# Question on CrPC (Code of Criminal Procedure)\nquestion_2 = "What is the procedure for granting bail under the CrPC?"\nprint("Generated Answer for Question 2:", generate_answer(question_2))\n\n# Question on IPC (Indian Penal Code)\nquestion_3 = "What is the punishment for theft under the Indian Penal Code?"\nprint("Generated Answer for Question 3:", generate_answer(question_3))\n\n# Question on legal definitions\nquestion_4 = "How does the IPC define \'wrongful restraint\'?"\nprint("Generated Answer for Question 4:", generate_answer(question_4))\n\n# Question on judicial powers under CrPC\nquestion_5 = "What are the powers of a magistrate under the CrPC?"\nprint("Generated Answer for Question 5:", generate_answer(question_5))\n\n# Question on 