In [1]:
# ! pip install spacy
# ! python -m spacy download en_core_web_sm


In [2]:
import spacy
import os
import json

# Load the spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Define the file path
file_path = "../data/final_clean.json"
output_path = "../data/final_clean_with_keywords.json"

# Open the JSON file
try:
    with open(file_path, "r") as f:
        data = json.load(f)
except FileNotFoundError:
    print(f"File {file_path} not found.")
    exit()
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    exit()

def find_key_phrases(text):
    """Process text and extract legal key phrases."""
    # Process the text with spaCy
    doc = nlp(text)

    # Define key legal terms to search for
    key_phrases = ["amendment", "article", "act", "law", "constitution"]

    # Extract key phrases based on predefined terms and entity recognition
    extracted_phrases = [token.text for token in doc if token.text in key_phrases or token.ent_type_ in ['LAW', 'ORG']]
    
    return extracted_phrases

# Process each page and add keywords
for page in data.get("pages", []):
    text = page.get("cleaned_text", "")
    if text:
        keywords = find_key_phrases(text)
        page["keywords"] = keywords

# Save the updated JSON data with keywords
with open(output_path, "w") as outfile:
    json.dump(data, outfile, indent=4)

print(f"Updated JSON file saved to {output_path}")


Updated JSON file saved to ../data/final_clean_with_keywords.json


In [3]:
# ! pip install tensorflow torch transformers tf-keras numpy


In [4]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertModel.from_pretrained("bert-base-uncased")






Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [5]:
def encode_text(text):
    inputs = tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy()  # Convert to numpy array


In [6]:
import json
import numpy as np

# Load your data with keywords
with open("../data/final_clean_with_keywords.json", "r") as f:
    data = json.load(f)

# Encode the text for each page
for page in data["pages"]:
    text = page.get("cleaned_text", "")
    if text:
        page["embedding"] = encode_text(text).tolist()  # Convert numpy array to list for JSON serialization

# Save the updated data with embeddings
with open("../data/with_embeddings.json", "w") as f:
    json.dump(data, f)


In [7]:
# ! pip install scikit-learn

In [8]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def search(query_text, embeddings):
    query_embedding = encode_text(query_text)  # Create embedding for the query
    # Ensure the query_embedding is 2D
    if query_embedding.ndim == 1:
        query_embedding = query_embedding.reshape(1, -1)  # Reshape to 2D
    similarities = cosine_similarity(query_embedding, embeddings)  # Compute cosine similarity
    similar_indices = np.argsort(similarities[0])[::-1]  # Get indices of most similar pages
    return similar_indices


In [9]:
cleaned_text_list = []
embeddings_list = []

for page in data['pages']:
    cleaned_text_list.append(page['cleaned_text'])
    embeddings_list.append(np.array(page['embedding']))
    
embeddings_list = np.squeeze(embeddings_list)  # Remove single-dimensional entries


In [10]:
import re

def extract_relevant_sections(text, keywords):
    relevant_sections = []
    for line in text.splitlines():
        if any(keyword in line.lower() for keyword in keywords):
            relevant_sections.append(line)
    return "\n".join(relevant_sections)

# Example usage
# legal_text = """
# Your legal text here...
# """
keywords = ["crime"]
for page in data["pages"]:
    
    
    relevant_text = extract_relevant_sections(page["cleaned_text"],keywords)
    print(relevant_text)





























































































































































































187 constitution india part provided election commissioner regional commissioner shall removed office except recommendation chief election commissioner 6 president governor 1 state shall requested election commission make available election commission regional commissioner staff may necessary discharge function conferred election commission clause 1 325 person ineligible inclusion claim included special electoral roll ground religion race caste shall one general electoral roll every territorial constituency election either house parliament house either house legislature state person shall ineligible inclusion roll claim included special electoral roll constituency ground religion race caste sex 326 election house people legislative assembly state basis adult election house people legislative assembly eve

In [11]:
query = "what is law about criminals"  # Replace with your query
relevant_indices = search(query, embeddings_list)
releavant_indices=relevant_indices[:1]
# Display the results
print("Relevant pages for your query:")
for index in relevant_indices[:1]:  # Display top 5 results
    print(f"Page {index}: {cleaned_text_list[index]}")


Relevant pages for your query:
Page 109: part vii state part b first schedule omitted constitution seventh amendment act 1956 29 sch 111


In [12]:
# ! pip install transformers


In [13]:
from transformers import pipeline

# Load the summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


In [14]:
# Function to summarize text
def summarize_text(text, max_length=20, min_length=10):
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']

# Summarize the relevant pages
summaries = {}
for index in relevant_indices:  # Iterate over relevant indices
    text_to_summarize = cleaned_text_list[index]  # Get the text for the relevant page
    summaries[index] = summarize_text(text_to_summarize)

# Display the summaries
for page_index, summary in summaries.items():
    print(f"Summary for Page {page_index}: {summary}\n")


Your max_length is set to 20, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


KeyboardInterrupt: 

In [15]:
# ! pip install --upgrade jupyter ipywidgets


Collecting jupyter
  Using cached jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting ipywidgets
  Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting notebook (from jupyter)
  Using cached notebook-7.2.2-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Using cached jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Using cached nbconvert-7.16.4-py3-none-any.whl.metadata (8.5 kB)
Collecting jupyterlab (from jupyter)
  Using cached jupyterlab-4.2.5-py3-none-any.whl.metadata (16 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter)
  Using cached async_lru-2.0.4-py3-none-any.whl.metadata (4.5 kB)
Collecting httpx>=0.25.0 (from jupyterlab->jupyter)
  Using cached httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jupyter-lsp>=2.0.0 (from jupyterlab->jupyter)
  Using cached jupyter_lsp-2.2.5-py3-none-any.whl.metadata (1.8 kB)
Collecting jupyter-server<3,>=2.4.0 (from ju