In [1]:
# ! pip install spacy

In [18]:
import json
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load the cleaned JSON data
with open('../data/final_clean_with_keywords.json', 'r') as file:
    data = json.load(file)

# Extract cleaned text from the pages
documents = [page['original_text'] for page in data['pages']]

# User query
query = "woman rights"

# Encode the documents and query into sentence embeddings
doc_embeddings = model.encode(documents)
query_embedding = model.encode(query)

# Compute cosine similarity scores between the query and documents
cosine_similarities = util.cos_sim(query_embedding, doc_embeddings)

# Get indices of documents sorted by similarity score
sorted_indices = cosine_similarities.argsort(descending=True).tolist()[0]

# Set a similarity threshold
threshold = 0.45

# Print the top relevant documents
print("Top relevant pages:")
top_5_res=[]
for idx in sorted_indices[:5]:  # Get top 5 relevant documents
    if cosine_similarities[0][idx] >= threshold:
        print(f"Page Number: {data['pages'][idx]['page_number']}, Similarity: {cosine_similarities[0][idx]:.4f}")
        print(f"Cleaned Text: {documents[idx]}\n")
        top_5_res.append(documents[idx])
    else:
        break
print(top_5_res)

Top relevant pages:
Page Number: 221, Similarity: 0.4569
Cleaned Text: 190 THE CONSTITUTION OF INDIA
(Part XVI.—Special Provisions Relating to Certain Classes)
1[330A. Reservation of seats for women in the House of the People.-
(1)Seats shall be reserved for women in the House of the People.
(2) As nearly as may be, one-third of the total number of seats reserved
under clause (2) of article 330 shall be reserved for women belonging to the
Scheduled Castes or the Scheduled Tribes.
(3) As nearly as may be, one-third (including the number of seats
reserved for women belonging to the Scheduled Castes and the Scheduled
Tribes) of the total number of seats to be filled by direct election to the House
of the People shall be reserved for women.]
331. Representation of the Anglo-Indian Community in the House of
the People.—Notwithstanding anything in article 81, the President may, if he is
of opinion that the Anglo-Indian community is not adequately represented in
the House of the People, nomin

In [22]:
len(top_5_res)

1

In [4]:
# ! pip install tensorflow tensorflow-hub transformers
#! pip install transformers torch


In [21]:
import json
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

# Load a pre-trained summarization model and tokenizer from Hugging Face
model_name = "t5-small"  # You can choose a different model as well
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, from_pt=True)

# Function to summarize text
def summarize_text(text):
    try:
        inputs = tokenizer(text, return_tensors="tf", max_length=1024, truncation=True, padding="max_length")
        summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return "Summary generation failed."

# Function to clean and rephrase summaries for better understanding
def clean_summary(summary):
    # Remove unwanted tokens and phrases
    summary = summary.replace("<pad>", "").replace("<extra_id_0>", "").strip()
    
    # Simplifying language if necessary
    summary = summary.replace("shall", "must").replace("may", "can")
    
    return summary

# Function to format summary into meaningful sentences
def format_summary(summary):
    # Split into sentences and clean each sentence
    sentences = summary.split(". ")
    sentences = [sentence.strip() for sentence in sentences if sentence]

    # Join sentences with more context
    readable_sentences = []
    
    for sentence in sentences:
        # Add a period at the end if missing
        if not sentence.endswith('.'):
            sentence += '.'

        # Simplifying legal terms and enhancing clarity
        sentence = sentence.replace("must", "is required to").replace("can", "is allowed to")

        # Adding clarifying phrases for legal terms
        if "is required to" in sentence:
            sentence += " This indicates that compliance with this requirement is mandatory."
        elif "is allowed to" in sentence:
            sentence += " This implies that the action is permitted under the law."

        # Additional readability improvements
        sentence = sentence.replace("therefore", "as a result").replace("thus", "hence").replace("according to", "based on")

        # Adding explanations for complex terms
        if "agreement" in sentence:
            sentence += " This refers to a legally binding contract between parties."

        # Ensure all conjunctions are included and punctuated correctly
        sentence = sentence.replace(" and ", " and ").replace(" to ", " to ").replace(" be ", " be ")
        
        # Ensure proper punctuation
        if not any(punct in sentence for punct in ['.', '!', '?']):
            sentence += '.'

        # Strip any excess whitespace
        sentence = ' '.join(sentence.split())

        readable_sentences.append(sentence)
    
    # Join all readable sentences into a single string
    return " ".join(readable_sentences)

# Function to post-process summaries
def post_process_summary(summary):
    cleaned_summary = clean_summary(summary)
    formatted_summary = format_summary(cleaned_summary)
    
    return formatted_summary  # Return the final formatted summary

# Summarize and post-process each document
print("Summaries of top relevant pages:")
for res in top_5_res:
    summary = summarize_text(res)
    processed_summary = post_process_summary(summary)
    print(f"Summary:\n{processed_summary}\n")


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Summaries of top relevant pages:
Summary:
the Constitution (One Hundred and Sixth Amendment) Act, 1984, s. 3, for certain words (w.e.f. 16-6-1986). 3. The words and letters "specified in Part A or Part B of the First Schedule" omitted by the Constitution (Fifty-first Amendment) Act, 1956, s. 2 (w.e.f. 16-6-1986). 3. The words and letters "specified in Part A or Part B of the First Schedule" omitted by the Constitution (.

