In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import re

class LawQASystem:
    """Retrieve law sections that best match a natural-language query.

    The dataset is a CSV with (at least) the columns ``act_title``,
    ``section`` and ``law``.  Section text and act titles are embedded
    separately with the ``all-MiniLM-L6-v2`` sentence-transformer, and a
    query is ranked by a weighted blend of both cosine similarities.  When a
    query names a section number, the search is narrowed to rows carrying
    that number.
    """

    # Blend weights: section content counts for more than the act title.
    SECTION_WEIGHT = 0.7
    TITLE_WEIGHT = 0.3
    # Semantic hits at or below this blended score are not reported.
    MIN_SIMILARITY = 0.5

    # Bracketed numeric footnote markers such as "[12]".
    _FOOTNOTE_RE = re.compile(r'\[\d+\]')
    # Everything except word characters, whitespace and - . , ( )
    _DISALLOWED_RE = re.compile(r'[^\w\s\-.,()]')
    # "section 35" / "Section 498A"; the letter suffix is kept so that
    # "498A" is not silently truncated to "498".
    _SECTION_RE = re.compile(r'section\s+(\d+[A-Za-z]*)', re.IGNORECASE)

    def __init__(self, csv_path):
        """Load the dataset at ``csv_path`` and embed every row.

        Args:
            csv_path: Path to a CSV readable by ``pandas.read_csv``.
        """
        self.df = pd.read_csv(csv_path)

        # Cleaned copies are what gets embedded; the raw columns are what
        # gets shown to the user.
        self.df['cleaned_law'] = self.df['law'].apply(self.preprocess_text)
        self.df['cleaned_title'] = self.df['act_title'].apply(self.preprocess_text)

        self._build_section_index()

        self.model = SentenceTransformer('all-MiniLM-L6-v2')

        print("Generating embeddings... This may take a few minutes.")
        self.section_embeddings = self.model.encode(
            self.df['cleaned_law'].tolist(), show_progress_bar=True)
        self.title_embeddings = self.model.encode(
            self.df['cleaned_title'].tolist(), show_progress_bar=True)

    def _build_section_index(self):
        """Index ``self.df`` by the string form of its ``section`` column.

        ``section_map`` keeps one row per section string (the last row seen
        wins, as a plain dict would).  ``_section_rows`` keeps *every* row
        position per section string, because the same section number can
        appear under several acts.
        """
        self.section_map = {}
        self._section_rows = {}
        for position, (_, row) in enumerate(self.df.iterrows()):
            key = str(row['section'])
            self.section_map[key] = row
            self._section_rows.setdefault(key, []).append(position)

    def preprocess_text(self, text):
        """Return a normalised, lower-cased version of ``text``.

        Missing values become ``""``; non-string values are converted with
        ``str``.  Footnote markers are removed *before* other punctuation is
        stripped, otherwise the brackets are gone by the time the footnote
        pattern runs and the digits would be left behind.
        """
        if pd.isna(text):
            return ""

        text = self._FOOTNOTE_RE.sub('', str(text))
        text = self._DISALLOWED_RE.sub(' ', text.lower())

        # Collapse every run of whitespace into a single space.
        return ' '.join(text.split())

    def extract_section_number(self, query):
        """Return the section identifier named in ``query``, or ``None``.

        The identifier is returned as written in the query (for example
        ``"35"`` or ``"498A"``).
        """
        match = self._SECTION_RE.search(query)
        return match.group(1) if match else None

    def get_exact_section_match(self, section_number):
        """Return the row stored in ``section_map`` for this section, or ``None``.

        If several acts share the section number, only one of them is stored
        here; ``get_most_relevant_sections`` considers all of them.
        """
        return self.section_map.get(str(section_number))

    def calculate_combined_similarity(self, query_embedding, section_embeddings, title_embeddings):
        """Blend content and title cosine similarity for one query.

        Returns a 1-D array with one score per row of the given embeddings,
        weighted ``SECTION_WEIGHT`` for content and ``TITLE_WEIGHT`` for title.
        """
        section_scores = cosine_similarity(query_embedding, section_embeddings)[0]
        title_scores = cosine_similarity(query_embedding, title_embeddings)[0]
        return self.SECTION_WEIGHT * section_scores + self.TITLE_WEIGHT * title_scores

    def _format_result(self, position, similarity):
        """Build the result dict for the dataset row at ``position``."""
        row = self.df.iloc[int(position)]
        return {
            'act_title': row['act_title'],
            'section': row['section'],
            'law': row['law'],
            'similarity': float(similarity),
        }

    def get_most_relevant_sections(self, query, top_k=3):
        """Return up to ``top_k`` result dicts for ``query``, best first.

        Each dict has the keys ``act_title``, ``section``, ``law`` and
        ``similarity``.

        * If the query names a section that exists in exactly one row, that
          row is returned with similarity ``1.0``.
        * If several rows share that section number, those rows are ranked
          by blended similarity so the act mentioned in the query wins.
        * Otherwise all rows are ranked and only scores above
          ``MIN_SIMILARITY`` are kept.
        """
        # A slice of [-0:] would return everything, so handle it explicitly.
        if top_k <= 0:
            return []

        section_number = self.extract_section_number(query)
        candidates = self._section_rows.get(section_number, []) if section_number else []
        if len(candidates) == 1:
            return [self._format_result(candidates[0], 1.0)]

        query_embedding = self.model.encode([self.preprocess_text(query)])

        if candidates:
            # The user asked for this section explicitly, so no score
            # threshold is applied; the score only orders the candidates.
            positions = np.asarray(candidates)
            scores = self.calculate_combined_similarity(
                query_embedding,
                self.section_embeddings[positions],
                self.title_embeddings[positions],
            )
            best = np.argsort(scores)[::-1][:top_k]
            return [self._format_result(positions[i], scores[i]) for i in best]

        scores = self.calculate_combined_similarity(
            query_embedding, self.section_embeddings, self.title_embeddings)
        best = np.argsort(scores)[::-1][:top_k]
        return [
            self._format_result(i, scores[i])
            for i in best
            if scores[i] > self.MIN_SIMILARITY
        ]

    def answer_query(self, query):
        """Return a printable answer listing the most relevant sections."""
        relevant_sections = self.get_most_relevant_sections(query)

        if not relevant_sections:
            return "I could not find any relevant sections matching your query. Please try rephrasing your question."

        parts = ["Based on your query, here are the most relevant sections from the law:\n\n"]
        for i, section in enumerate(relevant_sections, 1):
            parts.append(
                f"Result {i} (Similarity: {section['similarity']:.2f}):\n"
                f"Act: {section['act_title']}\n"
                f"Section: {section['section']}\n"
                f"Content: {section['law']}\n\n"
            )
        return "".join(parts)

# Smoke test: build the QA system from the compressed dataset and print the
# answer to one sample question.
qa_system = LawQASystem(csv_path='/content/new_compressed.csv')
query = "When can the Central Government remove the Chairperson or a Member under the Aadhaar Act, 2016?"
print(qa_system.answer_query(query=query))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from PIL import Image, ImageDraw, ImageFont
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import re
import os
import pickle

class LawQASystem:
    """Semantic search over law sections with on-demand summaries.

    Section text from a CSV (columns ``act_title``, ``section``, ``law``) is
    embedded with the ``all-MiniLM-L6-v2`` sentence-transformer.  Queries are
    ranked by cosine similarity, and a selected hit can be summarised with
    the ``nsi319/legal-pegasus`` seq2seq model.  Models and embeddings can be
    saved to, and restored from, a directory.
    """

    # Hits at or below this cosine similarity are not reported.
    MIN_SIMILARITY = 0.3

    # Bracketed numeric footnote markers such as "[12]".
    _FOOTNOTE_RE = re.compile(r'\[\d+\]')
    # Everything except word characters, whitespace and - . , ( )
    _DISALLOWED_RE = re.compile(r'[^\w\s\-.,()]')

    # Failures that mean "the saved artefacts are unusable; rebuild".
    # OSError covers missing/partial files, ValueError covers an
    # unrecognised model config or stale embeddings, and the pickle errors
    # cover a truncated or corrupt embeddings file.
    _LOAD_ERRORS = (OSError, ValueError, pickle.UnpicklingError, EOFError)

    def __init__(self, csv_path, load_path=None):
        """Load the dataset and restore or build the models.

        Args:
            csv_path: Path to a CSV readable by ``pandas.read_csv``.
            load_path: Optional directory previously written by
                ``save_models_and_embeddings``.  If it is missing or cannot
                be loaded, the models are initialised from scratch.
        """
        self.df = pd.read_csv(csv_path)
        self.df['cleaned_law'] = self.df['law'].apply(self.preprocess_text)
        self.last_results = []

        try:
            if not (load_path and os.path.exists(load_path)):
                raise FileNotFoundError("Model path not found")
            self.load_saved_models(load_path)
        except self._LOAD_ERRORS as e:
            print(f"Loading saved models failed: {str(e)}")
            print("Initializing new models...")
            self.initialize_new_models()

    def initialize_new_models(self):
        """Create the models and embed every cleaned law text."""
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        print("Generating embeddings...")
        self.law_embeddings = self.model.encode(
            self.df['cleaned_law'].tolist(), show_progress_bar=True)
        print("Loading Legal Pegasus model...")
        self.summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")
        self.summarizer_tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")

    @staticmethod
    def _artifact_paths(base_path):
        """Return the four locations used inside a save directory."""
        return {
            'sentence_transformer': os.path.join(base_path, "sentence_transformer"),
            'pegasus_model': os.path.join(base_path, "legal_pegasus_model"),
            'pegasus_tokenizer': os.path.join(base_path, "legal_pegasus_tokenizer"),
            'embeddings': os.path.join(base_path, "law_embeddings.pkl"),
        }

    def save_models_and_embeddings(self, save_path):
        """Write both models, the tokenizer and the embeddings to ``save_path``.

        The directory is created if needed.  Any error raised while saving
        propagates to the caller.
        """
        os.makedirs(save_path, exist_ok=True)
        paths = self._artifact_paths(save_path)

        self.model.save(paths['sentence_transformer'])
        self.summarizer_model.save_pretrained(paths['pegasus_model'])
        self.summarizer_tokenizer.save_pretrained(paths['pegasus_tokenizer'])

        with open(paths['embeddings'], 'wb') as f:
            pickle.dump(self.law_embeddings, f)

    def load_saved_models(self, load_path):
        """Restore models and embeddings written by ``save_models_and_embeddings``.

        Raises:
            FileNotFoundError: ``load_path`` or one of its artefacts is missing.
            ValueError: The stored embeddings do not have one row per
                dataset row (for example, the CSV changed since saving).
        """
        if not os.path.exists(load_path):
            raise FileNotFoundError(f"Load path does not exist: {load_path}")

        paths = self._artifact_paths(load_path)
        if not all(os.path.exists(p) for p in paths.values()):
            raise FileNotFoundError("One or more model files are missing")

        self.model = SentenceTransformer(paths['sentence_transformer'])
        self.summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(paths['pegasus_model'])
        self.summarizer_tokenizer = AutoTokenizer.from_pretrained(paths['pegasus_tokenizer'])

        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point load_path at directories this program wrote itself.
        with open(paths['embeddings'], 'rb') as f:
            self.law_embeddings = pickle.load(f)

        # Results are looked up by row position, so stale embeddings would
        # point at the wrong (or a non-existent) row.
        if len(self.law_embeddings) != len(self.df):
            raise ValueError(
                f"Saved embeddings have {len(self.law_embeddings)} rows but "
                f"the dataset has {len(self.df)}")

    def preprocess_text(self, text):
        """Return a normalised, lower-cased version of ``text``.

        Missing values become ``""``; non-string values are converted with
        ``str``.  Footnote markers are removed *before* other punctuation is
        stripped, otherwise the brackets are gone by the time the footnote
        pattern runs and the digits would be left behind.
        """
        if pd.isna(text):
            return ""
        text = self._FOOTNOTE_RE.sub('', str(text))
        text = self._DISALLOWED_RE.sub(' ', text.lower())
        return ' '.join(text.split())

    def get_most_relevant_sections(self, query, top_k=3):
        """Return up to ``top_k`` result dicts for ``query``, best first.

        Each dict has the keys ``act_title``, ``section``, ``law`` and
        ``similarity``; only scores above ``MIN_SIMILARITY`` are kept.  The
        returned list is also stored in ``self.last_results``.
        """
        results = []
        self.last_results = results
        # A slice of [-0:] would return everything, so handle it explicitly.
        if top_k <= 0:
            return results

        query_embedding = self.model.encode([self.preprocess_text(query)])
        scores = cosine_similarity(query_embedding, self.law_embeddings)[0]

        for idx in np.argsort(scores)[::-1][:top_k]:
            if scores[idx] > self.MIN_SIMILARITY:
                row = self.df.iloc[idx]
                results.append({
                    'act_title': row['act_title'],
                    'section': row['section'],
                    'law': row['law'],
                    'similarity': scores[idx],
                })

        return results

    def summarize_text(self, text):
        """Return a summary of ``text`` generated by the Legal Pegasus model.

        The input is truncated to 1024 tokens; generation uses beam search
        with a 100-150 token output length.
        """
        input_tokenized = self.summarizer_tokenizer.encode(
            text,
            return_tensors="pt",
            max_length=1024,
            truncation=True,
        )

        summary_ids = self.summarizer_model.generate(
            input_tokenized,
            num_beams=9,
            no_repeat_ngram_size=3,
            length_penalty=2.0,
            min_length=100,
            max_length=150,
            early_stopping=True,
        )

        return self.summarizer_tokenizer.decode(
            summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

    def answer_query(self, query):
        """Return a printable answer listing the hits and a selection prompt."""
        relevant_sections = self.get_most_relevant_sections(query)

        if not relevant_sections:
            return "I could not find any relevant sections matching your query. Please try rephrasing your question."

        parts = ["Based on your query, here are the most relevant sections from the law:\n\n"]
        for i, section in enumerate(relevant_sections, 1):
            parts.append(
                f"Result {i} (Similarity: {section['similarity']:.2f}):\n"
                f"Act: {section['act_title']}\n"
                f"Section: {section['section']}\n"
                f"Content: {section['law']}\n\n"
            )
        parts.append(
            "\nPlease select the most relevant result (1-{}) for a summarized version: ".format(
                len(relevant_sections)))
        return "".join(parts)

    def get_summary_for_selection(self, selection):
        """Summarise the ``selection``-th (1-based) entry of ``last_results``.

        Returns an explanatory message instead of raising when ``selection``
        is not a number or is out of range.
        """
        try:
            selection = int(selection)
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric objects.
            return "Invalid input. Please enter a number."

        if not 1 <= selection <= len(self.last_results):
            return "Invalid selection. Please choose a number between 1 and {}.".format(len(self.last_results))

        summary = self.summarize_text(self.last_results[selection - 1]['law'])
        return f"\nSummary of selected section:\n{summary}"


def _read_line(prompt):
    """Return the stripped line typed at ``prompt``, or None if input ended."""
    try:
        return input(prompt).strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt means "stop", not "crash".
        return None


def main(csv_path='/content/new.csv',
         save_path='/content/drive/MyDrive/law_qa_models'):
    """Run the interactive question/answer loop.

    Args:
        csv_path: Dataset CSV passed to ``LawQASystem``.
        save_path: Directory used both to restore saved models at start-up
            and as the target of the ``save`` command.

    Commands: ``exit`` quits, ``save`` writes models and embeddings to
    ``save_path``; any other non-empty input is treated as a question.
    """
    # Restores saved models when available; otherwise builds them.
    qa_system = LawQASystem(csv_path, load_path=save_path)

    print("Welcome to the Law QA System!")
    print("Type 'exit' to quit or 'save' to save models.")

    while True:
        query = _read_line("\nPlease enter your question: ")

        if query is None:
            print("\nThank you for using the Law QA System!")
            break

        command = query.lower()

        if command == 'exit':
            print("Thank you for using the Law QA System!")
            break

        if command == 'save':
            # A failed save (e.g. Drive not mounted) must not end the
            # session and throw away the models already in memory.
            try:
                qa_system.save_models_and_embeddings(save_path)
            except Exception as exc:
                print(f"Could not save models: {exc}")
            else:
                print("Models and embeddings saved successfully!")
            continue

        if not query:
            print("Please enter a valid question.")
            continue

        answer = qa_system.answer_query(query)
        print("\nAnswer:")
        print(answer)

        if qa_system.last_results:
            selection = _read_line("\nEnter your selection (or press Enter to skip): ")
            if selection is None:
                print("\nThank you for using the Law QA System!")
                break
            if selection:
                print(qa_system.get_summary_for_selection(selection))

# Start the interactive loop only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



Loading saved models failed: Unrecognized model in /content/drive/MyDrive/law_qa_models/sentence_transformer. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, fl

Batches:   0%|          | 0/1071 [00:00<?, ?it/s]

Loading Legal Pegasus model...


config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Welcome to the Law QA System!
Type 'exit' to quit or 'save' to save models.

Answer:
Based on your query, here are the most relevant sections from the law:

Result 1 (Similarity: 0.78):
Act: Prohibition of Benami Property Transactions Act, 1988
Section: 35
Content: 35. Removal of Chairperson and Member from office in certain circumstances.-
(1) The Central Government may, in consultation with the Chief Justice of High Court, remove from office of the Chairperson or any Member, who-
(a) has been adjudged as an insolvent; or
(b) has been convicted of an offence which, in the opinion of the Central Government involves moral turpitude; or
(c) has become physically or mentally incapable; or
(d) has acquired such financial or other interest as is likely to affect prejudicially his functions; or (e) has so abused his position as to render his continuance in office prejudicial to the public interest.
(2) The Chairperson or Judicial Member shall not be removed from his office except by an order