# Create Dataset

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, LEDModel
import torch
import nltk
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import os

nltk.download('punkt')

# Initialize
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load dataset
dataset = load_dataset("Yashaswat/Indian-Legal-Text-ABS")
split = dataset['train'].select(range(2000))

# Load model
model_name = "allenai/led-base-16384"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = LEDModel.from_pretrained(model_name).to(device)
model.eval()

def get_sentence_embedding(text):
    if not text or not isinstance(text, str):
        return np.zeros(model.config.hidden_size)
    
    try:
        inputs = tokenizer(
            text, return_tensors="pt", 
            truncation=True, max_length=1024, 
            padding="max_length"
        ).to(device)
        
        with torch.no_grad():
            output = model(**inputs).last_hidden_state
            return output.mean(dim=1).squeeze().cpu().numpy()
    except Exception as e:
        print(f"Error: {str(e)}")
        return np.zeros(model.config.hidden_size)

def extract_summary(judgement, top_k=15):
    sentences = [s for s in nltk.sent_tokenize(judgement) if s.strip()]
    if len(sentences) <= top_k:
        return " ".join(sentences)
    
    embeddings = []
    batch_size = 8  # Adjust based on your GPU memory
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        embeddings.extend(get_sentence_embedding(sent) for sent in batch)
    
    doc_embedding = np.mean(embeddings, axis=0)
    sims = cosine_similarity(embeddings, [doc_embedding]).flatten()
    top_indices = np.argsort(sims)[-top_k:]
    
    return " ".join([sentences[i] for i in sorted(top_indices)])

# Process with memory management
extractive_summaries = []
for row in tqdm(split, desc="Processing"):
    try:
        summary = extract_summary(row['judgement'])
        extractive_summaries.append(summary)
    except Exception as e:
        print(f"Failed on row: {e}")
        extractive_summaries.append("")

# Save results
df = pd.DataFrame(split)
df["extractive_summary"] = extractive_summaries
df.to_csv("train_2000_extractive_summary_led.csv", index=False)
print("✅ Summaries saved!")

In [2]:
import pandas as pd

In [5]:
df = pd.DataFrame(extractive_summaries, columns=['summary'])

In [6]:
df.to_csv('extractive_summaries.csv')

In [7]:
extractive_summaries[0]

"From the Judgment and Order dated 25 6 74 of the Karna taka 'High Court in Civil Revision No 1981/73. This appeal by special leave is from the judgment .dated 25 June, 1974 of the Karnataka High Court. The respondent filed a suit for the grant of a permanent injunction restraining the appellant from interfering with the possession. The respondent contended that he was still a tenant. The appellant obtained a decree in the suit. Upon remand the respondent applied for the amendment of the written statement. The respondent claimed protection under the Karnataka Land Reforms Act, 1961. The appellant opposed the application for stay of the suit by the civil court and referring to the Tribunal for decision under the Karnataka Land Reforms Act, 1961. The trial Court dismissed the application of the respondent. The suit is for recovery of possession and for damages, for unauthorised occupation of the respondent. Therefore, no question can be referred for determination by the Tribunal under se