In [6]:
!pip install datasets==2.10.0 rouge_score tabulate tqdm



In [1]:
from datasets import load_dataset
import pandas as pd

pd.read_csv('extractive_summaries.csv')

Unnamed: 0.1,Unnamed: 0,summary
0,0,From the Judgment and Order dated 25 6 74 of t...
1,1,718 of 1979 From the Judgment and Order dated ...
2,2,She has made a prayer that the respondents may...
3,3,Section 4 provides for an application for fixa...
4,4,In other words the proceedings were allegedly ...
...,...,...
1015,1015,"These two appeals by special leave, one pre fe..."
1016,1016,The assessee was already in existence but the ...
1017,1017,1 in CA 1742/69 and for the Appellant in CA 17...
1018,1018,Appeal by special leave from the judgment and ...


# Final Dataset

In [None]:
import pandas as pd
from datasets import load_dataset

dataset = load_dataset("Yashaswat/Indian-Legal-Text-ABS")

judgement_data = dataset['train'].select(range(1020))
df_judgements = pd.DataFrame(judgement_data)

df_summaries = pd.read_csv("extractive_summaries.csv")   

assert len(df_judgements) == 1020
assert len(df_summaries) == 1020

final_df = pd.DataFrame({
    "judgement": df_judgements["judgement"],
    "summary": df_summaries["summary"]
})

final_df.to_csv("final_data.csv", index=False)

print("final_data.csv created successfully!")


Using the latest cached version of the dataset since Yashaswat/Indian-Legal-Text-ABS couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\jpsre\.cache\huggingface\datasets\Yashaswat___indian-legal-text-abs\default\0.0.0\a26ff9215839ebeeb88db6380ea67c8af3875a49 (last modified on Fri Apr  4 15:33:19 2025).


✅ final_data.csv created successfully!


In [8]:
pd.read_csv('final_data.csv').head()

Unnamed: 0,judgement,summary
0,Appeal No. 623 of 1975.\nFrom the Judgment and...,From the Judgment and Order dated 25 6 74 of t...
1,N: Criminal Appeal No. 718 of 1979 From the Ju...,718 of 1979 From the Judgment and Order dated ...
2,it Petition (Civil) No. 623 of 1989.\n(Under A...,She has made a prayer that the respondents may...
3,Appeals Nos.\n50 of 1968 and 1201 of 1970.\nFr...,Section 4 provides for an application for fixa...
4,N: Civil Appeal No. 135 of 1991.\nFrom the Jud...,In other words the proceedings were allegedly ...


# Traditional Methods

In [None]:
import nltk
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from tabulate import tabulate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_metric

nltk.download("punkt")


def tfidf_summarization(text, top_k=5):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= top_k:
        return text
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = np.argsort(sentence_scores)[-top_k:][::-1]
    return " ".join([sentences[i] for i in sorted(ranked_sentences)])

def lsa_summarization(text, top_k=5):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= top_k:
        return text
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    svd = TruncatedSVD(n_components=1)
    svd_matrix = svd.fit_transform(tfidf_matrix)
    sentence_scores = svd_matrix.flatten()
    ranked_sentences = np.argsort(sentence_scores)[-top_k:][::-1]
    return " ".join([sentences[i] for i in sorted(ranked_sentences)])

def textrank_summarization(text, top_k=5):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= top_k:
        return text
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(scores, key=scores.get, reverse=True)[:top_k]
    return " ".join([sentences[i] for i in sorted(ranked_sentences)])


df = pd.read_csv("final_data.csv")
df = df.dropna(subset=["judgement", "summary"])

rouge = load_metric("rouge")

def evaluate_rouge(predictions, references):
    result = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    formatted = {
        "ROUGE-1": round(result["rouge1"].mid.fmeasure * 100, 2),
        "ROUGE-2": round(result["rouge2"].mid.fmeasure * 100, 2),
        "ROUGE-L": round(result["rougeL"].mid.fmeasure * 100, 2)
    }
    return formatted

results = []

def evaluate_method(method_fn, name, top_k=5, limit=100):
    predictions, references = [], []
    for _, row in tqdm(df.head(limit).iterrows(), total=min(limit, len(df))):
        pred = method_fn(row["judgement"], top_k=top_k)
        predictions.append(pred)
        references.append(row["summary"])
    scores = evaluate_rouge(predictions, references)
    results.append([name, scores["ROUGE-1"], scores["ROUGE-2"], scores["ROUGE-L"]])

evaluate_method(tfidf_summarization, "TF-IDF")
evaluate_method(lsa_summarization, "LSA")
evaluate_method(textrank_summarization, "TextRank")

print("\n ROUGE Evaluation Results:\n")
print(tabulate(results, headers=["Method", "ROUGE-1", "ROUGE-2", "ROUGE-L"], tablefmt="fancy_grid"))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jpsre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  rouge = load_metric("rouge")
100%|██████████| 100/100 [00:00<00:00, 194.12it/s]
100%|██████████| 100/100 [00:01<00:00, 90.22it/s]
100%|██████████| 100/100 [00:12<00:00,  8.25it/s]



📊 ROUGE Evaluation Results:

╒══════════╤═══════════╤═══════════╤═══════════╕
│ Method   │   ROUGE-1 │   ROUGE-2 │   ROUGE-L │
╞══════════╪═══════════╪═══════════╪═══════════╡
│ TF-IDF   │     48.92 │     16.99 │     21.68 │
├──────────┼───────────┼───────────┼───────────┤
│ LSA      │     53.48 │     23.81 │     26.81 │
├──────────┼───────────┼───────────┼───────────┤
│ TextRank │     53.68 │     23.82 │     26.61 │
╘══════════╧═══════════╧═══════════╧═══════════╛


# DeepLearning Methods

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

nltk.download("punkt")


def preprocess_data(texts, summaries):
    processed_texts = []
    processed_labels = []

    for text, summary in zip(texts, summaries):
        text_sentences = nltk.sent_tokenize(str(text))
        summary_sentences = nltk.sent_tokenize(str(summary))

        labels = [1 if sentence in summary_sentences else 0 for sentence in text_sentences]

        processed_texts.extend(text_sentences)
        processed_labels.extend(labels)

    return processed_texts, processed_labels


class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.float),
        }


class ExtractiveSummarizer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, n_heads=8):
        super(ExtractiveSummarizer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if n_layers > 1 else 0,
        )

        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim * 2, num_heads=n_heads)

        self.conv1d = nn.Conv1d(in_channels=hidden_dim * 2, out_channels=hidden_dim, kernel_size=3, padding=1)

        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, output_dim),
        )

        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        embedded = self.dropout(self.embedding(input_ids)) 
        lstm_output, _ = self.lstm(embedded)
        lstm_output = self.layer_norm(lstm_output)

        lstm_output = lstm_output.permute(1, 0, 2)
        attn_output, _ = self.attention(lstm_output, lstm_output, lstm_output)
        attn_output = attn_output.permute(1, 0, 2)

        conv_input = attn_output.permute(0, 2, 1)
        conv_output = self.conv1d(conv_input)
        conv_output = conv_output.permute(0, 2, 1)

        pooled_output = torch.mean(conv_output, dim=1)

        output = self.fc(pooled_output)

        return torch.sigmoid(output).squeeze(1)


def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device).float()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)


def generate_summary(model, text, tokenizer, max_len, device, top_k=10):
    model.eval()
    sentences = nltk.sent_tokenize(text)
    summaries = []

    for sentence in sentences:
        encoding = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        with torch.no_grad():
            output = model(input_ids, attention_mask)
            summaries.append((sentence, output.item()))

    summaries.sort(key=lambda x: x[1], reverse=True)
    return ". ".join([sentence for sentence, score in summaries[:top_k]])


if __name__ == "__main__":
    VOCAB_SIZE = 30522
    EMBEDDING_DIM = 256
    HIDDEN_DIM = 128
    OUTPUT_DIM = 1
    N_LAYERS = 2
    DROPOUT = 0.3
    MAX_LEN = 1024
    BATCH_SIZE = 4
    LR = 1e-4
    EPOCHS = 3
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

    df = pd.read_csv("final_data.csv")  
    df = df.dropna(subset=["judgement", "summary"])

    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    train_texts, train_summaries = train_df["judgement"].tolist(), train_df["summary"].tolist()
    val_texts, val_summaries = val_df["judgement"].tolist(), val_df["summary"].tolist()


    train_texts, train_labels = preprocess_data(train_texts, train_summaries)
    val_texts, val_labels = preprocess_data(val_texts, val_summaries)

    train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    model = ExtractiveSummarizer(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR)
    criterion = nn.BCELoss()

    for epoch in range(EPOCHS):
        train_loss = train_model(model, train_loader, optimizer, criterion, DEVICE)
        print(f"\nEpoch {epoch+1}/{EPOCHS} - Loss: {train_loss:.4f}")

    example_text = df.iloc[0]["judgement"]
    summary = generate_summary(model, example_text, tokenizer, MAX_LEN, DEVICE)
    print("\nGenerated Summary:\n", summary)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jpsre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Training: 100%|██████████| 33477/33477 [20:06<00:00, 27.74it/s]



Epoch 1/3 - Loss: 0.2820


Training: 100%|██████████| 33477/33477 [20:17<00:00, 27.49it/s]



Epoch 2/3 - Loss: 0.2505


Training: 100%|██████████| 33477/33477 [20:11<00:00, 27.63it/s]



Epoch 3/3 - Loss: 0.2398

Generated Summary:
 The respondent filed a suit for the grant of a permanent injunction restraining the appellant from interfering with the possession.. The trial Court in the present case rightly said that it could not be said that there was any dispute as to tenancy.. In view of the fact that no costs were al lowed by the High Court, there will be no order as to costs.. This appeal by special leave is from the judgment .dated 25 June, 1974 of the Karnataka High Court.. The question whether the respondent is a tenant or deemed to be a tenant does not at all arise because the tenancy came to an end.. The respondent by notice was called upon to hand over possession of the land immediately after the expiry of the period of lease.. The appellant opposed the application for stay of the suit by the civil court and referring to the Tribunal for decision under the Karnataka Land Reforms Act, 1961.. The High Court reversed the decision of the trial Court and directed

# save model

In [None]:
MODEL_PATH = "extractive_summarizer_model.pth"
torch.save(model.state_dict(), MODEL_PATH)
print(f"\nModel saved to {MODEL_PATH}")
tokenizer.save_pretrained("saved_tokenizer")
print("Tokenizer saved to 'saved_tokenizer/' directory")


Model saved to extractive_summarizer_model.pth
Tokenizer saved to 'saved_tokenizer/' directory


# Testing 

In [5]:
text = """
Appeal No. 623 of 1975.
From the Judgment and Order dated 25 6 74 of the Karnataka 'High Court in Civil Revision No 1981/73.
S.S. JavaIi and B.P. Singh, for the Appellants.
S.V. Gupte and K.N. Bhatt, for the Respondent.
The Judgment of the Court was delivered by RAY, C.J.
This appeal by special leave is from the judgment .dated 25 June, 1974 of the Karnataka High Court.
The principal question in this appeal whether section 107 of the Karnataka Land Reforms Act, 1961 applies to the land in suit which was leased to the respondent.
A large plot of land comprising an area of about 20 acres popularly known as "The Chamaraja Sewage Farm" situate in the city of Bangalore belongs to the appellant Corpora tion.
The appellant :leased to the respondent by a regis tered lease dated 14 September, 270 1953 the aforementioned land for a period of 5 years on an annual rent of Rs. 13,555/ .
The respondent by notice was called upon to hand over possession of the land immediately after the expiry of the period of lease.
The respondent failed to deliver possession.
The reason why the appellant required that land is that the Corporation proposed a scheme for the development and construction of a new township on that area.
The respondent filed a suit for the grant of a permanent injunction restraining the appellant from interfering with the possession.
The Court upheld the contentions of the appellant that the lease had terminated by efflux of time.
The respondent 's 'suit was dismissed.
An appeal was pre ferred.
The appeal was dismissed on 21 August,.
The appellant then instituted the suit in appeal claim ing possession from the respondent.
The appellant contended that the respondent was a trespasser and claimed damages for unauthorised occupation.
The respondent contended that he was still a tenant.
The respondent claimed protection under the Mysore Tenants (Temporary Protection from Eviction) Act, 1961 being Act No. 15 of 1961.
Section 3 of the Mysore Tenants (Temporary Protection from Evic .
tion) Act, 1961 provided for prohibition against eviction.
The appellant obtained a decree in the suit.
The decree directed the respondent to deliver possession.
The respond ent preferred an ' appeal.
The High Court remanded the matter to the trial Court for assessment of damages.
Upon remand the respondent applied for the amendment of the written statement.
The respondent claimed protection under the Karnataka Land Reforms Act, 1961.
It may be stated here that the Mysore Tenants (Temporary Protection from Eviction) Act, 1961 ceased to be in force in March, 1966.
That is perhaps why the respondent made an applica tion for amendment of the written statement on 2 February 1973.
The respondent contended relying on section 133 of the Karnataka Land Reforms Act, 1961 that the.
suit should be stayed by the civil court and should be referred to the Tribunal for decision.
Section 112(B)(b) of the Karnataka Land Reforms Act, 1961 confers power on the Tribunal to decide inter alia whether a person is a tenant or not.
The respondent contended that he was a person who was deemed to be a tenant.
The appellant opposed the application for stay of the suit by the civil court and referring to the Tribunal for decision under the Karnataka Land Reforms Act, 1961.
The trial Court held that the land ' belonging to the appellant was exempted from the application of the provisions of the Land Reforms Act.
The trial Court dismissed the application of the respondent.
The respondent presented a revision petition t0 the High Court.
The High Court reversed the decision of the trial Court and directed the trial Court to refer such of the issues which are required to be.
decided by the Tribunal.
271 Counsel for the respondent contended that the respondent is a tenant within the meaning of the word "tenant" defined in section 2(34) of the Karnataka Land Reforms Act, 1961.
"Tenant" is defined to mean an agriculturist who cultivates personally the land he holds on lease from a landlord and includes (i) a person who is deemed to be a tenant under section 4 of the Karnataka Land Reforms Act, 1961, Section of the Karnataka Land Reforms Act, 1961 states that a person lawfully cultivating any land belonging to another person shall be deemed to be a tenant if such land is not cultivat ed personally by the owner and if such person is not (a) a member of the owner 's family, or (b) a servant or a hired labourer on wages, or (c) a mortgage in possession It was, therefore, said that the respondent could raise the con tention whether the respondent was a tenant or not.
It was next contended that section 8 of the Karnataka Land Reforms Act, 1961 speaks of rent and rent is referable to tenant and therefore a dispute as to tenancy would be within the ambit of the Karnataka Land Reforms Act, 1961.
Section 107 of the Karnataka Land Reforms Act, 1961 states that subject to the provisions of section 110 nothing in this Act, except section 8 shall apply to lands, inter alia (iii) belonging to or held on lease or from a local authority.
There is no dispute that the land was given on lease by the local authority.
There is also no 'dispute that the land belongs to the local authority.
There is also no dispute that the lease was detrmined by efflux of time.
The question whether the respondent is a tenant or deemed to be a tenant does not at all arise because the tenancy came to an end.
The 'respondent thereafter was a trespasser.
Section 107 of the Karnataka Land Reforms Act, 1961 makes it quite clear that the only provision which applies, inter alia, to lands belonging to or hold on lease or from a local authority is section 8.
No other section of the Land Reforms Act applies to these lands.
Section 8 of the Karna taka Land Reforms Act, 1961 deals with rent.
The suit in the present case was not for recovery of rent.
The suit is for recovery of possession and for damages, for unauthorised occupation of the respondent.
Section 2 of the Karnataka Land Reforms Act, 1961 is not applicable.
Therefore, no question can be referred for determination by the Tribunal under section 133.
The Mysore Tenants (Temporary Protection from Eviction) Act, 1961 came into effect on 13 December, 1961.
The Mysore Tenants (Temporary Protection from Eviction) Act, 1961 remained in force till the month of March, 1966.
The re spondent could not draw any support from that Act for pro tection against eviction.
The land in question was outside the applicability of the Mysore Tenants (Temporary Protec tion from Eviction) Act, 1961.
Further the Act ceased to be in operation in 1966 and no question could be referred for determination as to whether the respondent was a tenant under the Mysore Tenants (Temporary Protection from Evic tion) Act, 1961 or not.
The trial Court in the present case rightly said that it could not be said that there was any dispute as to tenancy.
272 The respondent had filed a suit where he claimed to remain in possession.
The suit of the respondent was dismissed.
The appellant all along contended that the lease dated 14 September 1963 for a period of 5 years expired by efflux of time.
The appellant claimed possession on the ground Of unauthorised occupation and claimed damages against the respondent, who was a trespasser.
The High Court was clearly in error in referring to the Tribunal under the Karnataka Land Reforms Act 1961 determi nation of the plea taken by the respondent that he was pro tected by the Mysore Tenants (Temporary Protection from Eviction) Act 1961.
Counsel for the respondent did not support the judgment on that ground.
Counsel for the respondent contended that section 133 of the Karnataka Land Reforms Act 1961 excludes jurisdiction of Civil court in suits for possession where the defendant claims to be a tenant.
The plea of the respondent is utterly unsound.
Section 133 of the Karnataka Land Reforms Act 1961 cannot apply to lands which are held by a person on lease from the local authority or where the lease had ex pired and the local authority sues for possession on the ground that there is unauthorised occupation.
No provision of the Karnataka Land Reforms Act can be relied upon to contend that there should be protection against recovery of possession by the local authority.
For the foregoing reasons the judgment of the High Court is set aside.
In view of the fact that no costs were al lowed by the High Court, there will be no order as to costs.
M.R. Appeal allowed.
"""

In [7]:
import pandas as pd

df = pd.read_csv('final_data.csv')
tt = df['judgement'][1]

# Final

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from rouge_score import rouge_scorer

nltk.download("punkt")

def preprocess_data(texts, summaries):
    processed_texts = []
    processed_labels = []

    for text, summary in zip(texts, summaries):
        text_sentences = nltk.sent_tokenize(str(text))
        summary_sentences = nltk.sent_tokenize(str(summary))

        labels = [1 if sentence in summary_sentences else 0 for sentence in text_sentences]

        processed_texts.extend(text_sentences)
        processed_labels.extend(labels)

    return processed_texts, processed_labels


class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.float),
        }

class ExtractiveSummarizer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, n_heads=8):
        super(ExtractiveSummarizer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if n_layers > 1 else 0,
        )

        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim * 2, num_heads=n_heads)

        self.conv1d = nn.Conv1d(in_channels=hidden_dim * 2, out_channels=hidden_dim, kernel_size=3, padding=1)

        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, output_dim),
        )

        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        embedded = self.dropout(self.embedding(input_ids)) 
        lstm_output, _ = self.lstm(embedded)
        lstm_output = self.layer_norm(lstm_output)

        lstm_output = lstm_output.permute(1, 0, 2)
        attn_output, _ = self.attention(lstm_output, lstm_output, lstm_output)
        attn_output = attn_output.permute(1, 0, 2)

        conv_input = attn_output.permute(0, 2, 1)
        conv_output = self.conv1d(conv_input)
        conv_output = conv_output.permute(0, 2, 1)

        pooled_output = torch.mean(conv_output, dim=1)

        output = self.fc(pooled_output)

        return torch.sigmoid(output).squeeze(1)


def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device).float()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)


def generate_summary(model, text, tokenizer, max_len, device, top_k=10):
    model.eval()
    sentences = nltk.sent_tokenize(text)
    summaries = []

    for sentence in sentences:
        encoding = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        with torch.no_grad():
            output = model(input_ids, attention_mask)
            summaries.append((sentence, output.item()))

    summaries.sort(key=lambda x: x[1], reverse=True)
    return ". ".join([sentence for sentence, score in summaries[:top_k]])


def evaluate_rouge(model, texts, references, tokenizer, max_len, device, top_k=10):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for text, reference in tqdm(zip(texts, references), total=len(texts), desc="Evaluating ROUGE"):
        generated_summary = generate_summary(model, text, tokenizer, max_len, device, top_k)
        scores = scorer.score(reference, generated_summary)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)

    print("\nROUGE Evaluation:")
    print(f"ROUGE-1: {avg_rouge1:.4f}")
    print(f"ROUGE-2: {avg_rouge2:.4f}")
    print(f"ROUGE-L: {avg_rougeL:.4f}")


if __name__ == "__main__":
    VOCAB_SIZE = 30522
    EMBEDDING_DIM = 256
    HIDDEN_DIM = 128
    OUTPUT_DIM = 1
    N_LAYERS = 2
    DROPOUT = 0.3
    MAX_LEN = 1024
    BATCH_SIZE = 4
    LR = 1e-4
    EPOCHS = 3
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

    df = pd.read_csv("final_data.csv")
    df = df.dropna(subset=["judgement", "summary"])

    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    train_texts, train_summaries = train_df["judgement"].tolist(), train_df["summary"].tolist()
    val_texts, val_summaries = val_df["judgement"].tolist(), val_df["summary"].tolist()

    train_texts, train_labels = preprocess_data(train_texts, train_summaries)
    val_texts, val_labels = preprocess_data(val_texts, val_summaries)

    train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    model = ExtractiveSummarizer(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR)
    criterion = nn.BCELoss()

    for epoch in range(EPOCHS):
        train_loss = train_model(model, train_loader, optimizer, criterion, DEVICE)
        print(f"\nEpoch {epoch+1}/{EPOCHS} - Loss: {train_loss:.4f}")

    example_text = df.iloc[0]["judgement"]
    summary = generate_summary(model, example_text, tokenizer, MAX_LEN, DEVICE)
    print("\nGenerated Summary:\n", summary)

    evaluate_rouge(model, val_df["judgement"].tolist(), val_df["summary"].tolist(), tokenizer, MAX_LEN, DEVICE, top_k=10)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jpsre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Training: 100%|██████████| 33477/33477 [19:46<00:00, 28.21it/s]



Epoch 1/3 - Loss: 0.2835


Training: 100%|██████████| 33477/33477 [19:48<00:00, 28.16it/s]



Epoch 2/3 - Loss: 0.2523


Training: 100%|██████████| 33477/33477 [20:26<00:00, 27.30it/s]



Epoch 3/3 - Loss: 0.2421

Generated Summary:
 The trial Court in the present case rightly said that it could not be said that there was any dispute as to tenancy.. The respondent filed a suit for the grant of a permanent injunction restraining the appellant from interfering with the possession.. 272 The respondent had filed a suit where he claimed to remain in possession.. In view of the fact that no costs were al lowed by the High Court, there will be no order as to costs.. The High Court remanded the matter to the trial Court for assessment of damages.. This appeal by special leave is from the judgment .dated 25 June, 1974 of the Karnataka High Court.. The appellant then instituted the suit in appeal claim ing possession from the respondent.. The question whether the respondent is a tenant or deemed to be a tenant does not at all arise because the tenancy came to an end.. The High Court reversed the decision of the trial Court and directed the trial Court to refer such of the issues

Evaluating ROUGE: 100%|██████████| 204/204 [05:05<00:00,  1.50s/it]


ROUGE Evaluation:
ROUGE-1: 0.6010
ROUGE-2: 0.4185
ROUGE-L: 0.3266





# Testing

In [None]:
import torch
import pandas as pd
import re
from transformers import BertTokenizer
from torch import nn
import torch.nn.functional as F
from model import  ExtractiveSummarizer
tokenizer = BertTokenizer.from_pretrained("saved_tokenizer/")

VOCAB_SIZE = 30522
EMBEDDING_DIM = 256
HIDDEN_DIM = 128
OUTPUT_DIM = 1
N_LAYERS = 2
DROPOUT = 0.3
MAX_LEN = 1024
BATCH_SIZE = 4
LR = 1e-4

class ExtractiveSummarizer(nn.Module):
    def __init__(self, vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM,
                 hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM,
                 n_layers=N_LAYERS, dropout=DROPOUT, n_heads=8):
        super(ExtractiveSummarizer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if n_layers > 1 else 0,
        )

        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim * 2, num_heads=n_heads)

        self.conv1d = nn.Conv1d(in_channels=hidden_dim * 2, out_channels=hidden_dim, kernel_size=3, padding=1)

        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, output_dim),
        )

        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask=None):
        embedded = self.dropout(self.embedding(input_ids))   
        lstm_output, _ = self.lstm(embedded)   
        lstm_output = self.layer_norm(lstm_output)

        lstm_output = lstm_output.permute(1, 0, 2)  
        attn_output, _ = self.attention(lstm_output, lstm_output, lstm_output)
        attn_output = attn_output.permute(1, 0, 2)  

        conv_input = attn_output.permute(0, 2, 1)   
        conv_output = self.conv1d(conv_input)   
        conv_output = conv_output.permute(0, 2, 1)  

        pooled_output = torch.mean(conv_output, dim=1)   

        output = self.fc(pooled_output)   

        return torch.sigmoid(output).squeeze(1)   

model = ExtractiveSummarizer()
model.load_state_dict(torch.load("extractive_summarizer_model.pth", map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu")))
model.eval()

ipc_df = pd.read_csv("ipc_sections.csv")
ipc_lookup = {
    row["Section"].strip().upper(): {"Offence": row["Offense"], "Punishment": row["Punishment"]}
    for _, row in ipc_df.iterrows()
}

def extract_ipc_sections(text):
    matches = re.findall(r"Section\s+(\d+[A-Z]?)", text, flags=re.IGNORECASE)
    return list(set([f"IPC_{match.upper()}" for match in matches]))

def summarize_text(text, max_sentences=3):
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    if len(sentences) == 0:
        return ""

    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    input_ids = inputs["input_ids"]

    with torch.no_grad():
        scores = model(input_ids)

    if isinstance(scores, torch.Tensor):
        scores = scores.squeeze().tolist()
    if not isinstance(scores, list):
        scores = [scores]

    top_idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:max_sentences]
    top_idxs.sort()   
    summary = " ".join([sentences[i] for i in top_idxs])
    return summary

def process_legal_text(text):
    print("\n📝 Summary:")
    summary = summarize_text(text)
    print(summary)

    print("\n📚 IPC Sections Referenced:\n")
    sections_found = extract_ipc_sections(text)

    valid_sections = []
    for sec in sections_found:
        details = ipc_lookup.get(sec, None)
        if details:
            valid_sections.append((sec, details))

    if not valid_sections:
        print("⚠️ No valid IPC Sections referenced in the text.")
        return

    for sec, details in valid_sections:
        print(f"➡️ {sec}")
        print(f"   Offense   : {details['Offence']}")
        print(f"   Punishment: {details['Punishment']}\n")

if __name__ == "__main__":
    process_legal_text(tt)



📝 Summary:
The complainant Savai Kala, the brother of the deceased saw the latter part of the occurrence when the deceased was being carried away by the accused. After hearing the learned counsel and examining the petition of appeal and after going through the relevant parts of the judgment of the High Court and the Sessions Court. The appeal is summarily dismissed under S 384 of the Code of Criminal Procedure.

📚 IPC Sections Referenced:

➡️ IPC_302
   Offense   : Murder
   Punishment: Death or Imprisonment for Life + Fine

➡️ IPC_384
   Offense   : Extortion
   Punishment: 3 Years or Fine or Both

➡️ IPC_379
   Offense   : Theft
   Punishment: 3 Years or Fine or Both

