In [None]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import json

In [None]:

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer


In [None]:
dataset_path = "/content/drive/MyDrive/all_v1.json"
df = pd.read_json(dataset_path)

df = df.transpose()
df = df.rename(columns={'original_text': 'normal', 'reference_summary': 'simple'})
df.drop(['doc', 'id', 'title', 'uid', 'case_code', 'case_text', 'note', 'title_code','title_text', 'urls', 'tldr_code','tldr_text'], axis=1, inplace=True)
df = df.reset_index(drop=True)
#shuffled_df = df.sample(frac=1).reset_index(drop=True)

In [None]:
df.head()

Unnamed: 0,normal,simple
0,welcome to the pokémon go video game services ...,hi.
1,by using our services you are agreeing to thes...,by playing this game you agree to these terms....
2,if you want to use certain features of the ser...,you have to use google pokemon trainer club or...
3,during game play please be aware of your surro...,don t die or hurt others and if you do it s no...
4,subject to your compliance with these terms ni...,don t copy modify resell distribute or reverse...


In [None]:
wiki_data = pd.read_csv("/content/drive/MyDrive/wiki_data2.csv")

wiki_data.drop(['Unnamed: 0','example_id','paragraph_alignment', 'sentence_alignment'], axis=1, inplace=True)

In [None]:
wiki_data.head()

Unnamed: 0,normal,simple
0,English law is the common law legal system of ...,"English law, also called common law, is the le..."
1,Solitary confinement is a form of imprisonment...,Solitary confinement is a punishment or specia...
2,Rape is a type of sexual assault usually invol...,Rape is usually defined as having sexual inter...
3,The Southern Poverty Law Center (SPLC) is an A...,The Southern Poverty Hate Center (SPLC) is a U...
4,Waste (or wastes) are unwanted or unusable mat...,Waste (or wastes) are unwanted or unusable mat...


In [None]:
wiki_data.shape

(609, 2)

In [None]:
comb_data = pd.concat([df, wiki_data], ignore_index=True)
comb_data = comb_data.sample(frac=1).reset_index(drop=True)

In [None]:
comb_data.head()

Unnamed: 0,normal,simple
0,A ferry is a merchant vessel used to carry pas...,Ferry is a boat or ship that is used to take p...
1,Fear is a feeling induced by perceived danger ...,Fear is a feeling or an emotion. A person who ...
2,by using the discogs website and affiliated we...,users are subject to the policies and guidelin...
3,"An act of parliament, also called primary legi...",An Act of Parliament or Act is law made by the...
4,if you have any questions or concerns about no...,the service provides a complaint mechanism for...


In [None]:
class LegalSimplificationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_input_length=512, max_output_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]


        #change to df column name
        input_text = "Rewrite the following sentences in a concise and clear summary while retaining the general meaning. Ensure the summary remains factual, neutral, and avoids exaggeration or speculative language: " + str(self.data.iloc[idx]["normal"])

        #chang to df column name
        target_text = str(self.data.iloc[idx]["simple"])

        # Tokenize input and target text
        input_encoding = self.tokenizer(
            input_text, max_length=self.max_input_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            target_text, max_length=self.max_output_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        input_ids = input_encoding["input_ids"].squeeze(0)
        attention_mask = input_encoding["attention_mask"].squeeze(0)
        labels = target_encoding["input_ids"].squeeze(0)
        labels[labels == 0] = -100  # Set padding tokens to -100 for loss calculation

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }



In [None]:
from transformers import T5ForConditionalGeneration
#load tokenizer
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Load model
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
# Split DataFrame into train and validation sets (90% train, 10% validation)
train_size = int(0.9 * len(wiki_data))
train_df = wiki_data[:train_size]
val_df = wiki_data[train_size:]

train_dataset = LegalSimplificationDataset(comb_data, tokenizer)
val_dataset = LegalSimplificationDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [None]:
from torch.cuda.amp import GradScaler
scaler = GradScaler()

  scaler = GradScaler()


In [None]:
import torch.optim as optim

In [None]:
# Freeze encoder layers but keep decoder trainable
for name, param in model.named_parameters():
    # Freeze all encoder parameters
    if "encoder" in name:
        param.requires_grad = False

    # Optionally unfreeze encoder embedding layer
    if "encoder.embed_tokens" in name:
        param.requires_grad = True

Training on wikipedia data first

In [None]:


# Set up optimizer
optimizer = optim.AdamW(filter(lambda p: p.requires_grad,model.parameters()), lr=3e-5, weight_decay=0.01)

# Training loop
epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1, Training Loss: 2.5377
Epoch 2, Training Loss: 2.3893
Epoch 3, Training Loss: 2.2894
Epoch 4, Training Loss: 2.1952
Epoch 5, Training Loss: 2.0938


In [None]:
# Split DataFrame into train and validation sets (90% train, 10% validation)
train_size = int(0.9 * len(df))
train_df = df[:train_size]
val_df = df[train_size:]

train_dataset = LegalSimplificationDataset(comb_data, tokenizer)
val_dataset = LegalSimplificationDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

Training on https://github.com/lauramanor/legal_summarization dataset second

In [None]:
# Training loop
epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

Epoch 1, Training Loss: 2.0293
Epoch 2, Training Loss: 1.9708
Epoch 3, Training Loss: 1.8926
Epoch 4, Training Loss: 1.8218
Epoch 5, Training Loss: 1.7701


In [None]:
model.eval()
total_val_loss = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)

        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)


        loss = outputs.loss
        total_val_loss += loss.item()

avg_val_loss = total_val_loss / len(val_loader)
print(f"Validation Loss: {avg_val_loss:.4f}")

Validation Loss: 0.9004


In [None]:
# Save the model
model.save_pretrained("./t5_legal_simplification")
tokenizer.save_pretrained("./t5_legal_simplification")


'''
from safetensors.torch import load_file, save_file
state_dict = load_file("./t5_legal_simplification/model.safetensors")

# Save them as a PyTorch .bin file
torch.save(state_dict, "./t5_legal_simplification/pytorch_model.bin")
'''

'\nfrom safetensors.torch import load_file, save_file\nstate_dict = load_file("./t5_legal_simplification/model.safetensors")\n\n# Save them as a PyTorch .bin file\ntorch.save(state_dict, "./t5_legal_simplification/pytorch_model.bin")\n'

In [None]:
def generate_simplified_text(text, model, tokenizer, device):
    # Prepare input
    input_text = "Rewrite these sentences using nonlegal terminology while retaining the general meaning:  " + text
    input_encoding = tokenizer(
        input_text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    input_ids = input_encoding["input_ids"].to(device)
    attention_mask = input_encoding["attention_mask"].to(device)

    # Generate output
    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=150,
        num_beams=4,
        early_stopping=True
    )

    # Decode output
    simplified_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return simplified_text

# Example usage
test_text = "The Company can also limit or revoke the use of the Service if You post such objectionable Content. As the Company cannot control all content posted by users and/or third parties on the Service."
test_text2 = "User Accounts When You create an account with Us, You must provide Us information that is accurate, complete, and current at all times. Failure to do so constitutes a breach of the Terms, which may result in immediate termination of Your account on Our Service. You are responsible for safeguarding the password that You use to access the Service and for any activities or actions under Your password, whether Your password is with Our Service or a Third-Party Social Media Service. "
test_text3 = "The duties of the Executive shall include those delegated to him from time to time by the Company. The Executive shall at all times comply with, and be subject to, those policies and procedures as the Company may establish from time to time which are applicable to all employees generally."
simplified = generate_simplified_text(test_text, model, tokenizer, device)
simplified2 = generate_simplified_text(test_text2, model, tokenizer, device)
print(simplified)
print(simplified2)

The service can delete your account without prior notice and without a reason.
When you create an account with us, you must provide us with accurate, complete, and current information. Failure to do so constitutes a breach of the Terms, which may result in immediate termination of your account on Our Service. You are responsible for maintaining the security of your account and for any activities or actions under your account.


In [None]:
import os
print(os.listdir("t5_legal_simplification"))


!mkdir -p "/content/drive/My Drive/t5_fine_tunedv9wiki_and_orig"

!cp -r /content/t5_legal_simplification /content/drive/MyDrive/t5_fine_tunedv9wiki_and_orig

['special_tokens_map.json', 'tokenizer_config.json', 'added_tokens.json', 'spiece.model', 'model.safetensors', 'config.json', 'generation_config.json']


In [None]:
pretrained_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(device)
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("t5_legal_simplification").to(device)

test_text2 = "summarize: Artist shall not detrimentally interfere with the efforts of Company to distribute the Recording through one or more distribution companies or enter into any contract inconsistent with the rights of distribution assigned to Company hereunder."
test_text = "simplify the vocabulary of this legal document: The Company reserves the right, but not the obligation, to, in its sole discretion, determine whether or not any Content is appropriate and complies with this Terms"
inputs = tokenizer(test_text2, return_tensors="pt").to("cuda")

pretrained_output = pretrained_model.generate(**inputs, max_length=200)
fine_tuned_output = fine_tuned_model.generate(**inputs, max_length=200)

print("Pretrained Output:", tokenizer.decode(pretrained_output[0], skip_special_tokens=True))
print("Fine-Tuned Output:", tokenizer.decode(fine_tuned_output[0], skip_special_tokens=True))

txt1 = tokenizer.decode(fine_tuned_output[0], skip_special_tokens=True)

Pretrained Output: a) The Artist shall not interfere with the efforts of Company to distribute the Recording through one or more distribution companies or enter into any contract inconsistent with the rights of distribution assigned to Company hereunder.
Fine-Tuned Output: the rights of distribution assigned to Company hereunder are not affected by the acts of the artist.


# *`Trash bin`*

In [None]:
import spacy
import re

In [None]:
def advanced_simplify_with_spacy(text):
    doc = nlp(text)

    # Create a list to store the simplified tokens
    simplified_text = []
    i = 0

    while i < len(doc):
        # Get current token and its lemma
        token = doc[i]
        token_lemma = token.lemma_.lower()

        # Check if this token starts a phrase in our dictionary
        longest_match = None
        longest_length = 0

        # Look for multi-word phrases
        for j in range(1, 6):  # Check phrases up to 5 words
            if i + j <= len(doc):
                phrase = ' '.join([t.text.lower() for t in doc[i:i+j]])
                if phrase in simplification_dict and j > longest_length:
                    longest_match = simplification_dict[phrase]
                    longest_length = j

        # Also check the lemma form for verbs
        if token_lemma in simplification_dict and token.pos_ == "VERB":
            # If it's a verb, we need to preserve tense and person
            simple_form = simplification_dict[token_lemma]

            # Handle different verb forms
            if token.tag_ in ["VBD", "VBN"]:  # Past tense or participle
                if ' ' not in simple_form:  # Single word replacement
                    # Add -ed or use irregular form (would need a lookup table)
                    if simple_form.endswith('e'):
                        simplified_text.append(simple_form + 'd')
                    else:
                        simplified_text.append(simple_form + 'ed')
                else:
                    simplified_text.append(simple_form)  # Use as is for phrases
            elif token.tag_ == "VBG":  # Gerund/present participle
                if ' ' not in simple_form:
                    if simple_form.endswith('e'):
                        simplified_text.append(simple_form[:-1] + 'ing')
                    else:
                        simplified_text.append(simple_form + 'ing')
                else:
                    simplified_text.append(simple_form)
            elif token.tag_ == "VBZ":  # 3rd person singular present
                if ' ' not in simple_form:
                    simplified_text.append(simple_form + 's')
                else:
                    simplified_text.append(simple_form)
            else:
                simplified_text.append(simple_form)

            i += 1

        # Use the longest matching phrase if found
        elif longest_match:
            # Preserve capitalization of first word
            if doc[i].text[0].isupper():
                words = longest_match.split()
                if words:
                    words[0] = words[0].capitalize()
                    longest_match = ' '.join(words)

            simplified_text.append(longest_match)
            i += longest_length

        # Single word in dictionary
        elif token.text.lower() in simplification_dict:
            simple_word = simplification_dict[token.text.lower()]

            # Preserve capitalization
            if token.text[0].isupper():
                simple_word = simple_word.capitalize()

            simplified_text.append(simple_word)
            i += 1

        # Keep original if not in dictionary
        else:
            simplified_text.append(token.text)
            i += 1

    result = ' '.join(simplified_text)

    # Fix spacing around punctuation
    result = re.sub(r'\s+([.,;:!?)])', r'\1', result)
    result = re.sub(r'(\()\s+', r'\1', result)

    return result


In [None]:
df_terms = pd.read_csv('/content/drive/MyDrive/simplified_terms.csv')
simplification_dict = dict(zip(df_terms['Complex_Term'], df_terms['Simple_Alternative']))

# Load spaCy model
nlp = spacy.load("en_core_web_sm")