In [3]:
!pip install transformers==4.28.1 pandas numpy torch tqdm -q
print("✅ Packages installed")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import numpy as np
from google.colab import files
import io

uploaded = files.upload()
real_texts = []
for filename in uploaded.keys():
    with io.open(filename, 'r', encoding='utf-8') as f:
        real_texts.append(f.read())


if real_texts:
    df = pd.DataFrame({"text": real_texts})
    print("📁 Using uploaded files")
else:
    print("🧪 Using synthetic data")

    # Templates
    templates = {
        "positive": [
            "In {case}, the court ruled in favor of {party}, stating {reason}.",
            "The settlement agreement was approved, granting {relief} to {party}.",
            "The appellate court affirmed the judgment for {party} in {case}.",
            "{party} prevailed on all counts in the {court_type} decision.",
            "The jury awarded {party} {damages} in {case}.",
            "The motion for {relief} was granted to {party}."
        ],
        "negative": [
            "The court denied {party}'s motion to {motion} due to {reason}.",
            "{party} was found liable for {violation} and ordered to pay {penalty}.",
            "The Supreme Court reversed the decision in {case}, ruling against {party}.",
            "{party}'s appeal was dismissed with prejudice.",
            "The complaint filed by {party} was dismissed without prejudice.",
            "Summary judgment was entered against {party} in {case}."
        ],
        "neutral": [
            "The hearing regarding {case} was adjourned until {date}.",
            "Documents were filed in {case} concerning {matter}.",
            "The court ordered supplemental briefing on {issue}.",
            "Amicus briefs were submitted in {case} by {organization}.",
            "The {court_type} heard oral arguments in {case} today.",
            "The parties in {case} stipulated to extend deadlines."
        ]
    }


    placeholder_values = {
        "case": [f"Case No. 2024-{x}" for x in range(1000, 9999)],
        "party": ["the plaintiff", "the defendant", "the appellant", "the respondent"],
        "reason": ["clear precedent", "procedural error", "factual insufficiency", "legal grounds"],
        "relief": ["damages", "injunctive relief", "specific performance", "attorney's fees"],
        "court_type": ["district court", "circuit court", "bankruptcy court"],
        "damages": ["$100,000", "$1.2 million", "nominal damages"],
        "motion": ["dismiss", "suppress evidence", "summary judgment"],
        "violation": ["breach of contract", "negligence", "fraud"],
        "penalty": ["$5,000", "$250,000", "$1 million"],
        "date": [f"{d}/06/2024" for d in np.random.randint(1, 30, 10)],
        "matter": ["jurisdiction", "discovery", "standing"],
        "issue": ["constitutional interpretation", "statutory construction"],
        "organization": ["ACLU", "US Chamber of Commerce", "NAACP"]
    }


    np.random.seed(42)
    data = []
    for _ in range(500):
        sentiment = np.random.choice(["positive", "negative", "neutral"])
        template = np.random.choice(templates[sentiment])
        doc = template
        while "{" in doc:
            start = doc.find("{")
            end = doc.find("}")
            placeholder = doc[start+1:end]
            doc = doc[:start] + np.random.choice(placeholder_values[placeholder]) + doc[end+1:]
        data.append({"text": doc, "true_sentiment": sentiment})


    unique_texts = set()
    clean_data = []
    for d in data:
        text = d["text"]
        if len(text) > 50 and text not in unique_texts and any(c.isupper() for c in text):
            unique_texts.add(text)
            clean_data.append(d)

    df = pd.DataFrame(clean_data)
    print(f"✅ Generated {len(df)} legal documents (expanded dataset)")

🧪 Using synthetic data
✅ Generated 326 legal documents (expanded dataset)


In [3]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.config.pad_token_id = model.config.eos_token_id

print(f"🚀 Model moved to: {device}")
print("🚀 GPT-2 model loaded")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]



merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Model moved to: cuda
🚀 GPT-2 model loaded


In [4]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

df["label"] = df["true_sentiment"].map({"negative": 0, "neutral": 1, "positive": 2})
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_encodings = tokenize_function(train_df["text"].tolist())
test_encodings = tokenize_function(test_df["text"].tolist())

class LegalDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = LegalDataset(train_encodings, train_df["label"])
test_dataset = LegalDataset(test_encodings, test_df["label"])

# Step 5: Configure training (wandb disabled)
from transformers import Trainer, TrainingArguments
import os

os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=500,
    logging_steps=10,
    logging_dir="./logs",
    report_to="none"
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

print("🔥 Starting training (no wandb)...")
trainer.train()
print("✅ Training complete!")

🔥 Starting training (no wandb)...




Step,Training Loss
10,1.4162
20,0.7295
30,0.4748
40,0.563
50,0.5005
60,0.2629
70,0.4082
80,0.2963
90,0.0507
100,0.038


✅ Training complete!


In [5]:
def analyze_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits).item()
    return ["negative", "neutral", "positive"][prediction]

In [6]:
sample_text = "The court granted the plaintiff's motion for summary judgment."
print(f"\nSample analysis:\nText: {sample_text}\nSentiment: {analyze_sentiment(sample_text)}")

df["predicted_sentiment"] = df["text"].apply(analyze_sentiment)
print("\n📊 Sentiment Distribution:")
print(df["predicted_sentiment"].value_counts())


Sample analysis:
Text: The court granted the plaintiff's motion for summary judgment.
Sentiment: positive

📊 Sentiment Distribution:
predicted_sentiment
neutral     139
positive    107
negative     80
Name: count, dtype: int64
