In [3]:
!pip install rouge-score -q
!pip install networkx -q
!pip install sentencepiece



In [4]:
"""
ROUGE alignment script + Notebook (script-style) to generate training and inference datasets
for wb_script_summarizer.py on Kaggle.

This file includes two deliverables in one file:

1) rouge_align.py (module functions + CLI) - greedy ROUGE alignment to create sentence-level labels
   Usage: python rouge_align.py --script script.txt --reference reference.txt --top_k 3 --out script.labels.json

2) notebook-style pipeline (cells with # %%) that:
   - downloads required Kaggle datasets (Cornell, IMSDb scripts, OpenSubtitles) via slugs,
   - downloads CMU Movie Summary Corpus (Kaggle slug used as fallback),
   - downloads SAMSum via HuggingFace datasets,
   - runs alignment to produce .txt + .labels.json files compatible with wb_script_summarizer.py,
   - builds a simple vocab (simple_vocab.json) and shows a demo inference using wb_script_summarizer.

Notes:
- Requires: rouge-score, datasets, tqdm, transformers (optional), scikit-learn (optional for fallback), networkx/matplotlib (for graphing demo).
- On Kaggle: the `kaggle` CLI is available; you can also attach datasets in the UI and pass local dirs instead of slugs.

"""

# ----------------------
# Part A: ROUGE alignment utility (greedy selection)
# ----------------------

import argparse
import json
import os
import re
from typing import List, Tuple

try:
    from rouge_score import rouge_scorer
    ROUGE_AVAILABLE = True
except Exception:
    ROUGE_AVAILABLE = False


def sentence_split(text: str) -> List[str]:
    """Naive sentence splitter used for scripts and dialogues."""
    text = text.replace('\r', '\n')
    sents = re.split(r'(?<=[.!?\n])\s+', text)
    sents = [s.strip() for s in sents if s.strip()]
    return sents


def compute_rouge_gain(candidate: str, reference: str, scorer) -> float:
    # returns rouge-l fmeasure as the score
    scores = scorer.score(reference, candidate)
    # score returns dict with keys like 'rouge1','rougeL' etc., values with precision/recall/fmeasure
    return scores['rougeL'].fmeasure


def greedy_rouge_select(sentences: List[str], reference: str, k: int = 3) -> List[int]:
    """Greedy selection of up to k sentences that maximize ROUGE-L against the reference summary.
    Returns list of selected sentence indices.
    """
    if not ROUGE_AVAILABLE:
        raise RuntimeError('rouge_score package is required for greedy_rouge_select. Install via `pip install rouge-score`.')
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    selected = []
    cur_text = ''
    cur_score = 0.0
    for _ in range(k):
        best_gain = 0.0
        best_idx = None
        for i, s in enumerate(sentences):
            if i in selected:
                continue
            cand = (cur_text + ' ' + s).strip() if cur_text else s
            score = compute_rouge_gain(cand, reference, scorer)
            gain = score - cur_score
            if gain > best_gain:
                best_gain = gain
                best_idx = i
        if best_idx is None:
            break
        # update
        selected.append(best_idx)
        cur_text = (cur_text + ' ' + sentences[best_idx]).strip() if cur_text else sentences[best_idx]
        cur_score = compute_rouge_gain(cur_text, reference, scorer)
    return selected


def align_script_to_summary(script_text: str, reference_summary: str, top_k: int = 3) -> List[int]:
    sents = sentence_split(script_text)
    if len(sents) == 0:
        return []
    ids = greedy_rouge_select(sents, reference_summary, k=top_k)
    return ids


# CLI wrapper for single-file usage
def rouge_cli():
    parser = argparse.ArgumentParser(description='ROUGE-based greedy labeling for extractive summarization')
    parser.add_argument('--script', type=str, required=True, help='Path to script/text file to label')
    parser.add_argument('--reference', type=str, required=True, help='Path to reference summary (text file)')
    parser.add_argument('--top_k', type=int, default=3, help='Number of sentences to select')
    parser.add_argument('--out', type=str, default=None, help='Output .labels.json file path')
    args = parser.parse_args()

    script_text = open(args.script, 'r', encoding='utf-8').read()
    ref_text = open(args.reference, 'r', encoding='utf-8').read()
    sel = align_script_to_summary(script_text, ref_text, top_k=args.top_k)
    outp = args.out or args.script.replace('.txt', '.labels.json')
    with open(outp, 'w', encoding='utf-8') as fo:
        json.dump({'important_sentence_indices': sel}, fo)
    print('Wrote labels to', outp)


# ----------------------
# Part B: Notebook-style pipeline (cells with # %% separators)
# ----------------------

In [5]:
# Notebook: Full pipeline to produce training and inference datasets
# Save this file as `movie_pipeline_notebook.py` or paste cells into a Kaggle notebook.

In [6]:
# Cell 1: Install dependencies (run in Kaggle notebook cell)
INSTALL_CMD = '''
# If running on a new kernel, uncomment these
# !pip install --quiet rouge-score datasets transformers tqdm scikit-learn
'''
print('If you need dependencies, run the following in a notebook cell:')
print(INSTALL_CMD)

If you need dependencies, run the following in a notebook cell:

# If running on a new kernel, uncomment these
# !pip install --quiet rouge-score datasets transformers tqdm scikit-learn



In [7]:
# Cell 2: Imports
import subprocess
import shutil
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm

# local helpers from above

In [8]:
# Cell 3: Kaggle dataset slugs (pre-validated)
KAGGLE_CORPUS_SLUG = 'rajathmc/cornell-moviedialog-corpus'  # Cornell
KAGGLE_SCRIPTS_SLUG = 'veeralakrishna/imsdb-movie-scripts'  # IMSDb scripts
KAGGLE_SUBTITLES_SLUG = 'kaushikrahul/english-subtitles-opensubtitles-org'  # OpenSubtitles (English)
KAGGLE_CMU_SUMMARIES_SLUG = 'srikarmell/cmu-movie-summary-corpus'  # CMU Movie Summary (Kaggle mirror)

print('Using Kaggle slugs:')
print(KAGGLE_CORPUS_SLUG)
print(KAGGLE_SCRIPTS_SLUG)
print(KAGGLE_SUBTITLES_SLUG)
print(KAGGLE_CMU_SUMMARIES_SLUG)

Using Kaggle slugs:
rajathmc/cornell-moviedialog-corpus
veeralakrishna/imsdb-movie-scripts
kaushikrahul/english-subtitles-opensubtitles-org
srikarmell/cmu-movie-summary-corpus


In [12]:
# Cell 5: Prepare SAMSum (HuggingFace) as extractive-labeled dialogue data
print('Loading SAMSum via HuggingFace datasets...')
try:
    samsum = load_dataset('knkarthick/samsum')
except Exception as e:
    print('Failed to download samsum via datasets.load_dataset():', e)
    samsum = None

Loading SAMSum via HuggingFace datasets...


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

In [14]:
# Cell 6: Convert SAMSum to per-dialogue .txt and .labels.json using greedy ROUGE
if samsum is not None:
    samsum_out = OUT_DIR / 'samsum_prepared'
    samsum_out.mkdir(parents=True, exist_ok=True)
    from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    def make_labels_for_dialogue(dialogue: str, summary: str, top_k: int = 3):
        sents = sentence_split(dialogue)
        if len(sents) == 0:
            return []
        return greedy_rouge_select(sents, summary, k=top_k)

    for split in ['train', 'validation', 'test']:
        if split not in samsum:
            continue
        for i, ex in enumerate(tqdm(samsum[split])):
            dialogue = ex['dialogue']
            summary = ex['summary']
            # save dialogue as a text file (one line per utterance)
            fname = samsum_out / f'samsum_{split}_{i:06d}.txt'
            with open(fname, 'w', encoding='utf-8') as fo:
                fo.write(str(dialogue))
            # compute labels
            try:
                sel = make_labels_for_dialogue(dialogue, summary, top_k=3)
            except Exception as e:
                sel = []
            labp = str(fname).replace('.txt', '.labels.json')
            with open(labp, 'w', encoding='utf-8') as fo:
                json.dump({'important_sentence_indices': sel}, fo)
    print('SAMSum converted to', samsum_out)

100%|██████████| 14732/14732 [04:23<00:00, 55.86it/s]
100%|██████████| 818/818 [00:14<00:00, 57.03it/s]
100%|██████████| 819/819 [00:14<00:00, 55.33it/s]

SAMSum converted to /kaggle/working/preprocessed/samsum_prepared





In [17]:
# Cell 7: Prepare CMU Movie Summary corpus and align to IMSDb scripts (domain-specific labeling)
# Note: this step tries to match script filenames to CMU titles by fuzzy matching. It may not find matches for all scripts.

import glob
from difflib import SequenceMatcher

cmu_dir = Path("/kaggle/input/cmu-movie-summary-corpus")
cmu_dir_exists = cmu_dir.exists()
if cmu_dir_exists:
    # load CMU summaries (assumes CSV/TSV or text files in the downloaded folder)
    # Try to find a file named 'plot_summaries.txt' or similar
    possible = list(cmu_dir.rglob('*plot*')) + list(cmu_dir.rglob('*.csv')) + list(cmu_dir.rglob('*.tsv'))
    cmu_entries = []
    for p in possible:
        try:
            txt = p.read_text(encoding='utf-8', errors='ignore')
            # naive parsing: lines with tab-separated movieid \t plot
            for line in txt.splitlines():
                parts = line.split('\t')
                if len(parts) >= 2:
                    title = parts[0].strip()
                    plot = '\t'.join(parts[1:]).strip()
                    cmu_entries.append({'title': title, 'plot': plot})
        except Exception:
            continue
    print('Loaded', len(cmu_entries), 'CMU entries (approx)')
else:
    print('CMU summaries not found at', cmu_dir, "— if running on Kaggle, download the '" + KAGGLE_CMU_SUMMARIES_SLUG + "' dataset or provide a local CMU folder.")

# Now try to align to scripts in scripts_raw produced by preprocess_kaggle.py
scripts_raw_dir = Path("/kaggle/input/imsdb-movie-scripts")
if scripts_raw_dir.exists():
    out_align_dir = OUT_DIR / 'scripts_with_labels'
    out_align_dir.mkdir(parents=True, exist_ok=True)
    for script_path in scripts_raw_dir.glob('*.txt'):
        name = script_path.stem.lower()
        # find best matching CMU title via simple substring or sequence matcher
        best = None
        best_score = 0.0
        for e in cmu_entries:
            title = e['title'].lower()
            # prefer substring match
            if title and title in name:
                best = e
                best_score = 1.0
                break
            # else fuzzy
            seq = SequenceMatcher(None, title, name).ratio()
            if seq > best_score:
                best_score = seq
                best = e
        if best and best_score > 0.6:
            # align using greedy ROUGE
            script_text = script_path.read_text(encoding='utf-8', errors='ignore')
            try:
                sel = align_script_to_summary(script_text, best['plot'], top_k=5)
            except Exception:
                sel = []
            dest = out_align_dir / script_path.name
            shutil.copy(script_path, dest)
            labp = str(dest).replace('.txt', '.labels.json')
            with open(labp, 'w', encoding='utf-8') as fo:
                json.dump({'important_sentence_indices': sel}, fo)
    print('Aligned scripts saved to', out_align_dir)
else:
    print('No scripts_raw dir found at', scripts_raw_dir, '— run preprocess_kaggle.py first to populate it.')

Loaded 124047 CMU entries (approx)
Aligned scripts saved to /kaggle/working/preprocessed/scripts_with_labels


In [20]:
# Cell 8: Build simple vocab from produced datasets (samsum_prepared + scripts_with_labels + cornell convs)
from collections import Counter
import re

vocab_out = OUT_DIR / 'simple_vocab.json'
pat = re.compile(r"\w+|[^\s\w]")

sources = []
if (OUT_DIR / 'samsum_prepared').exists():
    sources.append(OUT_DIR / 'samsum_prepared')
if (OUT_DIR / 'scripts_with_labels').exists():
    sources.append(OUT_DIR / 'scripts_with_labels')
if (OUT_DIR / 'cornell_conversations').exists():
    sources.append(OUT_DIR / 'cornell_conversations')

counter = Counter()
for d in sources:
    for p in d.glob('*.txt'):
        txt = p.read_text(encoding='utf-8', errors='ignore')
        toks = pat.findall(txt)
        counter.update(toks)

vocab = {'<unk>': 0}
next_id = 1
for tok, _ in counter.most_common():
    vocab[tok] = next_id
    next_id += 1
with open(vocab_out, 'w', encoding='utf-8') as fo:
    json.dump(vocab, fo)
print('Saved vocab to', vocab_out, 'size=', len(vocab))

Saved vocab to /kaggle/working/preprocessed/simple_vocab.json size= 37649


In [21]:
# Cell 9: Quick demo - run infer_summary on a SAMSum example using wb_script_summarizer (random-initialized model)
# Assumes wb_script_summarizer.py is in working dir
try:
    import script_summarizer as wbs
    # load vocab
    tok = wbs.SimpleTokenizer(vocab=vocab)
    # tiny model for demo
    model = wbs.SummarizerModel(vocab_size=len(vocab), d_model=128, num_layers=2, nhead=4)
    # pick one samsum file if exists
    samsum_dir = OUT_DIR / 'samsum_prepared'
    if samsum_dir.exists():
        sample = next(samsum_dir.glob('samsum_train_*.txt'))
        text = sample.read_text(encoding='utf-8')
        summary, sentences, scores, attn_maps = wbs.infer_summary(model, tok, text, top_k=3, device='cpu')
        print('INFERRED SUMMARY:\n', summary)
    else:
        print('No samsum_prepared found; run earlier cells to generate it.')
except Exception as e:
    print('Demo failed (is wb_script_summarizer.py present?):', e)

INFERRED SUMMARY:
 Burton: Did u manage to get some money? I feel so guilty to lie to my mum😢
Alfio: I told her I need new reference book at school😢
Burton: Oh no.....(ToT)/~~~
Alfio: But now I can get the concert ticket. 👋
Alfio: I dont wanna think about anything else for now	👋


In [22]:
# End of notebook
print('Pipeline cells prepared. On Kaggle: run each cell sequentially (or paste into a notebook) to produce training and inference datasets.')

Pipeline cells prepared. On Kaggle: run each cell sequentially (or paste into a notebook) to produce training and inference datasets.
