In [1]:
# ==============================
# 1. Install & Import Libraries
# ==============================
!pip install -U transformers datasets evaluate accelerate sentencepiece
!pip install rouge_score
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorForSeq2Seq
import evaluate
import torch



In [None]:
# ==============================
# 2. Load Dataset
# ==============================

import os
DATA_PATH = os.path.abspath(os.path.join(os.pardir, 'data', 'googleplaystore_user_reviews.csv'))
print('Loading dataset from:', DATA_PATH)

df = pd.read_csv(DATA_PATH)

# Keep only needed columns (App, Review, Sentiment)
df = df[["App", "Translated_Review", "Sentiment"]]
df = df.dropna().reset_index(drop=True)

print("Sample data:\n", df.head())


Loading dataset from: /Users/bawantharathnayake/Desktop/Academic/semester 7/advanced ai/AI-Project/Customer-Review-Summarizer-for-Mobile-Apps-using-LLMs-Project/data/googleplaystore_user_reviews.csv
Sample data:
                      App                                  Translated_Review  \
0  10 Best Foods for You  I like eat delicious food. That's I'm cooking ...   
1  10 Best Foods for You    This help eating healthy exercise regular basis   
2  10 Best Foods for You         Works great especially going grocery store   
3  10 Best Foods for You                                       Best idea us   
4  10 Best Foods for You                                           Best way   

  Sentiment  
0  Positive  
1  Positive  
2  Positive  
3  Positive  
4  Positive  


In [3]:
# ==============================
# 3. Preprocessing (enhanced cleaning)
# ==============================
# If dataset not loaded (cell 2 skipped), load it here safely
import os, re
try:
    df
except NameError:
    DATA_PATH = os.path.abspath(os.path.join(os.pardir, 'data', 'googleplaystore_user_reviews.csv'))
    print('(Re)loading dataset inside preprocessing from:', DATA_PATH)
    import pandas as pd
    df = pd.read_csv(DATA_PATH)
    df = df[["App", "Translated_Review", "Sentiment"]].dropna().reset_index(drop=True)

print(f"Initial rows: {len(df)}")

# Basic text cleaning function
_clean_url_pattern = re.compile(r'https?://\S+|www\.\S+')
_non_alpha_pattern = re.compile(r'[^a-z\s]')
_multi_space_pattern = re.compile(r'\s+')

def clean_text(text: str) -> str:
    text = str(text).lower().strip()
    text = _clean_url_pattern.sub(' ', text)
    # Keep only letters + space
    text = _non_alpha_pattern.sub(' ', text)
    # Collapse repeated chars (e.g., cooooool -> coool) simple heuristic
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = _multi_space_pattern.sub(' ', text).strip()
    return text

# Backup raw review
if 'raw_review' not in df.columns:
    df['raw_review'] = df['Translated_Review']

# Apply cleaning
df['Translated_Review'] = df['Translated_Review'].apply(clean_text)

# Drop empty / very short after cleaning
before_len = len(df)
min_len = 10
mask = df['Translated_Review'].str.len() >= min_len
removed_short = before_len - mask.sum()
df = df[mask]

# Remove duplicates on cleaned text
before_dedup = len(df)
df = df.drop_duplicates(subset=['Translated_Review'])
removed_dup = before_dedup - len(df)

print(f"Removed short (<{min_len} chars): {removed_short}")
print(f"Removed duplicates: {removed_dup}")
print(f"Remaining rows: {len(df)}")

# Rebuild summaries from cleaned text (using sentiment)
def build_summary(row):
    txt = row['Translated_Review']
    if row['Sentiment'] == 'Positive':
        return f"Positive feedback: {txt}"
    elif row['Sentiment'] == 'Negative':
        return f"Problem: {txt}"
    else:
        return f"Neutral comment: {txt}"

df['summary'] = df.apply(build_summary, axis=1)

# Create HuggingFace dataset
from datasets import Dataset, DatasetDict

dataset = Dataset.from_pandas(df[['Translated_Review', 'summary']])

# Train/validation split
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': dataset['train'].shuffle(seed=42),
    'validation': dataset['test'].shuffle(seed=42)
})

print(dataset)

Initial rows: 37427
Removed short (<10 chars): 3203
Removed duplicates: 7435
Remaining rows: 26789
DatasetDict({
    train: Dataset({
        features: ['Translated_Review', 'summary', '__index_level_0__'],
        num_rows: 21431
    })
    validation: Dataset({
        features: ['Translated_Review', 'summary', '__index_level_0__'],
        num_rows: 5358
    })
})


In [8]:
# ==============================
# 4. Tokenization
# ==============================
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

max_input_length = 256
max_target_length = 64

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["Translated_Review"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True, padding="max_length")
    label_ids = labels["input_ids"]
    # Replace pad token ids in labels with -100
    pad_token_id = tokenizer.pad_token_id
    label_ids = [[(lid if lid != pad_token_id else -100) for lid in seq] for seq in label_ids]
    model_inputs["labels"] = label_ids
    return model_inputs

# (Optional) Subsample for quick experiment
#small_train = dataset["train"].select(range(512)) if dataset["train"].num_rows > 512 else dataset["train"]
#small_val = dataset["validation"].select(range(128)) if dataset["validation"].num_rows > 128 else dataset["validation"]
big_train = dataset["train"]
big_val = dataset["validation"]

working_dataset = DatasetDict({"train": big_train, "validation": big_val})

tokenized_dataset = working_dataset.map(preprocess_function, batched=True, remove_columns=["Translated_Review", "summary"])

Map:   0%|          | 0/21431 [00:00<?, ? examples/s]



Map:   0%|          | 0/5358 [00:00<?, ? examples/s]

In [9]:

# ==============================
# 5. Load Model
# ==============================
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [19]:
dataset["train"].num_rows


21431

In [20]:
dataset["validation"].num_rows

5358

In [None]:
# ==============================
# 7. Training Setup (3 epochs, no manual early stopping)
# ==============================
import transformers
print('Transformers version:', transformers.__version__)
from inspect import signature
print('TrainingArguments init params:', list(signature(TrainingArguments.__init__).parameters.keys()))

from transformers import set_seed
set_seed(42)

OUTPUT_DIR = './t5_finetuned_reviews_model'

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    save_strategy="no",  
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8, 
    dataloader_drop_last=True,
    label_smoothing_factor=0.1,
    num_train_epochs=5,  
    weight_decay=0.01,
    max_grad_norm=1.0,
    optim="adamw_torch",
    save_total_limit=1,
    load_best_model_at_end=False,
    logging_strategy='steps',
    logging_dir='./logs',
    logging_steps=200,
    report_to=[],
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ==============================
# 8. Train (no manual loop)
# ==============================
import math, os

train_output = trainer.train()
final_train_loss = getattr(train_output, 'training_loss', None)
if final_train_loss is not None and not math.isnan(final_train_loss):
    print(f"Final training loss (from Trainer): {final_train_loss:.4f}")

# Also print the last logged training loss from step logs
last_step_loss = None
for entry in reversed(getattr(trainer.state, 'log_history', [])):
    if 'loss' in entry:
        last_step_loss = entry['loss']
        break
if last_step_loss is not None:
    print(f"Last logged training loss (step): {last_step_loss:.4f}")

# Evaluate once after training
eval_output = trainer.evaluate()
val_loss = float(eval_output.get('eval_loss', float('nan')))
if not math.isnan(val_loss):
    print(f"Validation loss after training: {val_loss:.4f}")

# Save model and tokenizer
os.makedirs(OUTPUT_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved model and tokenizer to {OUTPUT_DIR}")

Transformers version: 4.56.1
TrainingArguments init params: ['self', 'output_dir', 'overwrite_output_dir', 'do_train', 'do_eval', 'do_predict', 'eval_strategy', 'prediction_loss_only', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'per_gpu_train_batch_size', 'per_gpu_eval_batch_size', 'gradient_accumulation_steps', 'eval_accumulation_steps', 'eval_delay', 'torch_empty_cache_steps', 'learning_rate', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'max_grad_norm', 'num_train_epochs', 'max_steps', 'lr_scheduler_type', 'lr_scheduler_kwargs', 'warmup_ratio', 'warmup_steps', 'log_level', 'log_level_replica', 'log_on_each_node', 'logging_dir', 'logging_strategy', 'logging_first_step', 'logging_steps', 'logging_nan_inf_filter', 'save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_only_model', 'restore_callback_states_from_checkpoint', 'no_cuda', 'use_cpu', 'use_mps_device', 'seed', 'data_seed', 'jit_mode_eval', 'use_ipex',

  trainer = Trainer(


Step,Training Loss
50,2.9824
100,2.9351
150,2.8436
200,2.6622
250,2.4249
300,2.1813
350,2.0318
400,1.9572
450,1.9048
500,1.8795


Final training loss: 1.8061


Validation loss after training: 1.6801
Saved model and tokenizer to ./t5_finetuned_reviews
Saved model and tokenizer to ./t5_finetuned_reviews


In [None]:
# ==============================
# 9. Training Loss Summary (recent steps and per-epoch means)
# ==============================
from pathlib import Path
import json
import re


try:
    logs = getattr(trainer.state, 'log_history', [])
    train_step_losses = [x for x in logs if 'loss' in x and 'epoch' in x]
    # Print last 10 step losses
    print("Last 10 training loss logs:")
    for row in train_step_losses[-10:]:
        step = row.get('step')
        loss = row.get('loss')
        epoch = row.get('epoch')
        print(f"step={step:>6}  epoch={epoch:>4.1f}  loss={loss:.4f}")

    # Compute per-epoch mean training loss from step logs
    from collections import defaultdict
    buckets = defaultdict(list)
    for r in train_step_losses:
        ep = f"{float(r.get('epoch', 0)):.0f}"
        buckets[ep].append(float(r['loss']))
    if buckets:
        print("\nPer-epoch mean training loss (approx from step logs):")
        for ep in sorted(buckets, key=lambda k: int(k)):
            vals = buckets[ep]
            print(f"epoch {ep}: mean loss {sum(vals)/len(vals):.4f} (n={len(vals)})")
except Exception as e:
    print("Could not read trainer logs:", e)

In [32]:
# ==============================
# 10. App-level Summarization (one summary per sentiment)
# ==============================
import os
import re as _re
import torch

# Use this saved model by default
DEFAULT_SAVED_MODEL_DIR = \
    "/Users/bawantharathnayake/Desktop/Academic/semester 7/advanced ai/AI-Project/Customer-Review-Summarizer-for-Mobile-Apps-using-LLMs-Project/notebook/t5_finetuned_reviews"

# Map dataset sentiments to the prefixes used during training
_SENTIMENT_PREFIX = {
    "Positive": "Positive feedback:",
    "Negative": "Problem:",
    "Neutral":  "Neutral comment:",
}

# Decide which model/tokenizer to use (auto-load saved model if available and not loaded yet)
def _get_handles():
    global val_model, val_tokenizer
    if 'val_model' in globals() and 'val_tokenizer' in globals():
        mdl = val_model
        tok = val_tokenizer
    else:
        if os.path.isdir(DEFAULT_SAVED_MODEL_DIR):
            from transformers import T5Tokenizer, T5ForConditionalGeneration
            tok = T5Tokenizer.from_pretrained(DEFAULT_SAVED_MODEL_DIR)
            mdl = T5ForConditionalGeneration.from_pretrained(DEFAULT_SAVED_MODEL_DIR)
            device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
            mdl.to(device)
            val_model = mdl
            val_tokenizer = tok
        else:
            mdl = model
            tok = tokenizer
    dev = next(mdl.parameters()).device
    return mdl, tok, dev

# Generate a summary with an optional forced prefix; strip the prefix for display
def _strip_prefix(txt: str) -> str:
    return _re.sub(r"^(Positive feedback:|Problem:|Neutral comment:)\s*", "", str(txt)).strip()


def _generate_prefixed(text: str, forced_prefix: str | None = None, max_length: int = 60, num_beams: int = 4,
                       strip_prefix: bool = True) -> str:
    mdl, tok, dev = _get_handles()
    enc = tok("summarize: " + text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    enc = {k: v.to(dev) for k, v in enc.items()}

    out_text = None

    # If we have the helper from Cell 13 and a saved model, use it
    if forced_prefix and 'generate_with_forced_prefix' in globals() and 'val_model' in globals():
        try:
            out_text, _, _ = generate_with_forced_prefix(text, prefixes=[forced_prefix], max_length=max_length, num_beams=num_beams)
        except Exception:
            out_text = None

    if out_text is None:
        # Fallback: seed decoder with the forced prefix (if provided), else plain generate
        gen_kwargs = dict(max_length=max_length, num_beams=num_beams, early_stopping=True)
        if forced_prefix:
            dec_ids = tok(forced_prefix, return_tensors='pt', add_special_tokens=False).input_ids.to(dev)
            gen_kwargs["decoder_input_ids"] = dec_ids
        try:
            with torch.no_grad():
                outs = mdl.generate(**enc, **gen_kwargs)
        except RuntimeError as e:
            # MPS fallback
            if dev.type == 'mps':
                enc_cpu = {k: v.to('cpu') for k, v in enc.items()}
                with torch.no_grad():
                    outs = mdl.to('cpu').generate(**enc_cpu, **gen_kwargs)
                mdl.to(dev)
            else:
                raise
        out_text = tok.decode(outs[0], skip_special_tokens=True)

    return _strip_prefix(out_text) if strip_prefix else out_text


def summarize_app_by_sentiment(app_name: str, per_group_limit: int = 50, strip_prefix: bool = True) -> dict:
    """Return one summary per sentiment for the given app.
    per_group_limit: max number of reviews per sentiment to include (to keep prompt length reasonable).
    """
    assert 'df' in globals(), "Dataframe 'df' not found; run data loading cells first."

    app_reviews = df[df["App"] == app_name][["Translated_Review", "Sentiment"]].dropna()
    results = {}
    for sent in ["Positive", "Negative", "Neutral"]:
        subset = app_reviews[app_reviews["Sentiment"] == sent]["Translated_Review"].astype(str).tolist()
        if not subset:
            results[sent] = ""
            continue
        # Take up to per_group_limit reviews, join with separators
        texts = subset[:per_group_limit]
        joined = " \n- ".join(texts)  # lightweight structuring
        forced = _SENTIMENT_PREFIX[sent]
        summary = _generate_prefixed(joined, forced_prefix=forced, max_length=80, num_beams=4, strip_prefix=strip_prefix)
        results[sent] = summary
    return results

# Example usage
app_name = "10 Best Foods for You"
summaries = summarize_app_by_sentiment(app_name, per_group_limit=50, strip_prefix=True)
print(f"=== One summary per sentiment for {app_name} ===\n")
print("Positive Summary:\n", summaries.get("Positive", ""), "\n")
print("Negative Summary:\n", summaries.get("Negative", ""), "\n")
print("Neutral Summary:\n", summaries.get("Neutral", ""), "\n")

=== One summary per sentiment for 10 Best Foods for You ===

Positive Summary:
 i like eat delicious food that s i m cooking food myself case best foods helps lot also best before shelf life - this help eating healthy exercise regular basis - works great especially going grocery store - best idea us - useful information the amount spelling errors questions validity information shared once fixed stars given - thank you great app add arthritis eyes immunity kidney 

Negative Summary:
 no recipe book unable recipe book - waste time it needs internet time n ask calls information - faltu plz waste ur time - crap doesn t work - boring i thought actually just texts that s it too poor old texts 

Neutral Summary:
 looking forward app - it helpful site it help foods get - god health - i found lot wealth form health - this helpful - doesn t work zero - this starr download - i like was helpful 



In [27]:
# ==============================
# 11. Validate a Saved Model (eval loss + sample generations)
# ==============================
import os
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

SAVED_MODEL_DIR = "/Users/bawantharathnayake/Desktop/Academic/semester 7/advanced ai/AI-Project/Customer-Review-Summarizer-for-Mobile-Apps-using-LLMs-Project/notebook/t5_finetuned_reviews"
print("Loading saved model from:", SAVED_MODEL_DIR)

# Load tokenizer/model
val_tokenizer = T5Tokenizer.from_pretrained(SAVED_MODEL_DIR)
val_model = T5ForConditionalGeneration.from_pretrained(SAVED_MODEL_DIR)

# Move to device
_device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
val_model.to(_device)
print("Using device:", _device)

# Data collator and eval-only Trainer
val_data_collator = DataCollatorForSeq2Seq(tokenizer=val_tokenizer, model=val_model)

val_args = TrainingArguments(
    output_dir="./_tmp_eval_saved_model",
    per_device_eval_batch_size=8,
    dataloader_drop_last=False,
    report_to=[],
    logging_dir="./logs",
    logging_strategy="no",
)

val_trainer = Trainer(
    model=val_model,
    args=val_args,
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=val_tokenizer,
    data_collator=val_data_collator,
)

# Evaluate on validation split
eval_metrics = val_trainer.evaluate()
print("\n=== Saved Model Evaluation ===")
print({k: float(v) for k, v in eval_metrics.items() if isinstance(v, (int, float))})
if "eval_loss" in eval_metrics:
    print(f"eval_loss: {float(eval_metrics['eval_loss']):.4f}")

# Qualitative check: generate summaries for a few validation samples (with MPS->CPU fallback)
print("\n=== Sample Generations (saved model) ===")
try:
    src_ds = working_dataset["validation"] if 'working_dataset' in globals() else dataset["validation"]
    n = min(5, src_ds.num_rows)
    samples = src_ds.select(range(n))
    for i in range(n):
        text = samples[i]["Translated_Review"]
        ref = samples[i]["summary"]
        inputs = val_tokenizer("summarize: " + text, return_tensors="pt", truncation=True, padding=True, max_length=256)
        try:
            inputs = inputs.to(_device)
            with torch.no_grad():
                outputs = val_model.generate(
                    **inputs,
                    max_length=30,
                    min_length=5,
                    num_beams=4,
                    early_stopping=True,
                )
        except RuntimeError as e:
            if "Placeholder storage has not been allocated on MPS device" in str(e) or (_device.type == 'mps'):
                print("[Info] MPS generate failed; retrying on CPU for this sample...")
                val_model_cpu = val_model.to('cpu')
                inputs_cpu = {k: v.to('cpu') for k, v in inputs.items()}
                with torch.no_grad():
                    outputs = val_model_cpu.generate(
                        **inputs_cpu,
                        max_length=30,
                        min_length=5,
                        num_beams=4,
                        early_stopping=True,
                    )
            else:
                raise
        pred = val_tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"\n[{i+1}] Original : {text}")
        print(f"    Reference: {ref}")
        print(f"    Predicted: {pred}")
except Exception as e:
    print("(Skipping sample generations) Reason:", e)

Loading saved model from: /Users/bawantharathnayake/Desktop/Academic/semester 7/advanced ai/AI-Project/Customer-Review-Summarizer-for-Mobile-Apps-using-LLMs-Project/notebook/t5_finetuned_reviews
Using device: mps


  val_trainer = Trainer(



=== Saved Model Evaluation ===
{'eval_loss': 0.08040162175893784, 'eval_model_preparation_time': 0.0014, 'eval_runtime': 78.2963, 'eval_samples_per_second': 68.432, 'eval_steps_per_second': 8.557}
eval_loss: 0.0804

=== Sample Generations (saved model) ===

[1] Original : great always however bug latest even sync launch selected wakes says syncing notification bar never syncs forever says syncing the option force close several times day for i must alternative
    Reference: Positive feedback: great always however bug latest even sync launch selected wakes says syncing notification bar never syncs forever says syncing the option force close several times day for i must alternative
    Predicted: Positive feedback: great always however bug latest even sync launch selected wakes says syncing notification bar never syncs forever says

[2] Original : i love playing game like many others said hard get gems without spending money makes frustrating there way could get gems easily trade coins 

In [None]:
# ==============================
# 12. Saved Model Metrics: ROUGE + Sentiment-Prefix Accuracy
# ==============================
import math
import evaluate as _eval
from tqdm.auto import tqdm

rouge = _eval.load("rouge")

# Use the same device/_device, tokenizer (val_tokenizer), and model (val_model) from the previous cell.
# Generate on a subset for speed; increase MAX_SAMPLES for fuller evaluation.
MAX_SAMPLES = 512  # adjust as needed
BATCH_SIZE = 8

src_ds = working_dataset["validation"] if 'working_dataset' in globals() else dataset["validation"]
N = src_ds.num_rows
M = min(MAX_SAMPLES, N)
samples = src_ds.select(range(M))

preds = []
refs = []

def _move_to(d, device):
    return {k: v.to(device) for k, v in d.items()}

for start in tqdm(range(0, M, BATCH_SIZE)):
    end = min(start + BATCH_SIZE, M)
    batch = samples.select(range(start, end))
    texts = [b["Translated_Review"] for b in batch]
    ref_texts = [b["summary"] for b in batch]
    inputs = val_tokenizer([
        "summarize: " + t for t in texts
    ], return_tensors="pt", truncation=True, padding=True, max_length=256)
    try:
        with torch.no_grad():
            outs = val_model.generate(
                **_move_to(inputs, _device),
                max_length=30,
                min_length=5,
                num_beams=4,
                early_stopping=True,
            )
    except RuntimeError as e:
        # MPS fallback per-batch
        print("[Info] Generation failed on", _device, "; retrying batch on CPU...")
        with torch.no_grad():
            outs = val_model.to('cpu').generate(
                **_move_to(inputs, 'cpu'),
                max_length=30,
                min_length=5,
                num_beams=4,
                early_stopping=True,
            )
        val_model.to(_device)
    batch_preds = val_tokenizer.batch_decode(outs, skip_special_tokens=True)
    preds.extend(batch_preds)
    refs.extend(ref_texts)

# ROUGE
rouge_res = rouge.compute(
    predictions=["\n".join(p.split()) for p in preds],
    references=["\n".join(r.split()) for r in refs],
)
print("\n=== ROUGE (saved model) ===")
for k in ["rouge1", "rouge2", "rougeL", "rougeLsum"]:
    v = rouge_res.get(k)
    if isinstance(v, dict) and 'mid' in v and hasattr(v['mid'], 'fmeasure'):
        score = v['mid'].fmeasure
    elif hasattr(v, 'mid') and hasattr(v.mid, 'fmeasure'):
        score = v.mid.fmeasure
    else:
        score = float(v)
    print(f"{k}: {score:.4f}")



  0%|          | 0/64 [00:00<?, ?it/s]


=== ROUGE (saved model) ===
rouge1: 0.8925
rouge2: 0.8858
rougeL: 0.8921
rougeLsum: 0.8919


In [38]:
# ==============================
# 12b. Sentiment F1 Scores (per-class + micro/macro/weighted)
# ==============================
import re as _re

# Ensure we have predictions and references from the previous metrics cell
assert 'preds' in globals() and 'refs' in globals() and len(preds) == len(refs) and len(refs) > 0, \
    "Run the 'Saved Model Metrics' cell first to populate preds/refs."

_allowed_prefixes = ["Positive feedback:", "Problem:", "Neutral comment:"]
_label_map = {p: i for i, p in enumerate(_allowed_prefixes)}

def _prefix(text: str) -> str:
    m = _re.match(r"^(Positive feedback:|Problem:|Neutral comment:)", str(text).strip())
    return m.group(1) if m else ""

# Convert preds/refs into label ids
y_true = []
y_pred = []
for p, r in zip(preds, refs):
    y_pred.append(_label_map.get(_prefix(p), -1))
    y_true.append(_label_map.get(_prefix(r), -1))

# Filter out any items where the reference label is unknown (shouldn't happen)
filtered = [(yp, yt) for yp, yt in zip(y_pred, y_true) if yt != -1]
if not filtered:
    raise RuntimeError("No valid references for F1 computation.")
y_pred, y_true = zip(*filtered)

print("\n=== Sentiment F1 Scores (prefix-based) ===")

try:
    from sklearn.metrics import classification_report, f1_score
    target_names = ["Positive", "Negative", "Neutral"]
    # Remap label ids to contiguous range [0..2]
    # (They already are, but keep explicit.)
    print(classification_report(y_true, y_pred, labels=[0,1,2], target_names=target_names, digits=4))
    print("micro F1:", f1_score(y_true, y_pred, average='micro'))
    print("macro F1:", f1_score(y_true, y_pred, average='macro'))
    print("weighted F1:", f1_score(y_true, y_pred, average='weighted'))
except Exception as e:
    # Minimal pure-Python fallback for F1 per class and macro/micro
    from collections import Counter
    import math

    labels = [0,1,2]
    # Confusion counts
    cm = {l: {m: 0 for m in labels} for l in labels}
    for yt, yp in zip(y_true, y_pred):
        if yt in labels and yp in labels:
            cm[yt][yp] += 1

    def precision_recall_f1(tp, fp, fn):
        prec = tp / (tp + fp) if (tp + fp) else 0.0
        rec = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = 2*prec*rec / (prec + rec) if (prec + rec) else 0.0
        return prec, rec, f1

    per_class_f1 = {}
    micro_tp = micro_fp = micro_fn = 0

    for c in labels:
        tp = cm[c][c]
        fp = sum(cm[r][c] for r in labels if r != c)
        fn = sum(cm[c][r] for r in labels if r != c)
        prec, rec, f1 = precision_recall_f1(tp, fp, fn)
        per_class_f1[c] = f1
        micro_tp += tp
        micro_fp += fp
        micro_fn += fn
        name = ["Positive", "Negative", "Neutral"][c]
        print(f"{name:8s} F1: {f1:.4f} (P={prec:.4f}, R={rec:.4f})")

    macro_f1 = sum(per_class_f1.values()) / len(labels)
    # Micro F1 equals micro precision/recall in multiclass
    micro_prec, micro_rec, micro_f1 = precision_recall_f1(micro_tp, micro_fp, micro_fn)

    # Weighted F1 by support
    supports = Counter(y_true)
    total = sum(supports[l] for l in labels)
    weighted_f1 = sum(per_class_f1[l] * (supports[l] / total) for l in labels)

    print(f"macro F1:   {macro_f1:.4f}")
    print(f"micro F1:   {micro_f1:.4f}")
    print(f"weighted F1:{weighted_f1:.4f}")
    print("(Note: using fallback implementation; install scikit-learn for a detailed report.)")


=== Sentiment F1 Scores (prefix-based) ===
              precision    recall  f1-score   support

    Positive     0.8278    0.9255    0.8739       322
    Negative     0.7300    0.5984    0.6577       122
     Neutral     0.6923    0.5294    0.6000        68

    accuracy                         0.7949       512
   macro avg     0.7500    0.6844    0.7105       512
weighted avg     0.7865    0.7949    0.7860       512

micro F1: 0.794921875
macro F1: 0.71051931697093
weighted F1: 0.7859963450437241


In [40]:
# ==============================
# 12c. Lightweight hyperparameter search for generation
# ==============================
import itertools
import math
from collections import Counter
from tqdm.auto import tqdm

# Use saved model/tokenizer and device from previous cells: val_model, val_tokenizer, _device
assert 'val_model' in globals() and 'val_tokenizer' in globals() and '_device' in globals(), \
    "Load the saved model first (run cell 11)."

# Reuse prefix extractor from earlier cells
import re as _re

def _prefix(text: str) -> str:
    m = _re.match(r"^(Positive feedback:|Problem:|Neutral comment:)", str(text).strip())
    return m.group(1) if m else ""

label_to_id = {"Positive feedback:": 0, "Problem:": 1, "Neutral comment:": 2}

# Small validation slice for quick tuning
TUNE_MAX_SAMPLES = 96  # keep small for speed; increase if needed
BATCH_SIZE = 8
src_ds = working_dataset["validation"] if 'working_dataset' in globals() else dataset["validation"]
N = src_ds.num_rows
M = min(TUNE_MAX_SAMPLES, N)
samples = src_ds.select(range(M))

# Search space (kept small to be fast)
search_space = []
# Beam variants
for num_beams in [1, 4]:
    for max_len in [48, 64]:
        for min_len in [5, 10]:
            for rep_pen in [1.0, 1.15]:
                search_space.append({
                    "strategy": "beam",
                    "num_beams": num_beams,
                    "max_length": max_len,
                    "min_length": min_len,
                    "length_penalty": 1.0,
                    "repetition_penalty": rep_pen,
                })
# Nucleus sampling variant
for max_len in [48, 64]:
    search_space.append({
        "strategy": "nucleus",
        "top_p": 0.9,
        "top_k": 50,
        "temperature": 1.0,
        "max_length": max_len,
        "min_length": 5,
        "repetition_penalty": 1.05,
    })

# Metric helpers
import evaluate as _eval
_rouge = _eval.load("rouge")

def evaluate_config(cfg):
    preds = []
    refs = []
    for start in range(0, M, BATCH_SIZE):
        end = min(start + BATCH_SIZE, M)
        batch = samples.select(range(start, end))
        texts = [b["Translated_Review"] for b in batch]
        ref_texts = [b["summary"] for b in batch]
        enc = val_tokenizer([
            "summarize: " + t for t in texts
        ], return_tensors="pt", truncation=True, padding=True, max_length=256)
        gen_kwargs = dict(max_length=cfg["max_length"], min_length=cfg["min_length"], repetition_penalty=cfg.get("repetition_penalty", 1.0))
        if cfg["strategy"] == "beam":
            gen_kwargs.update(dict(num_beams=cfg["num_beams"], early_stopping=True))
        else:
            gen_kwargs.update(dict(do_sample=True, top_p=cfg["top_p"], top_k=cfg["top_k"], temperature=cfg["temperature"]))
        try:
            with torch.no_grad():
                outs = val_model.generate(**{k: v.to(_device) for k, v in enc.items()}, **gen_kwargs)
        except RuntimeError as e:
            # MPS fallback
            if _device.type == 'mps':
                with torch.no_grad():
                    outs = val_model.to('cpu').generate(**{k: v.to('cpu') for k, v in enc.items()}, **gen_kwargs)
                val_model.to(_device)
            else:
                raise
        batch_preds = val_tokenizer.batch_decode(outs, skip_special_tokens=True)
        preds.extend(batch_preds)
        refs.extend(ref_texts)

    # ROUGE-L as primary text metric
    rouge_res = _rouge.compute(
        predictions=["\n".join(p.split()) for p in preds],
        references=["\n".join(r.split()) for r in refs],
    )
    # Normalize to float
    def _rouge_to_float(v):
        if isinstance(v, dict) and 'mid' in v and hasattr(v['mid'], 'fmeasure'):
            return float(v['mid'].fmeasure)
        if hasattr(v, 'mid') and hasattr(v.mid, 'fmeasure'):
            return float(v.mid.fmeasure)
        return float(v)
    rougeL = _rouge_to_float(rouge_res.get("rougeL"))

    # Macro F1 over sentiment prefix labels
    y_true = [label_to_id.get(_prefix(r), -1) for r in refs]
    y_pred = [label_to_id.get(_prefix(p), -1) for p in preds]
    pairs = [(yt, yp) for yt, yp in zip(y_true, y_pred) if yt != -1]
    if not pairs:
        macro_f1 = 0.0
    else:
        labels = [0,1,2]
        cm = {l: {m: 0 for m in labels} for l in labels}
        for yt, yp in pairs:
            if yp in labels:
                cm[yt][yp] += 1
        def prf(tp, fp, fn):
            p = tp/(tp+fp) if (tp+fp) else 0.0
            r = tp/(tp+fn) if (tp+fn) else 0.0
            f1 = 2*p*r/(p+r) if (p+r) else 0.0
            return p, r, f1
        f1s = []
        for c in labels:
            tp = cm[c][c]
            fp = sum(cm[r][c] for r in labels if r != c)
            fn = sum(cm[c][r] for r in labels if r != c)
            _, _, f1 = prf(tp, fp, fn)
            f1s.append(f1)
        macro_f1 = sum(f1s)/len(f1s)

    return {"macro_f1": macro_f1, "rougeL": rougeL}

results = []
for cfg in tqdm(search_space, desc="Tuning", leave=False):
    metrics = evaluate_config(cfg)
    results.append({"config": cfg, **metrics})

# Rank: primary macro_f1 desc, then rougeL desc
results_sorted = sorted(results, key=lambda x: (x["macro_f1"], x["rougeL"]), reverse=True)
print("\n=== Top generation configs (by macro F1, tie-break ROUGE-L) ===")
for i, r in enumerate(results_sorted[:5]):
    print(f"#{i+1}", r)

BEST_GEN_CONFIG = results_sorted[0]["config"] if results_sorted else None
print("\nBEST_GEN_CONFIG =", BEST_GEN_CONFIG)

Tuning:   0%|          | 0/18 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== Top generation configs (by macro F1, tie-break ROUGE-L) ===
#1 {'config': {'strategy': 'beam', 'num_beams': 1, 'max_length': 64, 'min_length': 5, 'length_penalty': 1.0, 'repetition_penalty': 1.0}, 'macro_f1': 0.6786211414066977, 'rougeL': 0.9568092082250543}
#2 {'config': {'strategy': 'beam', 'num_beams': 1, 'max_length': 64, 'min_length': 5, 'length_penalty': 1.0, 'repetition_penalty': 1.15}, 'macro_f1': 0.6786211414066977, 'rougeL': 0.9568092082250543}
#3 {'config': {'strategy': 'beam', 'num_beams': 1, 'max_length': 48, 'min_length': 5, 'length_penalty': 1.0, 'repetition_penalty': 1.0}, 'macro_f1': 0.6786211414066977, 'rougeL': 0.9408365027038681}
#4 {'config': {'strategy': 'beam', 'num_beams': 1, 'max_length': 48, 'min_length': 5, 'length_penalty': 1.0, 'repetition_penalty': 1.15}, 'macro_f1': 0.6786211414066977, 'rougeL': 0.9408365027038681}
#5 {'config': {'strategy': 'beam', 'num_beams': 1, 'max_length': 64, 'min_length': 10, 'length_penalty': 1.0, 'repetition_penalty': 1.0},

In [43]:
# ==============================
# 12d. Prompt engineering: zero-shot / one-shot / few-shot helpers
# ==============================
import textwrap

assert 'val_model' in globals() and 'val_tokenizer' in globals() and '_device' in globals(), \
    "Load the saved model first (run cell 11)."

INSTR_DEFAULT = (
    "You are a helpful assistant that writes concise, fluent summaries of app reviews. "
    "Include only the most important user points."
)

SENTIMENT_TEMPLATES = {
    "Positive": "Write a brief positive summary of the user reviews.",
    "Negative": "Write a brief summary highlighting the problems and issues in the user reviews.",
    "Neutral":  "Write a brief neutral summary of the user reviews.",
}

SENTIMENT_TO_PREFIX = {
    "Positive": "Positive feedback:",
    "Negative": "Problem:",
    "Neutral":  "Neutral comment:",
}

# Build a prompt with optional in-context examples
# shots: list of dicts with keys: {instruction, input, target}

def build_prompt(review_block: str,
                 instruction: str = INSTR_DEFAULT,
                 sentiment: str | None = None,
                 shots: list | None = None,
                 max_examples: int = 3) -> str:
    parts = []
    # Keep instruction short to avoid echoing
    if instruction:
        parts.append(f"Instruction: {instruction}")
    if sentiment and sentiment in SENTIMENT_TEMPLATES:
        parts.append(f"Task: {SENTIMENT_TEMPLATES[sentiment]}")
    # In-context examples in training style
    if shots:
        for ex in shots[:max_examples]:
            parts.append("Example:\nInput: " + ex.get('input', '').strip())
            tgt = ex.get('target', '').strip()
            if tgt:
                parts.append("Output: " + tgt)
    parts.append("Input: " + review_block.strip())
    parts.append("Output:")
    return "\n\n".join(parts)

# Generate with optional prompt (wrapped by the T5 "summarize: " prefix),
# and seed the decoder with the sentiment prefix if provided.

def generate_with_prompt(review_block: str,
                         instruction: str = INSTR_DEFAULT,
                         sentiment: str | None = None,
                         shots: list | None = None,
                         gen_config: dict | None = None) -> str:
    prompt = build_prompt(review_block, instruction=instruction, sentiment=sentiment, shots=shots)
    input_text = "summarize: " + prompt
    enc = val_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Base cfg; do not set deprecated flags
    cfg = dict(max_length=64, min_length=8, num_beams=4)
    # Use tuned config if available
    if 'BEST_GEN_CONFIG' in globals() and BEST_GEN_CONFIG:
        # Map strategy flags
        mapped = dict(BEST_GEN_CONFIG)
        if mapped.get('strategy') == 'nucleus':
            mapped.update(dict(do_sample=True))
            mapped.pop('strategy', None)
        else:
            mapped.pop('strategy', None)
        cfg.update({k: v for k, v in mapped.items() if k in {
            'num_beams','max_length','min_length','length_penalty','repetition_penalty','do_sample','top_p','top_k','temperature'
        }})
    if gen_config:
        cfg.update(gen_config)

    # Seed decoder with the sentiment-specific prefix if provided
    gen_kwargs = dict(cfg)
    seed_ids = None
    if sentiment and sentiment in SENTIMENT_TO_PREFIX:
        prefix_text = SENTIMENT_TO_PREFIX[sentiment]
        seed_ids = val_tokenizer(prefix_text, return_tensors='pt', add_special_tokens=False).input_ids
        gen_kwargs['decoder_input_ids'] = seed_ids.to(_device)

    try:
        with torch.no_grad():
            outs = val_model.generate(**{k: v.to(_device) for k, v in enc.items()}, **gen_kwargs)
    except RuntimeError as e:
        if _device.type == 'mps':
            with torch.no_grad():
                # Ensure seed ids on cpu when falling back
                di = gen_kwargs.get('decoder_input_ids')
                if di is not None:
                    gen_kwargs['decoder_input_ids'] = di.to('cpu')
                outs = val_model.to('cpu').generate(**{k: v.to('cpu') for k, v in enc.items()}, **gen_kwargs)
            val_model.to(_device)
        else:
            raise
    text = val_tokenizer.decode(outs[0], skip_special_tokens=True)
    return text

# Tiny demo
try:
    # Build a small review block
    demo_texts = [
        "The app is super fast and the UI looks great.",
        "Crashes sometimes when opening settings.",
        "Notifications are delayed after the last update."
    ]
    review_block = " - " + "\n - ".join(demo_texts)

    # Zero-shot (no sentiment seeding)
    z = generate_with_prompt(review_block, sentiment=None)
    print("Zero-shot summary:\n", z)

    # One-shot Positive (with decoder seeding)
    one_shot = [{
        "input": " - love the clean interface and quick responses\n - great performance overall",
        "target": "Positive feedback: clean UI and fast performance appreciated by users."
    }]
    o = generate_with_prompt(review_block, sentiment="Positive", shots=one_shot)
    print("\nOne-shot positive summary:\n", o)

    # Few-shot Negative (with decoder seeding)
    few_shots = [
        {
            "input": " - app is intuitive and smooth\n - great features for daily use",
            "target": "Positive feedback: intuitive design and useful features for daily tasks."
        },
        {
            "input": " - crashes on launch\n - bug in login screen",
            "target": "Problem: frequent crashes and login bug impact usability."
        }
    ]
    f = generate_with_prompt(review_block, sentiment="Negative", shots=few_shots)
    print("\nFew-shot negative summary:\n", f)
except Exception as e:
    print("(Prompt demo skipped)", e)

Zero-shot summary:
 Positive feedback: You are a helpful assistant that writes concise, fluent summaries of app reviews. Include only the most important user points

One-shot positive summary:
 Positive feedback: You are a helpful assistant that writes concise, fluent summaries of app reviews. Include only the most important user points

One-shot positive summary:
 Positive feedback: You are a helpful assistant that writes concise, fluent summaries of app reviews. Include only the most important user points

Few-shot negative summary:
 Problem: You are a helpful assistant that writes concise, fluent summaries of app reviews. Include only the most important user points

Few-shot negative summary:
 Problem: You are a helpful assistant that writes concise, fluent summaries of app reviews. Include only the most important user points


In [44]:
# ==============================
# 12e. Train vs Validation: ROUGE + eval_loss comparison
# ==============================
import math
import re as _re
from tqdm.auto import tqdm
import evaluate as _eval

# Preconditions
assert 'val_model' in globals() and 'val_tokenizer' in globals() and '_device' in globals(), \
    "Load the saved model first (run the validation cell)."
assert 'tokenized_dataset' in globals() and 'working_dataset' in globals(), \
    "Ensure tokenized_dataset and working_dataset are defined (run earlier cells)."

_rouge_metric = _eval.load("rouge")

def _move_to(d, device):
    return {k: v.to(device) for k, v in d.items()}

def _rouge_to_float(v):
    if isinstance(v, dict) and 'mid' in v and hasattr(v['mid'], 'fmeasure'):
        return float(v['mid'].fmeasure)
    if hasattr(v, 'mid') and hasattr(v.mid, 'fmeasure'):
        return float(v.mid.fmeasure)
    return float(v)

# Shared generation params (match earlier eval)
GEN_KW = dict(max_length=30, min_length=5, num_beams=4)
BATCH_SIZE = 8
MAX_SAMPLES = 512

# Helper to compute ROUGE for a split
def compute_rouge_for_split(split_name: str):
    ds = working_dataset[split_name]
    M = min(MAX_SAMPLES, ds.num_rows)
    samples = ds.select(range(M))
    preds, refs = [], []

    for start in tqdm(range(0, M, BATCH_SIZE), desc=f"{split_name} gen", leave=False):
        end = min(start + BATCH_SIZE, M)
        batch = samples.select(range(start, end))
        texts = [b["Translated_Review"] for b in batch]
        ref_texts = [b["summary"] for b in batch]
        inputs = val_tokenizer(["summarize: " + t for t in texts], return_tensors="pt", truncation=True, padding=True, max_length=256)
        try:
            with torch.no_grad():
                outs = val_model.generate(**_move_to(inputs, _device), **GEN_KW)
        except RuntimeError as e:
            if _device.type == 'mps':
                with torch.no_grad():
                    outs = val_model.to('cpu').generate(**_move_to(inputs, 'cpu'), **GEN_KW)
                val_model.to(_device)
            else:
                raise
        batch_preds = val_tokenizer.batch_decode(outs, skip_special_tokens=True)
        preds.extend(batch_preds)
        refs.extend(ref_texts)

    rouge_res = _rouge_metric.compute(
        predictions=["\n".join(p.split()) for p in preds],
        references=["\n".join(r.split()) for r in refs],
    )
    scores = {k: _rouge_to_float(v) for k, v in rouge_res.items() if k in ("rouge1","rouge2","rougeL","rougeLsum")}
    return scores

# Compute ROUGE
train_rouge = compute_rouge_for_split('train')
val_rouge   = compute_rouge_for_split('validation')

# Compute eval losses using Trainer on each split
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
_tmp_args = TrainingArguments(output_dir="./_tmp_eval_compare", per_device_eval_batch_size=8, report_to=[], logging_strategy="no")
_data_collator = DataCollatorForSeq2Seq(tokenizer=val_tokenizer, model=val_model)

trainer_train_eval = Trainer(model=val_model, args=_tmp_args, eval_dataset=tokenized_dataset['train'], tokenizer=val_tokenizer, data_collator=_data_collator)
trainer_val_eval   = Trainer(model=val_model, args=_tmp_args, eval_dataset=tokenized_dataset['validation'], tokenizer=val_tokenizer, data_collator=_data_collator)

train_eval_metrics = trainer_train_eval.evaluate()
val_eval_metrics   = trainer_val_eval.evaluate()

train_loss = float(train_eval_metrics.get('eval_loss', float('nan')))
val_loss   = float(val_eval_metrics.get('eval_loss', float('nan')))

# Print comparison
print("\n=== Train vs Validation (ROUGE and eval_loss) ===")
print("Train ROUGE:", {k: round(v, 4) for k, v in train_rouge.items()})
print("Valid ROUGE:", {k: round(v, 4) for k, v in val_rouge.items()})
print(f"Train eval_loss: {train_loss:.4f}")
print(f"Valid eval_loss: {val_loss:.4f}")

# Quick overfitting signal
gap_rougeL = train_rouge.get('rougeL', float('nan')) - val_rouge.get('rougeL', float('nan'))
loss_gap   = val_loss - train_loss if (not math.isnan(train_loss) and not math.isnan(val_loss)) else float('nan')
print(f"\nDelta (train - valid) ROUGE-L: {gap_rougeL:.4f}")
print(f"Delta (valid - train) eval_loss: {loss_gap:.4f}")
if (not math.isnan(gap_rougeL) and gap_rougeL > 0.05) or (not math.isnan(loss_gap) and loss_gap > 0.3):
    print("[Note] Signs of overfitting: higher train ROUGE-L and/or much lower train loss.")
else:
    print("[Note] No strong overfitting signal based on these thresholds.")

train gen:   0%|          | 0/64 [00:00<?, ?it/s]

validation gen:   0%|          | 0/64 [00:00<?, ?it/s]

  trainer_train_eval = Trainer(model=val_model, args=_tmp_args, eval_dataset=tokenized_dataset['train'], tokenizer=val_tokenizer, data_collator=_data_collator)
  trainer_val_eval   = Trainer(model=val_model, args=_tmp_args, eval_dataset=tokenized_dataset['validation'], tokenizer=val_tokenizer, data_collator=_data_collator)



=== Train vs Validation (ROUGE and eval_loss) ===
Train ROUGE: {'rouge1': 0.9037, 'rouge2': 0.8972, 'rougeL': 0.9036, 'rougeLsum': 0.9036}
Valid ROUGE: {'rouge1': 0.8925, 'rouge2': 0.8858, 'rougeL': 0.8921, 'rougeLsum': 0.8919}
Train eval_loss: 0.0800
Valid eval_loss: 0.0804

Delta (train - valid) ROUGE-L: 0.0115
Delta (valid - train) eval_loss: 0.0004
[Note] No strong overfitting signal based on these thresholds.


In [1]:
# ==============================
# 15. UI: Summarize any app interactively (supports unknown apps via custom reviews)
# ==============================
try:
    import ipywidgets as widgets
    from IPython.display import display, Markdown
except Exception as e:
    print("ipywidgets not available. Install it with: pip install ipywidgets && pip install jupyterlab_widgets")
    raise

# Preconditions
assert 'df' in globals(), "Load the dataset first (cells 2–3)."
assert 'summarize_app_by_sentiment' in globals(), "Define summarize_app_by_sentiment first (cell 10)."
assert '_generate_prefixed' in globals() and '_SENTIMENT_PREFIX' in globals(), "Make sure cell 10 is executed."

# App options from data (sorted, unique)
app_names = sorted(set(df['App'].astype(str).unique()))

# Widgets
w_dropdown = widgets.Dropdown(options=app_names[:5000], description='Choose:', layout=widgets.Layout(width='70%'))
w_text = widgets.Text(description='Or type:', placeholder='Any app name...', layout=widgets.Layout(width='70%'))
w_custom = widgets.Textarea(
    value='',
    placeholder='Optional: Paste custom reviews here, one per line.\nIf the app is not in the CSV, these will be used to generate summaries.',
    description='Reviews:',
    layout=widgets.Layout(width='90%', height='120px')
)
w_limit = widgets.IntSlider(value=50, min=10, max=200, step=10, description='Per-sent cap:')
w_strip = widgets.Checkbox(value=True, description='Strip sentiment prefix')
w_button = widgets.Button(description='Summarize', button_style='primary', icon='play')
w_out = widgets.Output(layout={'border': '1px solid #ddd'})

# Handler

def on_run_clicked(b):
    with w_out:
        w_out.clear_output()
        name = (w_text.value or '').strip() or (w_dropdown.value or '').strip()
        typed = (w_text.value or '').strip()
        # Try dataset if present
        subset = df[df['App'] == name] if name else df[df['App'] == '__NO_APP__']

        if name and not subset.empty:
            # Use dataset-backed summarization
            counts = subset['Sentiment'].value_counts().to_dict()
            try:
                results = summarize_app_by_sentiment(name, per_group_limit=int(w_limit.value), strip_prefix=bool(w_strip.value))
            except Exception as e:
                print('Error generating summaries:', e)
                return
            display(Markdown(f"## Summaries for '{name}'\n\nReviews: {len(subset)} | Sentiments: {counts}"))
            pos = results.get('Positive', '')
            neg = results.get('Negative', '')
            neu = results.get('Neutral', '')
            if pos:
                display(Markdown(f"### Positive\n{pos}"))
            if neg:
                display(Markdown(f"### Negative\n{neg}"))
            if neu:
                display(Markdown(f"### Neutral\n{neu}"))
        else:
            # Fallback: use custom reviews the user pasted
            lines = [ln.strip() for ln in (w_custom.value or '').splitlines() if ln.strip()]
            if not lines:
                print("App not found in CSV. Please paste custom reviews (one per line) in the 'Reviews' box.")
                return
            # Use all pasted lines for each sentiment, forcing the sentiment prefix
            joined = " \n- ".join(lines)
            results = {}
            for sent in ["Positive", "Negative", "Neutral"]:
                forced = _SENTIMENT_PREFIX[sent]
                try:
                    # Reuse the lower-level generator with decoder seeding
                    results[sent] = _generate_prefixed(joined, forced_prefix=forced, max_length=80, num_beams=4, strip_prefix=bool(w_strip.value))
                except Exception as e:
                    results[sent] = ''
            label = name if name else (typed if typed else 'Custom input')
            display(Markdown(f"## Summaries for '{label}' (custom reviews)\n\nCustom reviews: {len(lines)}"))
            if results.get('Positive'):
                display(Markdown(f"### Positive\n{results['Positive']}"))
            if results.get('Negative'):
                display(Markdown(f"### Negative\n{results['Negative']}"))
            if results.get('Neutral'):
                display(Markdown(f"### Neutral\n{results['Neutral']}"))

w_button.on_click(on_run_clicked)

# Layout
controls = widgets.VBox([
    widgets.HBox([w_dropdown]),
    widgets.HBox([w_text]),
    widgets.HBox([w_custom]),
    widgets.HBox([w_limit, w_strip]),
    w_button
])

ui = widgets.VBox([controls, w_out])
display(ui)

AssertionError: Load the dataset first (cells 2–3).

In [2]:
# ==============================
# 14. Summarize custom data with saved model/tokenizer
# ==============================
import os
import re as _re
import torch
import pandas as pd

# Ensure a default saved model dir is available
if 'DEFAULT_SAVED_MODEL_DIR' not in globals():
    DEFAULT_SAVED_MODEL_DIR = \
        "/Users/bawantharathnayake/Desktop/Academic/semester 7/advanced ai/AI-Project/Customer-Review-Summarizer-for-Mobile-Apps-using-LLMs-Project/notebook/t5_finetuned_reviews"

# Load saved model/tokenizer if not already loaded

def ensure_saved_model(saved_dir: str | None = None):
    """Loads val_model/val_tokenizer and device if not present; returns (model, tokenizer, device)."""
    global val_model, val_tokenizer, _device
    saved_dir = saved_dir or DEFAULT_SAVED_MODEL_DIR
    if 'val_model' in globals() and 'val_tokenizer' in globals():
        mdl, tok = val_model, val_tokenizer
    else:
        from transformers import T5Tokenizer, T5ForConditionalGeneration
        tok = T5Tokenizer.from_pretrained(saved_dir)
        mdl = T5ForConditionalGeneration.from_pretrained(saved_dir)
    _device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
    mdl.to(_device)
    val_model = mdl
    val_tokenizer = tok
    return mdl, tok, _device


def _strip_prefix(txt: str) -> str:
    return _re.sub(r"^(Positive feedback:|Problem:|Neutral comment:)\s*", "", str(txt)).strip()


def summarize_texts(texts: list[str],
                    max_input_length: int = 256,
                    max_length: int = 64,
                    min_length: int = 5,
                    num_beams: int = 4,
                    batch_size: int = 8,
                    strip_prefix: bool = True) -> list[str]:
    """Summarize a list of raw texts using the saved model/tokenizer."""
    mdl, tok, device = ensure_saved_model()
    outputs = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        enc = tok(["summarize: " + t for t in batch_texts],
                   return_tensors="pt", truncation=True, padding=True, max_length=max_input_length)
        try:
            with torch.no_grad():
                gen = mdl.generate(**{k: v.to(device) for k, v in enc.items()},
                                   max_length=max_length,
                                   min_length=min_length,
                                   num_beams=num_beams,
                                   early_stopping=True)
        except RuntimeError as e:
            # MPS fallback per-batch
            if device.type == 'mps':
                with torch.no_grad():
                    gen = mdl.to('cpu').generate(**{k: v.to('cpu') for k, v in enc.items()},
                                                 max_length=max_length,
                                                 min_length=min_length,
                                                 num_beams=num_beams,
                                                 early_stopping=True)
                mdl.to(device)
            else:
                raise
        dec = val_tokenizer.batch_decode(gen, skip_special_tokens=True)
        if strip_prefix:
            dec = [_strip_prefix(x) for x in dec]
        outputs.extend(dec)
    return outputs


def summarize_csv(csv_path: str,
                   text_column: str,
                   output_csv_path: str | None = None,
                   max_input_length: int = 256,
                   max_length: int = 64,
                   min_length: int = 5,
                   num_beams: int = 4,
                   batch_size: int = 8,
                   strip_prefix: bool = True) -> pd.DataFrame:
    """Summarize a CSV column of texts; returns a DataFrame with a new 'summary' column and optionally saves it."""
    df_custom = pd.read_csv(csv_path)
    if text_column not in df_custom.columns:
        raise ValueError(f"Column '{text_column}' not found in {csv_path}. Columns: {list(df_custom.columns)}")
    texts = df_custom[text_column].astype(str).tolist()
    df_custom['summary'] = summarize_texts(texts,
                                          max_input_length=max_input_length,
                                          max_length=max_length,
                                          min_length=min_length,
                                          num_beams=num_beams,
                                          batch_size=batch_size,
                                          strip_prefix=strip_prefix)
    if output_csv_path:
        os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) if os.path.dirname(output_csv_path) else None
        df_custom.to_csv(output_csv_path, index=False)
        print(f"Saved summaries to {output_csv_path}")
    return df_custom

# --- Example usage ---
# 1) Summarize a few custom texts
example_texts = [
    "I really love this app. The interface is clean and it runs smoothly.",
    "The last update broke notifications. Please fix it soon.",
    "It does what it says. Nothing special, but it works."
]
example_summaries = summarize_texts(example_texts, strip_prefix=True)
print("\nCustom text summaries:")
for t, s in zip(example_texts, example_summaries):
    print("-", s)

# 2) Summarize a CSV (uncomment and set your paths/column)
# out_df = summarize_csv(
#     csv_path="/path/to/your/custom.csv",
#     text_column="your_text_column",
#     output_csv_path="/path/to/output_summaries.csv",
#     strip_prefix=True,
# )
# out_df.head()


Custom text summaries:
- I really love this app
- The last update broke notifications. Please fix it soon
- it does what it says. Nothing special, but it works


In [None]:
# Synthetic review generation for any app (LLM-like)
import random, math, re, textwrap
from typing import List, Dict, Tuple

try:
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
except Exception:
    widgets = None

_ASPECTS = [
    "performance", "speed", "battery usage", "crashes/bugs", "UI/UX",
    "notifications", "privacy", "permissions", "ads", "pricing",
    "features", "offline mode", "sync", "customer support", "updates",
]
_PERSONAS = [
    "college student", "busy professional", "parent", "gamer", "photography enthusiast",
    "traveler", "small business owner", "power user", "new user", "accessibility user",
]
_STYLE_TO_LEN = {
    "concise": (1, 2),
    "detailed": (2, 4),
    "story-like": (3, 6),
}

# Fallbacks to access a ready model/tokenizer and device
import torch

def _device_default():
    return torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))


def _get_model_tokenizer_device():
    # T5 seq2seq (your fine-tuned model)
    global val_model, val_tokenizer
    model = None
    tok = None
    if 'val_model' in globals() and val_model is not None:
        model = val_model
    if 'val_tokenizer' in globals() and val_tokenizer is not None:
        tok = val_tokenizer
    if model is None or tok is None:
        from transformers import T5ForConditionalGeneration, T5Tokenizer
        model_dir = None
        if 'DEFAULT_SAVED_MODEL_DIR' in globals():
            model_dir = DEFAULT_SAVED_MODEL_DIR
        elif 'SAVED_MODEL_DIR' in globals():
            model_dir = SAVED_MODEL_DIR
        if model_dir is None:
            model_dir = 't5-small'  # last-resort fallback
        tok = T5Tokenizer.from_pretrained(model_dir)
        model = T5ForConditionalGeneration.from_pretrained(model_dir)
    device = getattr(model, 'device', None) or _device_default()
    model.to(device).eval()
    return model, tok, device


# Optional causal LM for better open-ended generation
_causal_cache = {"model": None, "tok": None, "name": None}

def _get_causal_lm(name: str = 'distilgpt2'):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    if _causal_cache["model"] is not None and _causal_cache["name"] == name:
        return _causal_cache["model"], _causal_cache["tok"], _causal_cache.get("device")
    tok = AutoTokenizer.from_pretrained(name)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(name)
    device = _device_default()
    model.to(device).eval()
    _causal_cache.update({"model": model, "tok": tok, "name": name, "device": device})
    return model, tok, device


# Build a targeted prompt to elicit realistic reviews
SENTIMENT_LABELS = ["Positive", "Negative", "Neutral"]


def build_review_prompt(app: str, sentiment: str, persona: str, aspects: List[str], style: str) -> str:
    # Minimal, content-only context to avoid instruction echo
    min_s, max_s = _STYLE_TO_LEN.get(style, (1, 2))
    return "\n".join([
        f"App: {app}",
        f"Persona: {persona}",
        f"Sentiment: {sentiment}",
        f"Aspects: {', '.join(aspects)}",
        f"Sentences: {min_s}-{max_s}",
        "Review:",
    ])


def _strip_any_prefix(text: str) -> str:
    # Remove known sentiment prefixes if model adds them
    try:
        prefixes = list(SENTIMENT_TO_PREFIX.values()) if 'SENTIMENT_TO_PREFIX' in globals() else []
    except Exception:
        prefixes = []
    t = text.strip()
    for p in prefixes:
        if t.lower().startswith(p.lower()):
            t = t[len(p):].lstrip(" -:\n\t")
    # remove generic labels like 'Review:' at start
    t = re.sub(r'^(review\s*[:\-]\s*)', '', t, flags=re.IGNORECASE).strip()
    # collapse spaces
    t = re.sub(r'\s+', ' ', t).strip()
    return t


def _seed_text(persona: str, sentiment: str, app: str, aspects: List[str]) -> str:
    s = sentiment.strip().lower()
    aspects_txt = ', '.join(aspects)
    if s.startswith('pos'):
        return f"As a {persona}, I love how {app} handles {aspects_txt}. "
    if s.startswith('neg'):
        return f"As a {persona}, I'm frustrated with {app}, especially around {aspects_txt}. "
    return f"As a {persona}, I find {app} okay overall for {aspects_txt}. "


def _finish_sentence(text: str) -> str:
    # Trim to last sentence end if it exists
    m = re.search(r'[\.\!\?](?!.*[\.\!\?])', text)
    if m:
        return text[:m.end()].strip()
    return text.strip()


def generate_reviews_for_app(
    app: str,
    n_per_sentiment: int = 3,
    sentiments: List[str] = None,
    style: str = "concise",
    temperature: float = 0.9,
    top_p: float = 0.92,
    max_new_tokens: int = 72,
    min_new_tokens: int = 12,
    seed_with_prefix: bool = True,
    backend: str = 'auto',  # 'auto' | 't5' | 'gpt2'
) -> Dict[str, List[str]]:
    """
    Generate synthetic user reviews per sentiment for an app.

    backend:
      - 't5': use your fine-tuned T5 (may echo instruction-like text)
      - 'gpt2': use a small causal LM for open-ended text
      - 'auto': prefer gpt2 if available, else fallback to T5
    """
    if sentiments is None:
        sentiments = SENTIMENT_LABELS

    use_gpt2 = backend == 'gpt2'
    if backend == 'auto':
        try:
            _ = _get_causal_lm('distilgpt2')
            use_gpt2 = True
        except Exception:
            use_gpt2 = False

    out: Dict[str, List[str]] = {s: [] for s in sentiments}

    if use_gpt2:
        gpt2, tok, device = _get_causal_lm('distilgpt2')
        gen_kw = dict(
            do_sample=True,
            top_p=float(top_p),
            temperature=float(temperature),
            max_new_tokens=int(max_new_tokens),
            repetition_penalty=1.05,
            no_repeat_ngram_size=3,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
        )
        for s in sentiments:
            for _ in range(int(n_per_sentiment)):
                aspects = random.sample(_ASPECTS, k=min(2, len(_ASPECTS)))
                persona = random.choice(_PERSONAS)
                seed = _seed_text(persona, s, app, aspects)
                prompt = seed  # causal LM prompt is the seed itself
                enc = tok(prompt, return_tensors='pt').to(device)
                with torch.no_grad():
                    out_ids = gpt2.generate(**enc, **gen_kw)
                txt = tok.decode(out_ids[0], skip_special_tokens=True)
                # remove the seed if it got copied fully
                gen_part = txt[len(prompt):].strip() if txt.startswith(prompt) else txt
                final = _finish_sentence((seed + gen_part).strip())
                out[s].append(final)
        return out

    # T5 path (seq2seq)
    model, tok, device = _get_model_tokenizer_device()
    gen_kw = dict(
        do_sample=True,
        top_p=float(top_p),
        temperature=float(temperature),
        max_new_tokens=int(max_new_tokens),
        min_new_tokens=int(min_new_tokens),
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
        repetition_penalty=1.05,
        no_repeat_ngram_size=3,
    )

    for s in sentiments:
        for _ in range(int(n_per_sentiment)):
            aspects = random.sample(_ASPECTS, k=min(2, len(_ASPECTS)))
            persona = random.choice(_PERSONAS)
            prompt = build_review_prompt(app, s, persona, aspects, style)
            enc = tok(prompt, return_tensors='pt', truncation=True, max_length=256)
            enc = {k: v.to(device) for k, v in enc.items()}

            dec_args = {}
            if seed_with_prefix:
                seed_txt = _seed_text(persona, s, app, aspects)
                dec_ids = tok(seed_txt, return_tensors='pt', add_special_tokens=False).input_ids.to(device)
                dec_args['decoder_input_ids'] = dec_ids

            with torch.no_grad():
                out_ids = model.generate(**enc, **dec_args, **gen_kw)
            txt = tok.decode(out_ids[0], skip_special_tokens=True)
            txt = _strip_any_prefix(txt)
            if not txt:
                txt = seed_txt  # ensure some content
            words = txt.split()
            if len(words) < max(5, min_new_tokens // 2):
                with torch.no_grad():
                    out_ids = model.generate(**enc, **dec_args, **gen_kw)
                txt = tok.decode(out_ids[0], skip_special_tokens=True)
                txt = _strip_any_prefix(txt)
            out[s].append(_finish_sentence(txt))
    return out


def _render_synth_output(reviews: Dict[str, List[str]]):
    lines = []
    for s in reviews:
        lines.append(f"\n=== {s} Reviews ===")
        for i, r in enumerate(reviews[s], 1):
            lines.append(f"{i}. {r}")
    return "\n".join(lines)

# --- Optional UI ---
if widgets is not None:
    w2_app = widgets.Text(description='App', placeholder='Enter app name')
    w2_backend = widgets.Dropdown(description='Backend', options=['auto','gpt2','t5'], value='auto')
    w2_n = widgets.IntSlider(description='#/sent', value=3, min=1, max=6, step=1)
    w2_style = widgets.Dropdown(description='Style', options=list(_STYLE_TO_LEN.keys()), value='concise')
    w2_sent = widgets.SelectMultiple(description='Sentiments', options=SENTIMENT_LABELS, value=tuple(SENTIMENT_LABELS))
    w2_temp = widgets.FloatSlider(description='Temp', value=0.9, min=0.1, max=1.5, step=0.05)
    w2_top_p = widgets.FloatSlider(description='Top-p', value=0.92, min=0.1, max=1.0, step=0.02)
    w2_len = widgets.IntRangeSlider(description='New tokens', value=[12, 72], min=8, max=160, step=4)
    w2_seed = widgets.Checkbox(description='Seed persona', value=True)
    w2_btn = widgets.Button(description='Run', button_style='primary')
    w2_out = widgets.Output(layout={'border': '1px solid #ddd'})

    def _on_run(_):
        w2_out.clear_output()
        with w2_out:
            app = w2_app.value.strip()
            if not app:
                print('Please enter an app name.')
                return
            res = generate_reviews_for_app(
                app=app,
                n_per_sentiment=w2_n.value,
                sentiments=list(w2_sent.value),
                style=w2_style.value,
                temperature=w2_temp.value,
                top_p=w2_top_p.value,
                min_new_tokens=w2_len.value[0],
                max_new_tokens=w2_len.value[1],
                seed_with_prefix=w2_seed.value,
                backend=w2_backend.value,
            )
            print(_render_synth_output(res))

    w2_btn.on_click(_on_run)
    ui_gen = widgets.VBox([
        widgets.HTML('<h3>Generate Synthetic App Reviews</h3><p>Creates realistic user reviews per sentiment for any app. Backend auto=GPT-2 fallback.</p>'),
        widgets.HBox([w2_app, w2_backend, w2_style]),
        widgets.HBox([w2_n, w2_sent]),
        widgets.HBox([w2_temp, w2_top_p, w2_len]),
        widgets.HBox([w2_seed]),
        w2_btn,
        w2_out
    ])
    display(ui_gen)
else:
    print("ipywidgets not available. Use generate_reviews_for_app(app='Your App') programmatically.")

VBox(children=(HTML(value='<h3>Generate Synthetic App Reviews</h3><p>Creates realistic user reviews per sentim…

In [4]:
# Smoke test: ensure generator produces new reviews
try:
    _test = generate_reviews_for_app(
        app="TestApp",
        n_per_sentiment=1,
        temperature=1.1,
        top_p=0.95,
        max_new_tokens=80,
        min_new_tokens=20,
        seed_with_prefix=True,
    )
    for s, lst in _test.items():
        print(f"{s}: {lst[0]}")
except Exception as e:
    print('Generation error:', e)

Positive: As a gamer, I love how TestApp handles offline mode, features. ????
Negative: As a busy professional, I'm frustrated with TestApp, especially around customer support, pricing. So do all my friends here and our associates: Do you think if they can help in sales? If so, or are there any tips for getting over your head about this problem? Let's talk to someone that says hi! We're looking forward to hearing from you!
Neutral: As a college student, I find TestApp okay overall for battery usage, offline mode.  - The power and battery life of these two Android tablets may look decent when it comes to battery storage and performance.
