In [1]:
!pip install transformers accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [8]:
import re
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ==========================
# 0) Configuration
# ==========================
# Excel workbook that is read with pandas as the input table.
INPUT_XLSX = "Task3_Review_Summarization_Generative_RuleBased_v2.xlsx"
# CSV file the table is written to once both summary columns are added.
OUTPUT_CSV = "task3_Review_summarisation_mistral_both_clean_v2.csv"
# Input columns looked up on every row.
PROS_COL, CONS_COL = "Pros Summary", "Cons Summary"
# Checkpoint name handed to both the tokenizer and the model loader.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# ==========================
# 1) Load Model
# ==========================
# Tokenizer and weights are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# float16 when CUDA is available, float32 otherwise; device placement is
# delegated to the library through device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16,
)

# Fall back to the EOS id when the tokenizer defines no pad token.
pad_id = tokenizer.pad_token_id
if pad_id is None:
    pad_id = tokenizer.eos_token_id

# ==========================
# 2) Helper Functions
# ==========================
def _clean_text(x):
    """Ensure clean text input (no NaNs or floats)."""
    if isinstance(x, float) and pd.isna(x):
        return ""
    return x if isinstance(x, str) else ""

def _generate(prompt, max_new_tokens=320, temperature=0.3):
    """Generate only the model continuation (exclude echoed prompt)."""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs.input_ids.shape[1]  # number of tokens in the prompt

    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=0.9,
        do_sample=False,
        pad_token_id=pad_id
    )

    # Keep only the newly generated tokens after the prompt
    new_tokens = out[0][prompt_len:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return text.strip()

# ==========================
# 3) Post-Cleaners (remove echoed prompts)
# ==========================
# Input labels used inside the prompts. A line that *starts* with one of them
# is treated as an echo of the prompt and dropped up to its line end.
# "Positive aspects" / "Negative aspects" are deliberately absent: the neutral
# prompt asks the model to emit exactly those headings, so they are output.
_ECHOED_LABEL_RE = re.compile(
    r"(?im)^[ \t]*(?:Use only this input:|PROS:|CONS:|Positive opinions:|Negative opinions:)[^\n]*"
)

# Instruction sentences taken from the neutral and persuasive prompts; the
# match and the remainder of its line are removed.
_ECHOED_INSTRUCTION_RE = re.compile(
    r"(?i)(?:You are a neutral summarization assistant|You are a careful copywriter"
    r"|Write the summary now|Now write the summary|Style & rules:).*"
)


def clean_generated_text(text):
    """Remove leftover prompt text from a model output.

    Works for both neutral and persuasive outputs.

    Args:
        text: Raw generated text. Any non-string value yields ``""``.

    Returns:
        The text without echoed input-label lines and instruction sentences,
        with blank lines removed, trailing whitespace trimmed on every line,
        and the whole result stripped.
    """
    if not isinstance(text, str):
        return ""

    # Labels are anchored to the start of a line so ordinary words that merely
    # contain one (for example "icons:") are left alone.
    text = _ECHOED_LABEL_RE.sub("", text)
    text = _ECHOED_INSTRUCTION_RE.sub("", text)

    # Drop empty lines last, so gaps opened by the removals above close too.
    kept_lines = [line.rstrip() for line in text.split("\n") if line.strip()]
    return "\n".join(kept_lines).strip()

# ==========================
# 4) Neutral (Internal Team)
# ==========================
def summarise_with_mistral_neutral(pros, cons, max_new_tokens=320):
    """Produce the neutral summary for one pair of pros/cons texts.

    Both inputs are passed through ``_clean_text`` first. When neither one
    contains any non-whitespace text, ``""`` is returned and nothing is
    generated. Otherwise the prompt is filled in, sent to ``_generate`` and
    the result is passed through ``clean_generated_text``.
    """
    pros, cons = _clean_text(pros), _clean_text(cons)
    if not (pros.strip() or cons.strip()):
        return ""

    template = """
You are a neutral summarization assistant.
Summarize verified customer opinions, strictly using the provided text.

Rules:
- Neutral, factual tone.
- No first-person.
- Two sections with these exact headings: "Positive aspects" and "Negative aspects".
- Reflect mixed opinions if present.
- Do not add information that is not present in the input.

---
Positive opinions:
{pros}

Negative opinions:
{cons}

---
Write the summary now.
"""
    raw_summary = _generate(
        template.format(pros=pros, cons=cons),
        max_new_tokens=max_new_tokens,
        temperature=0.3,
    )
    return clean_generated_text(raw_summary)

# ==========================
# 5) Persuasive (Customer-Facing)
# ==========================
def summarise_with_mistral_persuasive_paragraph(pros, cons, max_new_tokens=300):
    """Produce the persuasive paragraph summary for one pair of pros/cons texts.

    Both inputs are passed through ``_clean_text`` first. When neither one
    contains any non-whitespace text, ``""`` is returned and nothing is
    generated. Otherwise the prompt is filled in, sent to ``_generate`` and
    the result is passed through ``clean_generated_text``.
    """
    pros, cons = _clean_text(pros), _clean_text(cons)
    if not (pros.strip() or cons.strip()):
        return ""

    template = """
You are a careful copywriter. Write a concise product summary as smooth paragraphs, grounded ONLY in the provided opinions.

Style & rules:
- Paragraph prose only. No headings, no bullets.
- Emphasize benefits from PROS (this is the main focus).
- Mention CONS briefly in ONE short sentence with hedging ("may", "for some users"), only if CONS exist.
- Neutral-to-positive tone, sales-friendly but truthful. No first-person. No invented claims.
- Keep it tight: about 3–6 sentences total.

Use only this input:
PROS:
{pros}

CONS:
{cons}

Now write the summary.
"""
    raw_summary = _generate(
        template.format(pros=pros, cons=cons),
        max_new_tokens=max_new_tokens,
        temperature=0.35,
    )
    return clean_generated_text(raw_summary)

# ==========================
# 6) Generate Summaries
# ==========================
df = pd.read_excel(INPUT_XLSX)

# Check the input columns before any generation runs: a misnamed column would
# otherwise be read as "" for every row and produce empty summaries silently.
missing_cols = [col for col in (PROS_COL, CONS_COL) if col not in df.columns]
if len(missing_cols) == 2:
    raise KeyError(
        f"Neither {PROS_COL!r} nor {CONS_COL!r} found in {INPUT_XLSX}; "
        f"available columns: {list(df.columns)}"
    )
for col in missing_cols:
    print(f"⚠️ Column {col!r} not found in {INPUT_XLSX}; treating it as empty.")

neutral_summaries = []
persuasive_summaries = []
total_rows = len(df)

# row_number counts iterations, so the progress line stays correct even when
# the frame's index labels are not 0..n-1.
for row_number, (idx, row) in enumerate(df.iterrows(), start=1):
    pros = row.get(PROS_COL, "")
    cons = row.get(CONS_COL, "")

    neutral = summarise_with_mistral_neutral(pros, cons)
    persuasive = summarise_with_mistral_persuasive_paragraph(pros, cons)

    neutral_summaries.append(neutral)
    persuasive_summaries.append(persuasive)
    print(f"✅ Processed row {row_number}/{total_rows}")

df["Overall_Summary_Mistral_Neutral"] = neutral_summaries
df["Overall_Summary_Mistral_Persuasive"] = persuasive_summaries

# ==========================
# 7) Save Clean Output
# ==========================
# utf-8-sig writes a BOM at the start of the CSV file.
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print(f"\n✅ Saved clean summaries only to: {OUTPUT_CSV}")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


✅ Processed row 1/42
✅ Processed row 2/42
✅ Processed row 3/42
✅ Processed row 4/42
✅ Processed row 5/42
✅ Processed row 6/42
✅ Processed row 7/42
✅ Processed row 8/42
✅ Processed row 9/42
✅ Processed row 10/42
✅ Processed row 11/42
✅ Processed row 12/42
✅ Processed row 13/42
✅ Processed row 14/42
✅ Processed row 15/42
✅ Processed row 16/42
✅ Processed row 17/42
✅ Processed row 18/42
✅ Processed row 19/42
✅ Processed row 20/42
✅ Processed row 21/42
✅ Processed row 22/42
✅ Processed row 23/42
✅ Processed row 24/42
✅ Processed row 25/42
✅ Processed row 26/42
✅ Processed row 27/42
✅ Processed row 28/42
✅ Processed row 29/42
✅ Processed row 30/42
✅ Processed row 31/42
✅ Processed row 32/42
✅ Processed row 33/42
✅ Processed row 34/42
✅ Processed row 35/42
✅ Processed row 36/42
✅ Processed row 37/42
✅ Processed row 38/42
✅ Processed row 39/42
✅ Processed row 40/42
✅ Processed row 41/42
✅ Processed row 42/42

✅ Saved clean summaries only to: task3_Review_summarisation_mistral_both_clean_v2.cs