In [1]:
import argparse, re, textwrap, json, datetime, html, sys
from pathlib import Path
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
from transformers import pipeline
sns.set_style("whitegrid")

KEYBERT_OK = True
try:
    from keybert import KeyBERT
except Exception:
    KEYBERT_OK = False
    print("🛈  KeyBERT (or its deps) not available – key-phrase mining disabled.\n"
          "    pip install --upgrade keybert sentence-transformers transformers")

ISSUE_PATTERNS = {
    "late_delivery":  r"\b(late|delay(ed)?|wait(ed|ing)?)\b.*\b(car|vehicle|pickup|delivery)\b",
    "car_condition":  r"\b(broken|scratch|dent|damage|engine|tyre|tire|ac\b|air ?cond|mechanic)\b",
    "cleanliness":    r"\b(dirty|filthy|stain|smell|odou?r|trash|garbage)\b",
    "staff_behavior": r"\b(rude|unhelpful|impolite|attitude|staff|agent|clerk)\b",
    "billing":        r"\b(over.?charge|extra fee|hidden fee|double charge|refund|invoice|charge\b.*error)\b",
}
SENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
ISSUE_ORDER = list(ISSUE_PATTERNS.keys()) + ["other"]

def load_reviews(csv_path: Path) -> pd.DataFrame:
    df = pd.read_csv(csv_path)

    for col in ("review", "Feedback", "text", "comment"):
        if col in df.columns:
            df.rename(columns={col: "review"}, inplace=True)
            break
    if "review" not in df.columns:
        raise ValueError("CSV needs a column named 'review' (or 'feedback', 'text').")
    df["review"] = df["review"].astype(str).fillna("")

    for col in ("car_model", "model", "Model", "vehicle_model", "car", "Car"):
        if col in df.columns:
            df.rename(columns={col: "car_model"}, inplace=True)
            break
    if "car_model" not in df.columns:
        df["car_model"] = "Unknown"
    df["car_model"] = df["car_model"].astype(str).fillna("Unknown")

    return df

def tag_sentiment(texts):
    nlp = pipeline("sentiment-analysis", model=SENT_MODEL, device=-1)
    preds = nlp(texts, truncation=True)
    return [d["label"].lower() if d["label"].lower() in ("positive","negative") else "neutral"
            for d in preds]

compiled_regex = {k: re.compile(v, flags=re.I) for k,v in ISSUE_PATTERNS.items()}

def extract_issues(text):
    found = [name for name,pat in compiled_regex.items() if pat.search(text)]
    return found or ["other"]

def mine_keyphrases(texts, top_n=20):
    kw_model = KeyBERT(model="all-MiniLM-L6-v2")
    corpus = " ".join(texts)
    return kw_model.extract_keywords(
        corpus, keyphrase_ngram_range=(1,3),
        stop_words="english", top_n=top_n
    )

def plot_sentiment_bar(df, path_png):
    ax = sns.countplot(x="sentiment", data=df,
                       order=["positive","negative","neutral"])
    ax.set_title("Sentiment distribution (all models)")
    ax.set_xlabel("")
    for p in ax.patches:
        ax.annotate(str(p.get_height()), (p.get_x()+0.25, p.get_height()+1))
    plt.tight_layout();  plt.savefig(path_png, dpi=160);  plt.close()

def issue_bar_by_sentiment(sub_df, model_name):
    fig, ax = plt.subplots(figsize=(8,4))
    sns.countplot(data=sub_df, x="issues", hue="sentiment",
                  order=ISSUE_ORDER, hue_order=["positive","negative","neutral"], ax=ax)
    ax.set_title(f"Issue counts – {model_name}")
    ax.set_xlabel("Issue category");  ax.set_ylabel("Count")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
    fig.tight_layout()
    return fig

def html_report(sent_ct, issue_ct, phrases, img_path):
    dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    parts = [f"<h1>Car-Rental Feedback Report</h1><p><em>{html.escape(dt)}</em></p>",
             "<h2>Overall sentiment</h2>",
             f'<img src="{html.escape(img_path)}" width="460">',
             "<ul>"]
    for k,v in sent_ct.items():
        parts.append(f"<li>{k.title()}: {v}</li>")
    parts.append("</ul><h2>Top complaint categories</h2><ul>")
    for iss,n in issue_ct.most_common():
        parts.append(f"<li>{iss.replace('_',' ').title()}: {n}</li>")
    parts.append("</ul>")
    if phrases:
        parts.append("<h2>Key phrases (KeyBERT)</h2><ol>")
        for ph,sc in phrases:
            parts.append(f"<li>{html.escape(ph)} <span style='color:#888'>({sc:.2f})</span></li>")
        parts.append("</ol>")
    return "\n".join(parts)

csv_path="car_rental_feedback.csv"
df = load_reviews(Path(csv_path))

print("🔍 Tagging sentiment…")
df["sentiment"] = tag_sentiment(df["review"].tolist())

print("🔍 Extracting issue categories…")
df["issues"] = df["review"].apply(extract_issues)
exploded = df.explode("issues")

phrases = []
if KEYBERT_OK:
    print("🔍 Mining key phrases (KeyBERT)…")
    phrases = mine_keyphrases(df["review"].tolist(), top_n=20)

sent_ct   = Counter(df["sentiment"])
issue_ct  = Counter(exploded["issues"])
per_model_sent = df.pivot_table(index="car_model",
                                columns="sentiment",
                                aggfunc="size",
                                fill_value=0)

ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
chart_png = f"sentiment_{ts}.png"
plot_sentiment_bar(df, chart_png)

html_out = f"summary_{ts}.html"
with open(html_out, "w", encoding="utf8") as fp:
    fp.write(html_report(sent_ct, issue_ct, phrases, chart_png))

csv_out = f"details_{ts}.csv"
df.to_csv(csv_out, index=False)

pdf_out = f"summary_{ts}.pdf"
with PdfPages(pdf_out) as pdf:
    fig = plt.figure(figsize=(8.3, 11.7))
    plt.axis("off")
    txt = f"Car-Rental Feedback Report\n\nGenerated {datetime.datetime.now():%Y-%m-%d %H:%M}\n\n" \
          f"Total reviews: {len(df)}\n" \
          f"Positive: {sent_ct['positive']}    " \
          f"Negative: {sent_ct['negative']}    " \
          f"Neutral: {sent_ct['neutral']}"
    plt.text(0.5, 0.6, txt, ha="center", va="center", fontsize=14)
    table_data = per_model_sent.reset_index().values.tolist()
    col_labels = ["Model", "Negative", "Neutral", "Positive"]
    table = plt.table(cellText=table_data, colLabels=col_labels,
                      loc="center", colWidths=[0.25,0.15,0.15,0.15])
    table.auto_set_font_size(False);  table.set_fontsize(8)
    table.scale(1, 1.5)
    pdf.savefig(fig);  plt.close()

    fig_img = plt.figure(figsize=(8, 4.5))
    plt.imshow(plt.imread(chart_png))
    plt.axis("off")
    pdf.savefig(fig_img, dpi=300)
    plt.close(fig_img)

    for model in df["car_model"].unique():
        sub = exploded[exploded["car_model"] == model]
        if sub.empty: continue
        fig = issue_bar_by_sentiment(sub, model)
        sc = per_model_sent.loc[model]
        txt2 = f"{model} – Positive: {sc.get('positive',0)}, " \
                f"Negative: {sc.get('negative',0)}, " \
                f"Neutral: {sc.get('neutral',0)}"
        fig.text(0.5, 1.02, txt2, ha="center", fontsize=10)
        pdf.savefig(fig);  plt.close(fig)

print("\n======== SUMMARY ========")
print(textwrap.indent(json.dumps(sent_ct, indent=2), "  "))
print("\nTop issues:")
for iss,n in issue_ct.most_common():
    print(f"  {iss:15s} {n}")
print("\nSentiment per model:")
print(per_model_sent)
if not KEYBERT_OK:
    print("\n(Key-phrase section skipped – install packages to enable.)")
print(f"\nHTML report  : {html_out}")
print(f"CSV details  : {csv_out}")
print(f"PDF summary  : {pdf_out}")
print("=========================")

🛈  KeyBERT (or its deps) not available – key-phrase mining disabled.
    pip install --upgrade keybert sentence-transformers transformers
🔍 Tagging sentiment…


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


🔍 Extracting issue categories…


  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right")
  ax.set_xticklabels(ax.get_xticklabels(), rotat


  {
    "positive": 226,
    "negative": 174
  }

Top issues:
  staff_behavior  266
  car_condition   152
  late_delivery   137
  cleanliness     133
  billing         132
  other           18

Sentiment per model:
sentiment         negative  positive
car_model                           
Audi A4                 11        16
BMW 3 Series            12        15
Chevrolet Malibu        10        16
Ford Focus              13        16
Honda Civic              9        10
Hyundai Elantra         12         9
Jeep Wrangler           11        17
Kia Optima              11        16
Mazda 3                 10        17
Mercedes C-Class        13        17
Nissan Altima            7        20
Subaru Impreza          12        14
Tesla Model 3           15        13
Toyota Corolla          13        12
Volkswagen Jetta        15        18

(Key-phrase section skipped – install packages to enable.)

HTML report  : summary_20250705_164814.html
CSV details  : details_20250705_164814.csv
PDF sum