In [None]:
```json
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "from datasets import Dataset, DatasetDict, load_metric\n",
    "from transformers import (\n",
    "    AutoTokenizer, \n",
    "    AutoModelForSeq2SeqLM, \n",
    "    Seq2SeqTrainingArguments, \n",
    "    Seq2SeqTrainer, \n",
    "    DataCollatorForSeq2Seq,\n",
    "    pipeline\n",
    ")\n",
    "import nltk\n",
    "import spacy\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import optuna\n",
    "from pathlib import Path\n",
    "\n",
    "# Make sure the NLTK sentence-tokenizer data is installed.\n",
    "# nltk.data.find() raises LookupError when a resource is missing, so that is the\n",
    "# exception that must trigger the download. Recent NLTK releases load 'punkt_tab'\n",
    "# in sent_tokenize(), older ones load 'punkt'; checking both covers either version.\n",
    "for _nltk_resource in ('punkt', 'punkt_tab'):\n",
    "    try:\n",
    "        nltk.data.find(f'tokenizers/{_nltk_resource}')\n",
    "    except LookupError:\n",
    "        nltk.download(_nltk_resource)\n",
    "\n",
    "# Load the spaCy English pipeline. The OSError handler covers the case where the\n",
    "# model package cannot be loaded: it is downloaded once and then loaded again.\n",
    "try:\n",
    "    nlp_spacy = spacy.load(\"en_core_web_sm\")\n",
    "except OSError:\n",
    "    print(\"Downloading spaCy en_core_web_sm model...\")\n",
    "    spacy.cli.download(\"en_core_web_sm\")\n",
    "    nlp_spacy = spacy.load(\"en_core_web_sm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Configuration ---\n",
    "# Every value a reader might tune is defined here and read by the later cells.\n",
    "MODEL_NAME = \"facebook/bart-large-cnn\" # Pre-trained summarization model\n",
    "TOKENIZER_NAME = \"facebook/bart-large-cnn\"\n",
    "OUTPUT_DIR = \"./contextual_convo_condenser_model\"\n",
    "LOGGING_DIR = f\"{OUTPUT_DIR}/logs\"\n",
    "TRAIN_BATCH_SIZE = 4\n",
    "EVAL_BATCH_SIZE = 4\n",
    "NUM_TRAIN_EPOCHS = 3\n",
    "LEARNING_RATE = 5e-5\n",
    "WEIGHT_DECAY = 0.01\n",
    "MAX_INPUT_LENGTH = 1024 # Token limit applied to transcripts during tokenization\n",
    "MAX_TARGET_LENGTH = 128 # Token limit applied to summaries (labels and generation)\n",
    "SEED = 42\n",
    "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "USE_DUMMY_DATA = True # Set to False to load real data\n",
    "DUMMY_DATA_SIZE = 100\n",
    "TEST_SPLIT_SIZE = 0.1 # Fraction of the full dataset held out for testing\n",
    "VALIDATION_SPLIT_SIZE = 0.1 # Fraction of the full dataset; the split cell divides it by (1 - TEST_SPLIT_SIZE) before splitting the remainder\n",
    "N_TRIALS_OPTUNA = 10 # Number of hyperparameter tuning trials\n",
    "\n",
    "# Create the output and logging directories up front (no error if they already exist).\n",
    "Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)\n",
    "Path(LOGGING_DIR).mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Placeholder Data Loading --- \n",
    "# In a real project, this function would load data from files (CSV, JSON, etc.)\n",
    "# It should return a pandas DataFrame with at least 'transcript' and 'summary' columns.\n",
    "def load_data(use_dummy=True, dummy_size=50):\n",
    "    \"\"\"Build the transcript/summary DataFrame used by the rest of the notebook.\n",
    "\n",
    "    Args:\n",
    "        use_dummy: when True, generate synthetic rows; otherwise return the\n",
    "            two-row placeholder that stands in for real loading code.\n",
    "        dummy_size: number of synthetic rows to generate (dummy mode only).\n",
    "\n",
    "    Returns:\n",
    "        pandas DataFrame with the columns 'transcript' and 'summary'.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the non-dummy frame lacks one of the two columns.\n",
    "    \"\"\"\n",
    "    if not use_dummy:\n",
    "        # Replace with actual data loading logic,\n",
    "        # e.g. pd.read_csv('path/to/your/transcripts.csv'), keeping the\n",
    "        # column names 'transcript' and 'summary'.\n",
    "        print(\"Loading real data (replace with actual implementation)...\")\n",
    "        frame = pd.DataFrame({\n",
    "            'transcript': [\"Real transcript data point 1...\", \"Real transcript data point 2...\"],\n",
    "            'summary': [\"Real summary 1...\", \"Real summary 2...\"],\n",
    "        })\n",
    "        if any(column not in frame.columns for column in ('transcript', 'summary')):\n",
    "            raise ValueError(\"Dataframe must contain 'transcript' and 'summary' columns\")\n",
    "        return frame\n",
    "\n",
    "    print(f\"Using dummy data with size {dummy_size}...\")\n",
    "    # Every synthetic row shares one transcript; only the summary carries the row number.\n",
    "    transcript_text = \"Speaker A: Hello team, let's discuss the Q3 project goals. Speaker B: Agreed. I think we should focus on user acquisition. Speaker A: Good point. Action item for John: Research acquisition channels by Friday. Speaker C: What about retention? Speaker A: Also important. Decision: We will allocate 60% resources to acquisition, 40% to retention. Any questions? Speaker B: No, sounds good. Vibe seems positive today.\"\n",
    "    transcripts = [transcript_text for _ in range(dummy_size)]\n",
    "    summaries = [\n",
    "        f\"Meeting Summary {row}: Key Decision: Allocate 60% resources to acquisition, 40% to retention. Action Item: John to research acquisition channels by Friday. Sentiment: Positive.\"\n",
    "        for row in range(dummy_size)\n",
    "    ]\n",
    "    return pd.DataFrame({'transcript': transcripts, 'summary': summaries})\n",
    "\n",
    "# Load the data\n",
    "raw_df = load_data(use_dummy=USE_DUMMY_DATA, dummy_size=DUMMY_DATA_SIZE)\n",
    "print(f\"Loaded data shape: {raw_df.shape}\")\n",
    "print(raw_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Data Splitting ---\n",
    "# Two-stage split: carve off the test set first, then take the validation set\n",
    "# out of what is left. Both calls use SEED as random_state.\n",
    "train_val_df, test_df = train_test_split(raw_df, test_size=TEST_SPLIT_SIZE, random_state=SEED)\n",
    "# Dividing by (1 - TEST_SPLIT_SIZE) rescales VALIDATION_SPLIT_SIZE so the validation\n",
    "# set is sized as that fraction of the full dataset rather than of the remainder.\n",
    "train_df, val_df = train_test_split(train_val_df, test_size=VALIDATION_SPLIT_SIZE / (1 - TEST_SPLIT_SIZE), random_state=SEED)\n",
    "\n",
    "print(f\"Train size: {len(train_df)}\")\n",
    "print(f\"Validation size: {len(val_df)}\")\n",
    "print(f\"Test size: {len(test_df)}\")\n",
    "\n",
    "# Convert pandas DataFrames to Hugging Face Datasets\n",
    "train_dataset = Dataset.from_pandas(train_df)\n",
    "val_dataset = Dataset.from_pandas(val_df)\n",
    "test_dataset = Dataset.from_pandas(test_df)\n",
    "\n",
    "# Bundle the three splits under the keys the later cells index by.\n",
    "raw_datasets = DatasetDict({\n",
    "    'train': train_dataset,\n",
    "    'validation': val_dataset,\n",
    "    'test': test_dataset\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Preprocessing & Tokenization ---\n",
    "tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    \"\"\"Tokenize a batch of transcripts and summaries for seq2seq training.\n",
    "\n",
    "    Args:\n",
    "        examples: batch mapping with 'transcript' and 'summary' entries.\n",
    "\n",
    "    Returns:\n",
    "        The tokenizer output for the transcripts (truncated/padded to\n",
    "        MAX_INPUT_LENGTH) with an added 'labels' entry holding the summary\n",
    "        token ids (truncated/padded to MAX_TARGET_LENGTH), where every pad\n",
    "        token id is replaced by -100.\n",
    "    \"\"\"\n",
    "    pad_id = tokenizer.pad_token_id\n",
    "    transcripts = list(examples[\"transcript\"])\n",
    "    encoded = tokenizer(transcripts, max_length=MAX_INPUT_LENGTH, truncation=True, padding=\"max_length\")\n",
    "\n",
    "    # Summaries are encoded in target mode.\n",
    "    with tokenizer.as_target_tokenizer():\n",
    "        target_encoding = tokenizer(examples[\"summary\"], max_length=MAX_TARGET_LENGTH, truncation=True, padding=\"max_length\")\n",
    "\n",
    "    # Padding positions become -100 so they are ignored in the loss.\n",
    "    encoded[\"labels\"] = [\n",
    "        [-100 if token_id == pad_id else token_id for token_id in sequence]\n",
    "        for sequence in target_encoding[\"input_ids\"]\n",
    "    ]\n",
    "    return encoded\n",
    "\n",
    "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets[\"train\"].column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Model Definition (Summarization) ---\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)\n",
    "\n",
    "# Data Collator\n",
    "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Evaluation Metrics ---\n",
    "rouge_metric = load_metric(\"rouge\")\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    \"\"\"Score generated summaries against the references with ROUGE.\n",
    "\n",
    "    Args:\n",
    "        eval_pred: (predictions, labels) pair of token-id arrays; predictions may\n",
    "            also arrive as a tuple whose first element holds the token ids.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each ROUGE key to its mid F-measure * 100, plus 'gen_len'\n",
    "        (mean number of non-pad prediction tokens), all rounded to 4 decimals.\n",
    "    \"\"\"\n",
    "    predictions, labels = eval_pred\n",
    "    # Keep only the generated token ids if extra outputs were returned alongside them.\n",
    "    if isinstance(predictions, tuple):\n",
    "        predictions = predictions[0]\n",
    "    # -100 marks ignored/padded positions and cannot be decoded, so swap it for the\n",
    "    # pad token in both arrays before decoding. Doing this for predictions also keeps\n",
    "    # the gen_len count below from treating -100 padding as real tokens.\n",
    "    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)\n",
    "    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n",
    "    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)\n",
    "    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
    "    \n",
    "    # Rouge expects a newline after each sentence \n",
    "    decoded_preds = [\"\\n\".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]\n",
    "    decoded_labels = [\"\\n\".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]\n",
    "    \n",
    "    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)\n",
    "    # Keep the mid F-measure of every ROUGE variant, scaled to 0-100\n",
    "    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}\n",
    "    \n",
    "    # Add mean generated length (non-pad tokens per prediction)\n",
    "    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]\n",
    "    result[\"gen_len\"] = np.mean(prediction_lens)\n",
    "    \n",
    "    return {k: round(v, 4) for k, v in result.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Training Arguments (Initial) ---\n",
    "# Baseline arguments built from the configuration constants. The same object is\n",
    "# passed to the tuning trainer and, after being updated with the best\n",
    "# hyperparameters, to the final trainer.\n",
    "training_args = Seq2SeqTrainingArguments(\n",
    "    output_dir=OUTPUT_DIR,\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    learning_rate=LEARNING_RATE,\n",
    "    per_device_train_batch_size=TRAIN_BATCH_SIZE,\n",
    "    per_device_eval_batch_size=EVAL_BATCH_SIZE,\n",
    "    weight_decay=WEIGHT_DECAY,\n",
    "    save_total_limit=3,\n",
    "    num_train_epochs=NUM_TRAIN_EPOCHS,\n",
    "    predict_with_generate=True,\n",
    "    fp16=torch.cuda.is_available(), # Enable mixed precision if GPU is available\n",
    "    logging_dir=LOGGING_DIR,\n",
    "    logging_steps=10,\n",
    "    save_strategy=\"epoch\", # Same cadence as evaluation_strategy above\n",
    "    load_best_model_at_end=True,\n",
    "    metric_for_best_model=\"rouge2\", # Choose metric to optimize for\n",
    "    greater_is_better=True,\n",
    "    report_to=\"tensorboard\", # Can integrate wandb or others\n",
    "    seed=SEED,\n",
    "    generation_max_length=MAX_TARGET_LENGTH,\n",
    "    generation_num_beams=4 # Add beam search for better generation during evaluation\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Hyperparameter Tuning (Optuna) ---\n",
    "\n",
    "def model_init(trial):\n",
    "    \"\"\"Return a freshly loaded MODEL_NAME model placed on DEVICE.\n",
    "\n",
    "    The trial argument is accepted but not used; reloading the base weights on\n",
    "    every call keeps one trial's training from leaking into the next.\n",
    "    \"\"\"\n",
    "    fresh_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)\n",
    "    return fresh_model.to(DEVICE)\n",

    "def optuna_hp_space(trial):\n",
    "    \"\"\"Sample one hyperparameter configuration from the given trial.\n",
    "\n",
    "    Returns a dict keyed by training-argument name: learning rate (log scale),\n",
    "    epoch count, train batch size, weight decay and generation beam count.\n",
    "    \"\"\"\n",
    "    space = {}\n",
    "    space[\"learning_rate\"] = trial.suggest_float(\"learning_rate\", 1e-6, 1e-4, log=True)\n",
    "    space[\"num_train_epochs\"] = trial.suggest_int(\"num_train_epochs\", 1, 5)\n",
    "    space[\"per_device_train_batch_size\"] = trial.suggest_categorical(\"per_device_train_batch_size\", [4, 8])\n",
    "    space[\"weight_decay\"] = trial.suggest_float(\"weight_decay\", 0.0, 0.1)\n",
    "    space[\"generation_num_beams\"] = trial.suggest_int(\"generation_num_beams\", 2, 6)\n",
    "    return space\n",
    "\n",
    "# One trainer drives the whole search: it is built without a model and given\n",
    "# model_init instead, and it starts from the base training_args, which the\n",
    "# values sampled in optuna_hp_space (including the batch size) override.\n",
    "\n",
    "trainer_for_tuning = Seq2SeqTrainer(\n",
    "    model=None, # Model will be initialized by model_init\n",
    "    args=training_args, # Use base training args, Optuna will override some\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"validation\"],\n",
    "    tokenizer=tokenizer,\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    "    model_init=model_init,\n",
    ")\n",
    "\n",
    "print(\"\\n--- Starting Hyperparameter Search ---\")\n",
    "# Maximize validation ROUGE-2 over N_TRIALS_OPTUNA trials.\n",
    "best_run = trainer_for_tuning.hyperparameter_search(\n",
    "    direction=\"maximize\",\n",
    "    backend=\"optuna\",\n",
    "    hp_space=optuna_hp_space,\n",
    "    n_trials=N_TRIALS_OPTUNA,\n",
    "    compute_objective=lambda metrics: metrics[\"eval_rouge2\"], # Objective to maximize\n",
    "    # You might need to adjust resources (n_jobs) depending on your setup\n",
    ")\n",
    "\n",
    "print(\"--- Hyperparameter Search Finished ---\")\n",
    "print(f\"Best run details: {best_run}\")\n",
    "\n",
    "# Extract best hyperparameters (consumed by the final-training cell)\n",
    "best_hyperparameters = best_run.hyperparameters\n",
    "print(\"\\nBest Hyperparameters Found:\")\n",
    "print(json.dumps(best_hyperparameters, indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Train Final Model with Best Hyperparameters ---\n",
    "print(\"\\n--- Training Final Model with Best Hyperparameters ---\")\n",
    "\n",
    "# Update training arguments with best hyperparameters\n",
    "# (this mutates the shared training_args object in place, one attribute per key).\n",
    "for param, value in best_hyperparameters.items():\n",
    "    setattr(training_args, param, value)\n",
    "\n",
    "# Ensure output_dir is set correctly after potential modification by Optuna\n",
    "training_args.output_dir = OUTPUT_DIR \n",
    "training_args.logging_dir = LOGGING_DIR\n",
    "\n",
    "# Re-initialize model and trainer with best hyperparameters\n",
    "# (training starts again from the pre-trained MODEL_NAME weights).\n",
    "final_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)\n",
    "\n",
    "final_trainer = Seq2SeqTrainer(\n",
    "    model=final_model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"validation\"],\n",
    "    tokenizer=tokenizer,\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    ")\n",
    "\n",
    "# Train the model\n",
    "train_result = final_trainer.train()\n",
    "\n",
    "print(\"--- Final Model Training Finished ---\")\n",
    "\n",
    "# Save training metrics\n",
    "metrics = train_result.metrics\n",
    "final_trainer.log_metrics(\"train\", metrics)\n",
    "final_trainer.save_metrics(\"train\", metrics)\n",
    "\n",
    "# Save the final model and tokenizer\n",
    "# (both go to OUTPUT_DIR, which the demonstration cell loads its summarizer from).\n",
    "final_trainer.save_model(OUTPUT_DIR)\n",
    "tokenizer.save_pretrained(OUTPUT_DIR)\n",
    "\n",
    "print(f\"\\nFinal model saved to {OUTPUT_DIR}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Evaluate Final Model on Test Set ---\n",
    "print(\"\\n--- Evaluating Final Model on Test Set ---\")\n",
    "\n",
    "# Report the metrics under a \"test_\" prefix instead of the default \"eval_\".\n",
    "# With the default prefix the test scores are logged with the same keys as the\n",
    "# per-epoch validation scores, so they are indistinguishable from validation\n",
    "# entries in the trainer's log history and in the saved metrics.\n",
    "test_results = final_trainer.evaluate(\n",
    "    eval_dataset=tokenized_datasets[\"test\"],\n",
    "    metric_key_prefix=\"test\",\n",
    ")\n",
    "\n",
    "final_trainer.log_metrics(\"test\", test_results)\n",
    "final_trainer.save_metrics(\"test\", test_results)\n",
    "\n",
    "print(\"Test Set Evaluation Results:\")\n",
    "print(json.dumps(test_results, indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Demonstrate NER and Sentiment Analysis Integration ---\n",
    "print(\"\\n--- Demonstrating NER and Sentiment Analysis ---\")\n",
    "\n",
    "# Load the fine-tuned summarization model from OUTPUT_DIR\n",
    "# (device index 0 when CUDA is available, -1 otherwise).\n",
    "summarizer = pipeline(\"summarization\", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR, device=0 if torch.cuda.is_available() else -1)\n",
    "\n",
    "# Load a sentiment analysis pipeline (no model is named, so the library default is used)\n",
    "sentiment_analyzer = pipeline(\"sentiment-analysis\", device=0 if torch.cuda.is_available() else -1)\n",
    "\n",
    "# Example transcript for demonstration\n",
    "sample_transcript = \"\"\"\n",
    "Alice: Okay team, project Phoenix deadline is approaching. We need to finalize the deployment plan.\n",
    "Bob: I agree. I've drafted the initial steps. We need DevOps to review the server configurations by Wednesday.\n",
    "Charlie: I can coordinate with DevOps. Action Item for Charlie: Follow up with DevOps on server configs.\n",
    "Alice: Great. Also, Marketing needs the final feature list. Decision: Feature set X and Y are confirmed for launch.\n",
    "Bob: Sounds good. I'm a bit concerned about the integration testing timeline though.\n",
    "Alice: Let's schedule a separate meeting for that. For now, let's stick to the deployment plan. Vibe check: Seems focused but slightly stressed.\n",
    "Charlie: Agreed on the vibe. But we can manage.\n",
    "\"\"\"\n",
    "\n",
    "# 1. Generate Summary\n",
    "# The pipeline returns a list with one dict per input; take the text of the first.\n",
    "summary = summarizer(sample_transcript, max_length=MAX_TARGET_LENGTH, min_length=30, do_sample=False)[0]['summary_text']\n",
    "print(f\"\\nGenerated Summary:\\n{summary}\")\n",
    "\n",
    "# 2. Perform NER using spaCy\n",
    "doc = nlp_spacy(sample_transcript)\n",
    "print(\"\\nNamed Entities:\")\n",
    "# Keep only the entity types that matter for a meeting recap.\n",
    "entities = [\n",
    "    (ent.text, ent.label_)\n",
    "    for ent in doc.ents\n",
    "    if ent.label_ in (\"PERSON\", \"ORG\", \"DATE\", \"GPE\")\n",
    "]\n",
    "for entity_text, entity_label in entities:\n",
    "    print(f\"- {entity_text} ({entity_label})\")\n",
    "\n",
    "# Simple Action Item Extraction (Rule-based example):\n",
    "# any sentence mentioning \"action item\" (case-insensitive) is kept.\n",
    "action_items = [\n",
    "    sent.text.strip()\n",
    "    for sent in doc.sents\n",
    "    if \"action item\" in sent.text.lower()\n",
    "]\n",
    "print(\"\\nPotential Action Items:\")\n",
    "for action_text in action_items:\n",
    "    print(f\"- {action_text}\")\n",
    "\n",
    "# 3. Perform Sentiment Analysis (Overall)\n",
    "# For time segments, you'd split the transcript and run sentiment on each part\n",
    "# Note: [:1024] slices characters of the string, not tokens.\n",
    "overall_sentiment = sentiment_analyzer(sample_transcript[:1024]) # Limit input size for standard models\n",
    "print(f\"\\nOverall Sentiment:\\n{overall_sentiment}\")\n",
    "\n",
    "# Example: Sentiment over time (simple split)\n",
    "print(\"\\nSentiment 'Vibe Check' (Segmented):\")\n",
    "sentences = nltk.sent_tokenize(sample_transcript)\n",
    "num_segments = 3\n",
    "# Integer division: any leftover sentences are absorbed by the last segment below.\n",
    "segment_len = len(sentences) // num_segments\n",
    "for i in range(num_segments):\n",
    "    start = i * segment_len\n",
    "    end = (i + 1) * segment_len if i < num_segments - 1 else len(sentences)\n",
    "    segment_text = \" \".join(sentences[start:end])\n",
    "    # Empty segments (fewer sentences than segments) are skipped without output.\n",
    "    if segment_text:\n",
    "        segment_sentiment = sentiment_analyzer(segment_text[:1024])\n",
    "        print(f\"- Segment {i+1}: {segment_sentiment}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Results Visualization ---\n",
    "print(\"\\n--- Visualizing Results ---\")\n",
    "\n",
    "# Plot training history (Loss)\n",
    "try:\n",
    "    log_history = final_trainer.state.log_history\n",
    "\n",
    "    # Entries holding 'loss' are training logs; entries holding 'eval_loss' are evaluations.\n",
    "    train_logs = [entry for entry in log_history if 'loss' in entry]\n",
    "    eval_logs = [entry for entry in log_history if 'eval_loss' in entry]\n",
    "\n",
    "    train_steps = [entry['step'] for entry in train_logs]\n",
    "    train_loss = [entry['loss'] for entry in train_logs]\n",
    "\n",
    "    eval_steps = [entry['step'] for entry in eval_logs]\n",
    "    eval_loss = [entry['eval_loss'] for entry in eval_logs]\n",
    "    eval_rouge2 = [entry['eval_rouge2'] for entry in eval_logs]\n",
    "\n",
    "    # Style is set before the axes are created so both panels pick it up.\n",
    "    sns.set_style(\"whitegrid\")\n",
    "    fig, (loss_ax, rouge_ax) = plt.subplots(1, 2, figsize=(12, 6))\n",
    "\n",
    "    # Left panel: training vs. validation loss.\n",
    "    loss_ax.plot(train_steps, train_loss, label='Training Loss')\n",
    "    loss_ax.plot(eval_steps, eval_loss, label='Validation Loss', marker='o')\n",
    "    loss_ax.set_title('Training and Validation Loss')\n",
    "    loss_ax.set_xlabel('Steps')\n",
    "    loss_ax.set_ylabel('Loss')\n",
    "    loss_ax.legend()\n",
    "\n",
    "    # Right panel: validation ROUGE-2 per evaluation.\n",
    "    rouge_ax.plot(eval_steps, eval_rouge2, label='Validation ROUGE-2', marker='o', color='green')\n",
    "    rouge_ax.set_title('Validation ROUGE-2 Score')\n",
    "    rouge_ax.set_xlabel('Steps')\n",
    "    rouge_ax.set_ylabel('ROUGE-2 F1 Score')\n",
    "    rouge_ax.legend()\n",
    "\n",
    "    fig.tight_layout()\n",
    "    fig.savefig(os.path.join(OUTPUT_DIR, \"training_plots.png\"))\n",
    "    plt.show()\n",
    "    print(f\"Training plots saved to {os.path.join(OUTPUT_DIR, 'training_plots.png')}\")\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"Could not plot training history: {e}\")\n",