In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fine-tune DistilBERT for Sentiment Analysis\n",
    "This notebook fine-tunes DistilBERT on IMDb movie reviews for binary sentiment classification."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup and Imports\n",
    "import os\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from datasets import load_dataset\n",
    "from transformers import (\n",
    "    AutoTokenizer, \n",
    "    TFAutoModelForSequenceClassification,\n",
    "    DataCollatorWithPadding\n",
    ")\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "\n",
    "# Seed NumPy and TensorFlow with one shared value so repeated runs are comparable\n",
    "SEED = 42\n",
    "np.random.seed(SEED)\n",
    "tf.random.set_seed(SEED)\n",
    "\n",
    "# Report the runtime environment: TensorFlow version and the visible GPU devices\n",
    "print(f\"TensorFlow version: {tf.__version__}\")\n",
    "gpu_devices = tf.config.list_physical_devices('GPU')\n",
    "print(f\"GPU available: {gpu_devices}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load IMDb Dataset\n",
    "print(\"Loading IMDb dataset...\")\n",
    "dataset = load_dataset(\"imdb\")\n",
    "\n",
    "# Shuffle each split with a fixed seed, then keep the first 2,500 rows\n",
    "# (10% of the 25k examples per split) so training stays fast\n",
    "subset_indices = range(2500)\n",
    "shuffled_train = dataset[\"train\"].shuffle(seed=42)\n",
    "shuffled_test = dataset[\"test\"].shuffle(seed=42)\n",
    "train_dataset = shuffled_train.select(subset_indices)\n",
    "test_dataset = shuffled_test.select(subset_indices)\n",
    "\n",
    "# Quick sanity check of the subset sizes and one raw example\n",
    "print(f\"Training samples: {len(train_dataset)}\")\n",
    "print(f\"Test samples: {len(test_dataset)}\")\n",
    "print(f\"Sample: {train_dataset[0]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Pre-trained Model and Tokenizer\n",
    "model_name = \"distilbert-base-uncased\"\n",
    "\n",
    "print(f\"Loading tokenizer and model: {model_name}\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "# num_labels=2 configures a two-class (negative/positive) classification head.\n",
    "# `from_tf` is intentionally not passed: it is a loading flag of the PyTorch model\n",
    "# classes, while the TF classes use `from_pt`. An unrecognised keyword is expected to\n",
    "# be forwarded to the Keras model constructor and rejected there.\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(\n",
    "    model_name,\n",
    "    num_labels=2,\n",
    ")\n",
    "\n",
    "print(f\"Model loaded with {model.num_parameters()} parameters\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize Dataset\n",
    "def tokenize_function(examples):\n",
    "    \"\"\"Tokenize the \"text\" field of a batch of examples.\n",
    "\n",
    "    Sequences are truncated to at most 512 tokens and left unpadded;\n",
    "    padding is applied later by the data collator.\n",
    "    \"\"\"\n",
    "    encoded = tokenizer(\n",
    "        examples[\"text\"],\n",
    "        max_length=512,\n",
    "        truncation=True,\n",
    "        padding=False,\n",
    "    )\n",
    "    return encoded\n",
    "\n",
    "print(\"Tokenizing datasets...\")\n",
    "# batched=True hands the function many examples per call\n",
    "tokenized_train = train_dataset.map(tokenize_function, batched=True)\n",
    "tokenized_test = test_dataset.map(tokenize_function, batched=True)\n",
    "print(\"Tokenization complete!\")\n",
    "\n",
    "first_tokenized = tokenized_train[0]\n",
    "print(f\"Sample tokenized: {first_tokenized}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert to TensorFlow Format\n",
    "# The collator pads tokenized examples and returns TensorFlow tensors\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors=\"tf\")\n",
    "\n",
    "print(\"Converting to TensorFlow datasets...\")\n",
    "\n",
    "# Options shared by both splits; only the shuffle flag differs\n",
    "batching_options = dict(batch_size=16, collate_fn=data_collator)\n",
    "\n",
    "# Training batches are shuffled\n",
    "tf_train_dataset = model.prepare_tf_dataset(tokenized_train, shuffle=True, **batching_options)\n",
    "\n",
    "# Test batches keep their original order\n",
    "tf_test_dataset = model.prepare_tf_dataset(tokenized_test, shuffle=False, **batching_options)\n",
    "\n",
    "print(\"TensorFlow datasets ready!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compile and Configure Model\n",
    "# Track accuracy over integer class labels\n",
    "metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]\n",
    "# from_logits=True: the loss is given raw logits, not probabilities\n",
    "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
    "# Adam with a 5e-5 learning rate\n",
    "optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)\n",
    "\n",
    "model.compile(\n",
    "    loss=loss,\n",
    "    optimizer=optimizer,\n",
    "    metrics=metrics,\n",
    ")\n",
    "print(\"Model compiled successfully!\")\n",
    "\n",
    "print(\"\\nModel Summary:\")\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fine-tune Model\n",
    "print(\"Starting fine-tuning...\")\n",
    "\n",
    "# Stop as soon as val_loss fails to improve for one epoch and roll the model\n",
    "# back to the best weights seen so far\n",
    "early_stopping = tf.keras.callbacks.EarlyStopping(\n",
    "    monitor='val_loss',\n",
    "    patience=1,\n",
    "    restore_best_weights=True,\n",
    ")\n",
    "callbacks = [early_stopping]\n",
    "\n",
    "# Train for up to two epochs.\n",
    "# NOTE(review): the test split doubles as validation data here, so the reported\n",
    "# test metrics are not from fully held-out data - confirm this is acceptable.\n",
    "fit_options = dict(\n",
    "    validation_data=tf_test_dataset,\n",
    "    epochs=2,\n",
    "    callbacks=callbacks,\n",
    "    verbose=1,\n",
    ")\n",
    "history = model.fit(tf_train_dataset, **fit_options)\n",
    "\n",
    "print(\"Training complete!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate Performance\n",
    "print(\"Evaluating model performance...\")\n",
    "\n",
    "# Get predictions: take the highest-scoring class for each test example\n",
    "predictions = model.predict(tf_test_dataset)\n",
    "predicted_labels = np.argmax(predictions.logits, axis=-1)\n",
    "\n",
    "# Get true labels straight from the tokenized test split rather than indexing\n",
    "# batches with batch['labels']: prepare_tf_dataset is expected to yield\n",
    "# (features, labels) tuples, which cannot be indexed by key. The test pipeline\n",
    "# is built with shuffle=False, so row order matches the predictions.\n",
    "true_labels = np.asarray(list(tokenized_test[\"label\"]))\n",
    "assert len(true_labels) == len(predicted_labels), (\n",
    "    f\"Label/prediction count mismatch: {len(true_labels)} vs {len(predicted_labels)}\"\n",
    ")\n",
    "\n",
    "# Calculate metrics\n",
    "accuracy = accuracy_score(true_labels, predicted_labels)\n",
    "print(f\"\\nTest Accuracy: {accuracy:.4f}\")\n",
    "\n",
    "print(\"\\nClassification Report:\")\n",
    "print(classification_report(true_labels, predicted_labels, target_names=['Negative', 'Positive']))\n",
    "\n",
    "# Display training history. The loop variables are prefixed so they do not\n",
    "# overwrite the `loss` object created in the compile cell.\n",
    "print(\"\\nTraining History:\")\n",
    "epoch_rows = zip(\n",
    "    history.history['loss'],\n",
    "    history.history['sparse_categorical_accuracy'],\n",
    "    history.history['val_loss'],\n",
    "    history.history['val_sparse_categorical_accuracy']\n",
    ")\n",
    "for epoch, (train_loss, train_acc, val_loss, val_acc) in enumerate(epoch_rows, start=1):\n",
    "    print(f\"Epoch {epoch}: loss={train_loss:.4f}, acc={train_acc:.4f}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save Model\n",
    "# Target directory, relative to the notebook's working directory\n",
    "save_path = \"../sentiment_model\"\n",
    "print(f\"Saving model to {save_path}...\")\n",
    "\n",
    "# Write the Keras model first, then the tokenizer files into the same directory\n",
    "model.save(save_path)\n",
    "tokenizer.save_pretrained(save_path)\n",
    "print(f\"Model and tokenizer saved successfully to {save_path}\")\n",
    "\n",
    "# List what ended up on disk as a quick sanity check\n",
    "saved_files = os.listdir(save_path)\n",
    "print(f\"Files in directory: {saved_files}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test Inference\n",
    "print(\"Testing inference with saved model...\")\n",
    "\n",
    "# Load the saved model\n",
    "loaded_model = tf.keras.models.load_model(save_path)\n",
    "print(\"Model loaded successfully!\")\n",
    "\n",
    "# Test predictions\n",
    "test_texts = [\n",
    "    \"This movie was absolutely fantastic! Great acting and storyline.\",\n",
    "    \"Terrible film, waste of time. Poor acting and boring plot.\",\n",
    "    \"It was okay, nothing special but not bad either.\"\n",
    "]\n",
    "\n",
    "for text in test_texts:\n",
    "    # Tokenize one review into TensorFlow tensors\n",
    "    inputs = tokenizer(\n",
    "        text,\n",
    "        max_length=512,\n",
    "        truncation=True,\n",
    "        padding=True,\n",
    "        return_tensors=\"tf\"\n",
    "    )\n",
    "\n",
    "    # Predict. The tokenizer output is passed as a plain dict of tensors, and the\n",
    "    # logits are read by key rather than by attribute: key access works for a\n",
    "    # Hugging Face output object and also for the plain dict that a model restored\n",
    "    # by tf.keras.models.load_model may return.\n",
    "    # NOTE(review): confirm which form your TF/transformers versions produce.\n",
    "    outputs = loaded_model(dict(inputs))\n",
    "    probabilities = tf.nn.softmax(outputs[\"logits\"], axis=-1).numpy()[0]\n",
    "    predicted_label = int(np.argmax(probabilities))\n",
    "    confidence = float(probabilities[predicted_label])\n",
    "\n",
    "    # Class index 1 is reported as positive, anything else as negative\n",
    "    sentiment = \"Positive\" if predicted_label == 1 else \"Negative\"\n",
    "    print(f\"\\nText: {text}\")\n",
    "    print(f\"Prediction: {sentiment} (confidence: {confidence:.4f})\")\n",
    "\n",
    "print(\"\\n✅ Model training and testing complete!\")\n",
    "print(\"🚀 You can now run the FastAPI server: uvicorn app.main:app --host 0.0.0.0 --port 8000\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}