In [None]:
```json
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Operator Assistant - Data Exploration"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Setup and Data Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
    "from scipy import stats\n",
    "import random\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "# Notebook-wide display settings: seaborn grid theme and wider pandas output.\n",
    "sns.set_theme(style='whitegrid')\n",
    "pd.options.display.max_columns = 50\n",
    "pd.options.display.width = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simulate data as no dataset was provided\n",
    "def generate_synthetic_data(num_records=1000):\n",
    "    data = []\n",
    "    intents = ['file_create', 'file_list', 'file_delete', 'web_search', 'other', 'greeting']\n",
    "    base_commands = {\n",
    "        'file_create': ['create file {name}.txt', 'make a new document called {name}', 'generate {name}.docx'],\n",
    "        'file_list': ['list files in documents', 'show me my downloads', 'what files are on the desktop?', 'list directory {dir}'],\n",
    "        'file_delete': ['delete {name}.txt', 'remove the file {name}.log', 'get rid of {name}.tmp', 'erase {name}.pdf'],\n",
    "        'web_search': ['search for {query}', 'what is {query}?', 'find information about {query}', 'look up {query} on the web'],\n",
    "        'other': ['what time is it?', 'set a timer for 5 minutes', 'open calculator', 'tell me a joke'],\n",
    "        'greeting': ['hello assistant', 'good morning', 'hey operator', 'hi there']\n",
    "    }\n",
    "    file_names = ['report', 'notes', 'image', 'backup', 'project_data', 'meeting_minutes', 'draft']\n",
    "    file_ext = ['txt', 'docx', 'pdf', 'log', 'tmp', 'jpg', 'png']\n",
    "    search_queries = ['python programming', 'weather today', 'best restaurants near me', 'jupyter notebooks', 'ai assistants', 'natural language processing']\n",
    "    dirs = ['documents', 'downloads', 'desktop', 'pictures', '/var/log']\n",
    "    user_ids = [f'user_{i:03d}' for i in range(1, 21)]\n",
    "    start_date = datetime.now() - timedelta(days=30)\n",
    "\n",
    "    for i in range(num_records):\n",
    "        intent = random.choice(intents)\n",
    "        user_id = random.choice(user_ids)\n",
    "        timestamp = start_date + timedelta(seconds=random.randint(0, 30*24*60*60))\n",
    "        \n",
    "        command_template = random.choice(base_commands[intent])\n",
    "        command_text = command_template\n",
    "        if '{name}' in command_template:\n",
    "            name = random.choice(file_names)\n",
    "            ext = random.choice(file_ext)\n",
    "            command_text = command_template.format(name=f'{name}_{i%10}.{ext}')\n",
    "        elif '{query}' in command_template:\n",
    "            query = random.choice(search_queries)\n",
    "            command_text = command_template.format(query=query)\n",
    "        elif '{dir}' in command_template:\n",
    "             dir_name = random.choice(dirs)\n",
    "             command_text = command_template.format(dir=dir_name)\n",
    "            \n",
    "        response_time_ms = max(50, int(np.random.normal(loc=500, scale=300)) + len(command_text) * 5)\n",
    "        success_prob = 0.9 if intent != 'other' else 0.98\n",
    "        success = np.random.rand() < success_prob\n",
    "        \n",
    "        confirmation_required = False\n",
    "        confirmation_given = np.nan\n",
    "        if intent == 'file_delete':\n",
    "            confirmation_required = True\n",
    "            if success: # Only ask for confirmation if the initial step might proceed\n",
    "                 # Simulate user confirmation (more likely to confirm than not)\n",
    "                 confirmation_given = np.random.rand() < 0.85 \n",
    "                 success = confirmation_given # Final success depends on confirmation\n",
    "            else:\n",
    "                 confirmation_given = False # If initial check failed (e.g. file not found), no confirmation needed/given\n",
    "                 \n",
    "        data.append({\n",
    "            'timestamp': timestamp,\n",
    "            'user_id': user_id,\n",
    "            'command_text': command_text,\n",
    "            'intent': intent,\n",
    "            'response_time_ms': response_time_ms,\n",
    "            'success': success,\n",
    "            'confirmation_required': confirmation_required,\n",
    "            'confirmation_given': confirmation_given\n",
    "        })\n",
    "        \n",
    "    df = pd.DataFrame(data)\n",
    "    df['timestamp'] = pd.to_datetime(df['timestamp'])\n",
    "    # Ensure boolean types where appropriate, handling NaN for confirmation_given\n",
    "    df['success'] = df['success'].astype(bool)\n",
    "    df['confirmation_required'] = df['confirmation_required'].astype(bool)\n",
    "    # Convert confirmation_given to nullable boolean\n",
    "    df['confirmation_given'] = df['confirmation_given'].map({True: True, False: False, np.nan: pd.NA}).astype('boolean')\n",
    "    return df.sort_values(by='timestamp').reset_index(drop=True)\n",
    "\n",
    "df = generate_synthetic_data(num_records=1500)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.1 Data Inspection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peek at the earliest records to sanity-check the generated columns.\n",
    "print(\"First 5 rows:\")\n",
    "display(df.head(n=5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summarise column dtypes and non-null counts (info() prints directly, so no display()).\n",
    "print(\"\\nData Info:\")\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-column count of missing entries.\n",
    "print(\"\\nMissing Values:\")\n",
    "missing_per_column = df.isna().sum()\n",
    "print(missing_per_column)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics using describe()'s default column selection.\n",
    "print(\"\\nDescriptive Statistics (Numerical):\")\n",
    "numeric_summary = df.describe()\n",
    "display(numeric_summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics restricted to text, boolean and timestamp columns.\n",
    "print(\"\\nDescriptive Statistics (Categorical/Object):\")\n",
    "non_numeric_dtypes = ['object', 'boolean', 'datetime64[ns]']\n",
    "display(df.describe(include=non_numeric_dtypes))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Exploratory Data Analysis (EDA)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.1 Intent Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Horizontal bar chart of intent frequency, most common intent first.\n",
    "intent_order = df['intent'].value_counts().index\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "sns.countplot(data=df, y='intent', order=intent_order, palette='viridis', ax=ax)\n",
    "ax.set_title('Distribution of Command Intents')\n",
    "ax.set_xlabel('Frequency')\n",
    "ax.set_ylabel('Intent')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.2 Response Time Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Left: overall response-time distribution. Right: the same metric split by intent.\n",
    "fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "sns.histplot(df['response_time_ms'], kde=True, bins=30, ax=ax_hist)\n",
    "ax_hist.set_title('Distribution of Response Time (ms)')\n",
    "ax_hist.set_xlabel('Response Time (ms)')\n",
    "ax_hist.set_ylabel('Frequency')\n",
    "\n",
    "sns.boxplot(data=df, x='intent', y='response_time_ms', palette='viridis', ax=ax_box)\n",
    "ax_box.set_title('Response Time by Intent')\n",
    "ax_box.set_xlabel('Intent')\n",
    "ax_box.set_ylabel('Response Time (ms)')\n",
    "# Tilt the intent labels so they do not overlap.\n",
    "plt.setp(ax_box.get_xticklabels(), rotation=45, ha='right')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.3 Success Rate Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean of the boolean `success` flag per intent, i.e. the success rate, highest first.\n",
    "success_rate = df.groupby('intent')['success'].mean().sort_values(ascending=False)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "sns.barplot(x=success_rate.index, y=success_rate.values, palette='viridis', ax=ax)\n",
    "ax.set_title('Success Rate per Intent')\n",
    "ax.set_xlabel('Intent')\n",
    "ax.set_ylabel('Success Rate')\n",
    "plt.setp(ax.get_xticklabels(), rotation=45, ha='right')\n",
    "ax.set_ylim(0, 1)\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.4 Command Length Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Character count of each command, stored on df for later sections.\n",
    "df['command_length'] = df['command_text'].str.len()\n",
    "\n",
    "fig, (ax_len, ax_scatter) = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "sns.histplot(df['command_length'], kde=True, bins=20, ax=ax_len)\n",
    "ax_len.set_title('Distribution of Command Length')\n",
    "ax_len.set_xlabel('Command Length (characters)')\n",
    "ax_len.set_ylabel('Frequency')\n",
    "\n",
    "sns.scatterplot(data=df, x='command_length', y='response_time_ms', alpha=0.5, ax=ax_scatter)\n",
    "ax_scatter.set_title('Response Time vs. Command Length')\n",
    "ax_scatter.set_xlabel('Command Length (characters)')\n",
    "ax_scatter.set_ylabel('Response Time (ms)')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.5 Confirmation Analysis (for 'file_delete')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on a copy of the deletion commands so the helper column never touches df.\n",
    "delete_df = df.loc[df['intent'] == 'file_delete'].copy()\n",
    "\n",
    "if delete_df.empty:\n",
    "    print(\"No 'file_delete' intents found in the dataset.\")\n",
    "else:\n",
    "    # String labels let countplot treat a missing confirmation as its own category.\n",
    "    delete_df['confirmation_given_str'] = delete_df['confirmation_given'].astype(str)\n",
    "\n",
    "    fig = plt.figure(figsize=(12, 5))\n",
    "\n",
    "    ax_counts = fig.add_subplot(1, 2, 1)\n",
    "    sns.countplot(data=delete_df, x='confirmation_given_str', hue='success', palette='coolwarm', ax=ax_counts)\n",
    "    ax_counts.set_title('Confirmation Given vs. Final Success (for file_delete)')\n",
    "    ax_counts.set_xlabel('Confirmation Given (True/False/<NA>)')\n",
    "    ax_counts.set_ylabel('Count')\n",
    "    ax_counts.legend(title='Success')\n",
    "\n",
    "    # Tabulate every observed combination, keeping rows with missing values.\n",
    "    print(\"\\nConfirmation Status for 'file_delete' intent:\")\n",
    "    status_cols = ['confirmation_required', 'confirmation_given', 'success']\n",
    "    print(delete_df[status_cols].value_counts(dropna=False))\n",
    "\n",
    "    # The heatmap only covers rows where a confirmation answer was recorded.\n",
    "    confirmed_subset = delete_df.dropna(subset=['confirmation_given'])\n",
    "    if confirmed_subset.empty:\n",
    "        print(\"\\nNo records where confirmation was given/denied (all NA or empty subset).\")\n",
    "    else:\n",
    "        ct = pd.crosstab(confirmed_subset['confirmation_given'], confirmed_subset['success'])\n",
    "        ax_heat = fig.add_subplot(1, 2, 2)\n",
    "        sns.heatmap(ct, annot=True, fmt='d', cmap='coolwarm', ax=ax_heat)\n",
    "        ax_heat.set_title('Heatmap: Confirmation Given vs Success')\n",
    "        ax_heat.set_xlabel('Success')\n",
    "        ax_heat.set_ylabel('Confirmation Given')\n",
    "\n",
    "    fig.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.6 Activity Over Time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Daily command volume. Resampling with on='timestamp' reads the column directly,\n",
    "# so df is never re-indexed in place: the cell is safe to re-run and cannot leave\n",
    "# df without its 'timestamp' column if plotting fails part-way through.\n",
    "daily_commands = df.resample('D', on='timestamp')['command_text'].count()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(14, 6))\n",
    "daily_commands.plot(ax=ax)\n",
    "ax.set_title('Number of Commands Per Day')\n",
    "ax.set_xlabel('Date')\n",
    "ax.set_ylabel('Number of Commands')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Statistical Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.1 Correlation Analysis (Numerical Features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise correlation (DataFrame.corr defaults) between the numeric features.\n",
    "numerical_cols = ['response_time_ms', 'command_length']\n",
    "correlation_matrix = df.loc[:, numerical_cols].corr()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(6, 4))\n",
    "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)\n",
    "ax.set_title('Correlation Matrix of Numerical Features')\n",
    "plt.show()\n",
    "\n",
    "print(\"\\nCorrelation Matrix:\")\n",
    "display(correlation_matrix)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.2 ANOVA: Response Time across Intents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-way ANOVA: does mean response time differ between intents?\n",
    "intents_list = df['intent'].unique()\n",
    "grouped_data = [df.loc[df['intent'] == intent, 'response_time_ms'] for intent in intents_list]\n",
    "\n",
    "if len(intents_list) <= 1:\n",
    "    print(\"\\nANOVA requires more than one group (intent) to compare.\")\n",
    "else:\n",
    "    f_val, p_val = stats.f_oneway(*grouped_data)\n",
    "    print(f\"\\nANOVA test for response_time_ms across intents:\")\n",
    "    print(f\"F-statistic: {f_val:.4f}\")\n",
    "    print(f\"P-value: {p_val:.4g}\")\n",
    "    # Conventional 5% significance threshold.\n",
    "    if p_val < 0.05:\n",
    "        print(\"Result: There is a statistically significant difference in mean response times between at least two intents (p < 0.05).\")\n",
    "    else:\n",
    "        print(\"Result: There is no statistically significant difference in mean response times across intents (p >= 0.05).\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3 Chi-Squared Test: Intent vs. Success"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chi-squared test of independence between intent and the success flag.\n",
    "contingency_table = pd.crosstab(df['intent'], df['success'])\n",
    "\n",
    "print(\"\\nContingency Table (Intent vs. Success):\")\n",
    "display(contingency_table)\n",
    "\n",
    "n_intents, n_outcomes = contingency_table.shape\n",
    "if n_intents <= 1 or n_outcomes <= 1:\n",
    "    # The test needs at least a 2x2 table.\n",
    "    print(\"\\nChi-Squared test requires at least 2 intents and 2 outcomes (success/fail).\")\n",
    "else:\n",
    "    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)\n",
    "    print(f\"\\nChi-Squared Test for Intent vs. Success:\")\n",
    "    print(f\"Chi2 Statistic: {chi2:.4f}\")\n",
    "    print(f\"P-value: {p:.4g}\")\n",
    "    print(f\"Degrees of Freedom: {dof}\")\n",
    "    # `expected` holds the expected frequencies under independence; inspect it if needed.\n",
    "    if p < 0.05:\n",
    "        print(\"Result: There is a statistically significant association between command intent and success (p < 0.05).\")\n",
    "    else:\n",
    "        print(\"Result: There is no statistically significant association between command intent and success (p >= 0.05).\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Feature Engineering Experiments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'command_length' already exists from the EDA section; add a whitespace-token count.\n",
    "df['word_count'] = df['command_text'].str.split().str.len()\n",
    "\n",
    "# Case-insensitive keyword flags, one regex per intent family.\n",
    "keyword_patterns = {\n",
    "    'has_delete': 'delete|remove|erase|get rid of',\n",
    "    'has_create': 'create|make|generate',\n",
    "    'has_list': 'list|show me|what files',\n",
    "    'has_search': 'search|what is|find|look up',\n",
    "}\n",
    "for flag_col, pattern in keyword_patterns.items():\n",
    "    df[flag_col] = df['command_text'].str.contains(pattern, case=False, regex=True)\n",
    "\n",
    "print(\"DataFrame with new features:\")\n",
    "preview_cols = ['command_text', 'command_length', 'word_count', *keyword_patterns]\n",
    "display(df[preview_cols].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Histogram (with KDE overlay) of the engineered word_count feature.\n",
    "print(\"\\nWord Count Distribution:\")\n",
    "fig, ax = plt.subplots(figsize=(8, 4))\n",
    "sns.histplot(df['word_count'], bins=15, kde=True, ax=ax)\n",
    "ax.set_title('Distribution of Word Count in Commands')\n",
    "ax.set_xlabel('Word Count')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summing a boolean flag column counts the commands that matched its keywords.\n",
    "print(\"\\nKeyword Feature Counts:\")\n",
    "keyword_cols = ['has_delete', 'has_create', 'has_list', 'has_search']\n",
    "keyword_totals = df[keyword_cols].sum()\n",
    "print(keyword_totals)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Initial Model Testing (Intent Classification)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.1 Prepare Data for Modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features: command_text\n",
    "# Target: intent\n",
    "X = df['command_text']\n",
    "y = df['intent']\n",
    "\n",
    "# Split data\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)\n",
    "\n",
    "print(f\"Training set size: {len(X_train)}\")\n",
    "print(f\"Test set size: {len(X_test)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.2 Feature Extraction (TF-IDF)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the vocabulary on the training split only, then reuse it for the test split.\n",
    "vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)\n",
    "vectorizer.fit(X_train)\n",
    "\n",
    "X_train_tfidf = vectorizer.transform(X_train)\n",
    "X_test_tfidf = vectorizer.transform(X_test)\n",
    "\n",
    "print(f\"Shape of TF-IDF matrix (Train): {X_train_tfidf.shape}\")\n",
    "print(f\"Shape of TF-IDF matrix (Test): {X_test_tfidf.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.3 Train and Evaluate Naive Bayes Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit a multinomial Naive Bayes classifier on the TF-IDF features and score it on the test split.\n",
    "nb_model = MultinomialNB()\n",
    "nb_model.fit(X_train_tfidf, y_train)\n",
    "y_pred_nb = nb_model.predict(X_test_tfidf)\n",
    "\n",
    "accuracy_nb = accuracy_score(y_test, y_pred_nb)\n",
    "print(f\"Naive Bayes Accuracy: {accuracy_nb:.4f}\\n\")\n",
    "\n",
    "print(\"Naive Bayes Classification Report:\")\n",
    "print(classification_report(y_test, y_pred_nb))\n",
    "\n",
    "print(\"Naive Bayes Confusion Matrix:\")\n",
    "# Use the model's own class order so matrix rows/columns line up with the tick labels.\n",
    "class_labels = nb_model.classes_\n",
    "cm_nb = confusion_matrix(y_test, y_pred_nb, labels=class_labels)\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "sns.heatmap(cm_nb, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels, ax=ax)\n",
    "ax.set_title('Naive Bayes Confusion Matrix')\n",
    "ax.set_xlabel('Predicted Label')\n",
    "ax.set_ylabel('True Label')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.4 Train and Evaluate Logistic Regression Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "logreg_model = LogisticRegression(max_iter=1000, random_state=42)\n