# Operator Assistant - Model Training

## 1. Imports

In [None]:
import os
import json
import random
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Other project-related libraries (might not be used directly in training but are part of the project context)
import speech_recognition as sr 
import pyttsx3
import shutil
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import sounddevice # or import pyaudio

# Read variables from a local .env file, if one exists (reserved for API keys
# that later steps may need; nothing in training reads them yet).
load_dotenv()

# Seed the stdlib and NumPy global generators with one shared value so that
# repeated runs are reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

## 2. Data Loading and Preparation

In [None]:
# Simulate loading data from a project module or file
# In a real scenario, this would load data from a CSV, JSON, or database
def load_simulated_data():
    """Return a small hand-written DataFrame of example commands.

    The frame has two columns: ``text`` (the command utterance) and
    ``intent`` (its label). Rows are grouped by intent, in the order the
    groups are listed below.
    """
    examples_by_intent = [
        # File Creation
        ('create_file', [
            "create a new file named report.txt",
            "make a document called notes",
            "generate an empty file project_plan.docx",
            "new text file data.csv",
            "touch file script.py",
        ]),
        # File Listing
        ('list_files', [
            "list all files in the current directory",
            "show me the files here",
            "what files are in this folder?",
            "directory listing",
            "ls command",
        ]),
        # File Deletion
        ('delete_file', [
            "delete the file temp.log",
            "remove old_report.txt",
            "get rid of junk.dat",
            "erase the document draft_v1.doc",
            "rm temporary_file",
        ]),
        # Web Search
        ('search_web', [
            "search the web for python tutorials",
            "find information about machine learning",
            "what is the weather today?",
            "google the capital of France",
            "look up the definition of AI",
            "who won the world series last year?",
            "search for nearby restaurants",
        ]),
        # Other/Ambiguous (could be expanded)
        ('other', [
            "hello operator",
            "tell me a joke",
            "what time is it?",
            "open calculator",  # This might become a separate intent later
            "shutdown the computer",  # This needs careful handling
        ]),
    ]

    # Flatten the groups into two parallel columns, one label per utterance.
    texts = []
    intents = []
    for intent_name, utterances in examples_by_intent:
        texts.extend(utterances)
        intents.extend([intent_name] * len(utterances))

    return pd.DataFrame({'text': texts, 'intent': intents})

# Build the in-memory command/intent table
df = load_simulated_data()

# Quick look at the frame: schema, first rows, and per-intent counts
print("Data Info:")
df.info()
print("\nData Sample:", df.head(), sep="\n")
print("\nIntent Distribution:", df['intent'].value_counts(), sep="\n")

# Features are the raw command strings; targets are the intent names
X, y = df['text'], df['intent']

# Turn intent names into integer ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Persist the fitted encoder so inference code can decode predictions
joblib.dump(label_encoder, 'label_encoder.joblib')
print("\nLabel Encoder Classes:", label_encoder.classes_)

# Hold out 25% of the rows for testing, stratified on the encoded intent
split_options = dict(test_size=0.25, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print(
    f"\nTraining set size: {len(X_train)}",
    f"Test set size: {len(X_test)}",
    sep="\n",
)

## 3. Model Definition and Pipeline

In [None]:
# Base pipeline: TF-IDF features followed by a classifier.
# GridSearchCV (used later) replaces the 'clf' step and tunes both steps, so
# the LogisticRegression here is only a default.
# NOTE: TfidfVectorizer takes no `random_state` argument; passing one raises
# TypeError as soon as the vectorizer is constructed, so it is built with
# defaults here.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=42, max_iter=1000)),  # Placeholder classifier
])

## 4. Hyperparameter Tuning (Grid Search)

In [None]:
# Local import: one-vs-rest wrapper needed by the liblinear candidate below.
from sklearn.multiclass import OneVsRestClassifier

# TF-IDF options shared by every candidate classifier.
tfidf_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': [True, False],
    'tfidf__norm': ['l1', 'l2'],
}

# One grid entry per classifier family. Each entry swaps the pipeline's 'clf'
# step and tunes that classifier's own hyperparameters.
param_grid = [
    {
        **tfidf_grid,
        # liblinear only solves binary problems natively, and recent
        # scikit-learn releases deprecate using it directly on multiclass
        # targets. Wrapping it keeps the one-vs-rest scheme explicit; the
        # tuned parameters therefore live under `clf__estimator__`.
        'clf': [OneVsRestClassifier(
            LogisticRegression(random_state=42, max_iter=1000, solver='liblinear')
        )],
        'clf__estimator__C': [0.1, 1, 10, 100],
        'clf__estimator__penalty': ['l1', 'l2'],
    },
    {
        **tfidf_grid,
        # The linear kernel ignores gamma, so it gets its own entry rather
        # than refitting the same model once per gamma value.
        'clf': [SVC(random_state=42, probability=True)],
        'clf__C': [0.1, 1, 10, 100],
        'clf__kernel': ['linear'],
    },
    {
        **tfidf_grid,
        'clf': [SVC(random_state=42, probability=True)],
        'clf__C': [0.1, 1, 10, 100],
        'clf__gamma': [0.1, 0.01, 0.001],
        'clf__kernel': ['rbf'],
    },
    {
        **tfidf_grid,
        'clf': [MultinomialNB()],
        'clf__alpha': [0.1, 0.5, 1.0],
    },
]

# Perform Grid Search with Cross-Validation (CV)
# Using accuracy as the scoring metric, cv=3 for faster execution on small dataset
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

print("Starting Grid Search...")
grid_search.fit(X_train, y_train)

# Print best parameters and best score (mean cross-validated accuracy)
print("\nGrid Search Finished.")
print(f"Best Score (Accuracy): {grid_search.best_score_:.4f}")
print("Best Parameters:")
best_params = grid_search.best_params_
for param_name in sorted(best_params):
    print(f"  {param_name}: {best_params[param_name]}")

# Best pipeline found by the search; later cells evaluate and save it
best_model = grid_search.best_estimator_

## 5. Model Evaluation

In [None]:
# Evaluate the best model on the held-out test set
y_pred = best_model.predict(X_test)

# Decode integer ids back to intent names for reporting
y_pred_labels = label_encoder.inverse_transform(y_pred)
y_test_labels = label_encoder.inverse_transform(y_test)
target_names = label_encoder.classes_

# Print classification report.
# `labels` is passed explicitly (as for the confusion matrix below) so the
# report rows always line up with `target_names`, even if some intent is
# missing from both the test labels and the predictions.
# `zero_division=0` reports 0.0 for classes that were never predicted instead
# of emitting an undefined-metric warning.
print("\nClassification Report on Test Set:")
print(classification_report(
    y_test_labels,
    y_pred_labels,
    labels=target_names,
    target_names=target_names,
    zero_division=0,
))

# Calculate overall accuracy
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print(f"Overall Accuracy on Test Set: {accuracy:.4f}")

# Confusion matrix with rows = true intent, columns = predicted intent,
# both ordered like `target_names`
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=target_names)
cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)

## 6. Results Visualization

In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)

ax.set_title('Confusion Matrix on Test Set')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')

# Slant the column labels so longer intent names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)

fig.tight_layout()
plt.show()

## 7. Save the Best Model

In [None]:
# Output locations for the two artifacts written by this cell
model_filename = 'intent_classifier_pipeline.joblib'
label_map_filename = 'label_encoder_classes.json'

# Persist the whole fitted pipeline (vectorizer + classifier) in one file
joblib.dump(best_model, model_filename)
print(f"\nBest model saved to {model_filename}")

# Write an id -> intent-name mapping as JSON for reference
label_map = dict(enumerate(label_encoder.classes_))
with open(label_map_filename, 'w') as label_map_file:
    json.dump(label_map, label_map_file, indent=4)
print(f"Label mapping saved to {label_map_filename}")

## 8. Example Prediction (Optional)

In [None]:
# Load the model and label encoder (as if in a separate application).
# NOTE: joblib files are pickle-based; loading one can execute arbitrary code,
# so only load artifacts from a trusted source.
loaded_model = joblib.load('intent_classifier_pipeline.joblib')
loaded_label_encoder = joblib.load('label_encoder.joblib')

# Example new commands
new_commands = [
    "what files do I have?",
    "make a shopping list file",
    "search for python documentation",
    "delete the image screenshot.png",
    "good morning operator"
]

# Predict encoded intents, then decode them back to intent names
predicted_encoded = loaded_model.predict(new_commands)
predicted_labels = loaded_label_encoder.inverse_transform(predicted_encoded)

print("\nExample Predictions:")
for command, intent in zip(new_commands, predicted_labels):
    print(f"  Command: '{command}' -> Predicted Intent: '{intent}'")

# Example of getting prediction probabilities (only when the selected
# classifier exposes predict_proba)
if hasattr(loaded_model.named_steps['clf'], 'predict_proba'):
    print("\nExample Prediction Probabilities:")
    probabilities = loaded_model.predict_proba([new_commands[0]])[0]
    # predict_proba columns follow the fitted model's own `classes_` (encoded
    # ids). Decode those rather than assuming they match the encoder's full
    # class list, which would misalign if the model saw fewer classes.
    proba_labels = loaded_label_encoder.inverse_transform(loaded_model.classes_)
    prob_map = dict(zip(proba_labels, probabilities))
    print(f"  Command: '{new_commands[0]}'")
    # Most likely intent first
    for intent, prob in sorted(prob_map.items(), key=lambda item: item[1], reverse=True):
        print(f"    Intent: {intent:<15} Probability: {prob:.4f}")
else:
    print("\nPredict_proba not available for the chosen best classifier.")