# Instruction Detection using TabNet

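This notebook demonstrates how to train and evaluate a TabNet classifier for instruction detection: each instruction text is converted into a vector of character frequencies, a TabNet model is trained on those features, and its predictions are evaluated on a test split and on a few hand-written examples.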

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# Plot defaults: seaborn "whitegrid" style and a 12x6 default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load and Explore Data

In [None]:
# Read the raw instruction dataset into a DataFrame
data = pd.read_csv('../data/raw/instructions.csv')

# Report the (rows, columns) shape, then show the first ten rows as the cell output
print(f"Dataset shape: {data.shape}")
data.head(n=10)

In [None]:
# Count how many samples carry each label (most frequent first)
label_counts = data['label'].value_counts()
print("Class distribution:")
print(label_counts)

# Bar chart of the same counts, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
label_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Instruction Classes')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
# Slant the class names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 2. Feature Engineering

In [None]:
def preprocess_text(texts):
    """Return a list with every entry of ``texts`` coerced to ``str`` and lower-cased.

    Args:
        texts: Iterable of values; non-string entries are converted with ``str()``.

    Returns:
        list[str]: One lower-cased string per input entry, in the same order.
    """
    lowered = []
    for entry in texts:
        lowered.append(str(entry).lower())
    return lowered

def get_char_frequencies(texts):
    """Extract per-sample character-frequency features.

    Each entry is coerced to ``str`` and lower-cased, then the occurrences of
    every character in a fixed 41-symbol vocabulary (``a-z``, ``0-9``, space
    and ``,.!?``) are counted and divided by the length of the entry.
    Characters outside the vocabulary are not counted but still contribute to
    the length, so a row may sum to less than 1.

    Args:
        texts: Sized sequence of values. Non-string entries (e.g. ``None``,
            ``NaN``, numbers) are converted with ``str()`` rather than raising.

    Returns:
        numpy.ndarray: Float array of shape ``(len(texts), 41)``. Rows for
        empty strings are all zeros.
    """
    chars = 'abcdefghijklmnopqrstuvwxyz0123456789 ,.!?'
    char_to_idx = {c: i for i, c in enumerate(chars)}

    features = np.zeros((len(texts), len(chars)))

    for i, text in enumerate(texts):
        # Coerce once so counting and normalisation see the same string;
        # calling len() on a raw non-string entry would raise TypeError.
        text = str(text)
        if not text:
            # Nothing to count and nothing to divide by: leave the row at zero.
            continue

        for c in text.lower():
            idx = char_to_idx.get(c)
            if idx is not None:
                features[i, idx] += 1

        # Turn raw counts into frequencies relative to the entry length.
        features[i] /= len(text)

    return features

# Pull the raw texts and their labels out of the DataFrame
X, y = data['text'].values, data['label'].values

# Lower-case the texts, then turn each one into a character-frequency vector
X_processed = preprocess_text(X)
X_features = get_char_frequencies(X_processed)

# Summarise the resulting feature matrix
print(
    f"Feature shape: {X_features.shape}",
    f"Number of samples: {len(X)}",
    f"Number of features: {X_features.shape[1]}",
    sep="\n",
)

## 3. Train-Test Split

In [None]:
# Stratified 80/20 split with a fixed random state so the partition is repeatable
split_kwargs = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X_features, y, **split_kwargs)

# Report the size of each partition
print(
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    sep="\n",
)

## 4. Train TabNet Model

In [None]:
# Seed the global NumPy / PyTorch generators.
# NOTE: per the pytorch-tabnet documentation, TabNetClassifier has its own
# `seed` parameter (default 0) and re-seeds PyTorch with it, so the same value
# is also passed to the model through `tabnet_params` below.
np.random.seed(42)
torch.manual_seed(42)

# Hold out part of the *training* data for early stopping. Using the test
# split as the eval set would select the best epoch on the test data and make
# the accuracy reported afterwards optimistically biased.
# Stratification needs at least two training samples per class and a
# validation split at least as large as the number of classes.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Initialize TabNet
tabnet_params = {
    'n_d': 8,
    'n_a': 8,
    'n_steps': 3,
    'gamma': 1.3,
    'lambda_sparse': 1e-3,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': 2e-2},
    'scheduler_params': {'step_size': 10, 'gamma': 0.9},
    'scheduler_fn': torch.optim.lr_scheduler.StepLR,
    'mask_type': 'entmax',
    'seed': 42,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

clf = TabNetClassifier(**tabnet_params)

# Train
# Cap the batch size at the number of fitting samples so small datasets still
# yield at least one full batch.
batch_size = min(64, len(X_fit))
# Keep the trailing partial batch unless it would contain a single sample:
# BatchNorm cannot compute batch statistics from one sample in training mode.
drop_last = len(X_fit) % batch_size == 1
clf.fit(
    X_fit, y_fit,
    eval_set=[(X_valid, y_valid)],
    eval_name=['valid'],
    eval_metric=['accuracy'],
    max_epochs=50,
    patience=10,
    batch_size=batch_size,
    virtual_batch_size=min(32, batch_size),
    num_workers=0,
    drop_last=drop_last
)

## 5. Evaluate Model

In [None]:
# Predict a label for every sample in the test split
y_pred = clf.predict(X_test)

# Overall fraction of test samples whose predicted label matches the true one
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Per-class metrics; zero_division=0 is passed so undefined ratios are
# reported as 0
print("\nClassification Report:")
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split
cm = confusion_matrix(y_test, y_pred)
# Sorted union of the labels that occur in either array, used as tick labels
labels = np.unique(np.concatenate([y_test, y_pred]))

# Annotated heatmap drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
# Slant the column labels, keep the row labels horizontal
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

## 6. Test with Custom Instructions

In [None]:
# A few hand-written instructions to try the trained model on
custom_instructions = [
    "Turn on the bedroom lights",
    "What's the forecast for tomorrow?",
    "Set a timer for 20 minutes",
    "Play my favorite song",
    "What time is the meeting?",
]

# Apply the same preprocessing and feature extraction used for the training
# data, then predict
custom_processed = preprocess_text(custom_instructions)
custom_features = get_char_frequencies(custom_processed)
custom_predictions = clf.predict(custom_features)

# Print each instruction next to its predicted class
print("Custom Instruction Predictions:")
print("=" * 60)
n_shown = min(len(custom_instructions), len(custom_predictions))
for i in range(n_shown):
    instr = custom_instructions[i]
    pred = custom_predictions[i]
    print(f"Instruction: {instr}")
    print(f"Predicted class: {pred}\n")