# 🎨 Doodle Recognition Model Training
## Train on Kaggle GPU and Download to Local Machine

This notebook will:
1. Download Quick Draw dataset (100 categories)
2. Preprocess the data
3. Train a CNN model
4. Evaluate performance
5. Export model for download

**⚠️ IMPORTANT:** Enable GPU in Kaggle (Settings → Accelerator → GPU T4 x2)

## 📦 Step 1: Install Dependencies

In [None]:
# Quietly (-q) install the helper libraries used by the cells below.
# TensorFlow is imported later but not installed here, so the runtime is
# presumably expected to provide it — confirm when running outside Kaggle.
# NOTE(review): seaborn is installed but never imported in the visible cells.
!pip install -q scikit-learn matplotlib seaborn tqdm

## 🔧 Step 2: Configuration

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import requests
from tqdm import tqdm
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Report the runtime up front so a missing GPU is visible before training.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

# Configuration
# Category names requested from the Quick Draw dataset (104 entries). Local
# file names are derived from these by replacing spaces with underscores.
# NOTE(review): 'deer' and 'knight' do not look like official Quick Draw
# categories — confirm against the published category list.
CATEGORIES = [
    'airplane', 'ambulance', 'angel', 'ant', 'apple', 'axe', 'banana',
    'baseball', 'basketball', 'bat', 'bathtub', 'bear', 'bed', 'bee',
    'bicycle', 'bird', 'book', 'bread', 'bus', 'butterfly', 'cake',
    'car', 'cat', 'chair', 'cloud', 'computer', 'cookie', 'cow',
    'crab', 'cup', 'deer', 'dog', 'dolphin', 'donut', 'dragon',
    'duck', 'elephant', 'eye', 'face', 'fish', 'flower', 'frog',
    'giraffe', 'guitar', 'hamburger', 'hammer', 'hat', 'helicopter',
    'horse', 'house', 'ice cream', 'key', 'knight', 'ladder',
    'lighthouse', 'lion', 'monkey', 'moon', 'mosquito', 'mouse',
    'mushroom', 'octopus', 'owl', 'panda', 'parrot', 'pear',
    'penguin', 'piano', 'pig', 'pineapple', 'pizza', 'rabbit',
    'raccoon', 'rhinoceros', 'saw', 'scissors', 'sea turtle',
    'shark', 'sheep', 'snail', 'snake', 'snowman', 'spider',
    'squirrel', 'star', 'strawberry', 'swan', 'sword', 'table',
    'teapot', 'teddy-bear', 'telephone', 'tiger', 'train', 'tree',
    'truck', 'umbrella', 'van', 'violin', 'watermelon', 'whale',
    'wheel', 'windmill', 'zebra'
]

SAMPLES_PER_CATEGORY = 10000  # cap on drawings kept per category
BATCH_SIZE = 128
EPOCHS = 30  # upper bound; the training cell also registers early stopping
LEARNING_RATE = 0.001  # initial learning rate handed to the Adam optimizer

# Working directories for downloaded bitmaps, saved models and figures.
os.makedirs('data', exist_ok=True)
os.makedirs('models', exist_ok=True)
os.makedirs('plots', exist_ok=True)

## 📥 Step 3: Download Quick Draw Dataset

In [None]:
def download_category(category, base_url, samples_limit=None):
    """Download one Quick Draw category bitmap file into ``data/``.

    The file is streamed to a temporary ``.part`` path and only moved to its
    final name once it is complete, so an interrupted download is never
    mistaken for a finished one on the next run.

    Args:
        category: Category name as listed by Quick Draw (may contain spaces).
        base_url: URL prefix under which the ``<category>.npy`` objects live.
        samples_limit: If truthy, keep only the first ``samples_limit`` rows
            of the downloaded array.

    Returns:
        Path of the local ``.npy`` file, or ``None`` if the download failed.
    """
    from urllib.parse import quote

    # Local names use underscores; the loading step looks for the same name.
    filename = f"{category.replace(' ', '_')}.npy"
    filepath = os.path.join('data', filename)
    partial_path = f"{filepath}.part"
    # Remote objects keep the original category name, so spaces have to be
    # percent-encoded ("ice%20cream.npy") rather than replaced.
    url = f"{base_url}/{quote(category)}.npy"

    if os.path.exists(filepath):
        print(f"✅ {category} already downloaded")
        return filepath

    try:
        print(f"📥 Downloading {category}...")
        # (connect, read) timeouts keep a stalled connection from hanging
        # the notebook forever; the context manager releases the connection.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            with open(partial_path, 'wb') as file:
                with tqdm(total=total_size, unit='B', unit_scale=True) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))

        if samples_limit:
            data = np.load(partial_path)
            if len(data) > samples_limit:
                # Write through a file object: np.save would append ".npy"
                # to a path that does not already end with it.
                with open(partial_path, 'wb') as file:
                    np.save(file, data[:samples_limit])

        # Atomic rename: the final path only ever holds a complete file.
        os.replace(partial_path, filepath)
        print(f"✅ {category} downloaded")
        return filepath
    except Exception as e:
        # Best-effort by design: one bad category must not stop the others.
        print(f"❌ Failed: {e}")
        return None
    finally:
        # Drop any leftover partial file (it is gone already on success).
        if os.path.exists(partial_path):
            os.remove(partial_path)

base_url = "https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap"
print(f"📥 Downloading {len(CATEGORIES)} categories...")

# download_category returns None on failure. Collect those categories so the
# user can see up front that they will be missing from the trained model.
failed_categories = []
for category in CATEGORIES:
    if download_category(category, base_url, SAMPLES_PER_CATEGORY) is None:
        failed_categories.append(category)

if failed_categories:
    print(f"⚠️ {len(failed_categories)} categories failed to download: "
          f"{', '.join(failed_categories)}")

## 🔄 Step 4: Load and Preprocess Data

In [None]:
def load_and_preprocess_data(categories, samples_per_category):
    """Load the cached category bitmaps and turn them into model inputs.

    Args:
        categories: Category names; each is read from
            ``data/<name with spaces replaced by underscores>.npy``.
        samples_per_category: If truthy, at most this many rows are taken
            from each category file.

    Returns:
        Tuple ``(X, y_categorical, label_encoder)``. ``X`` is a float32 array
        reshaped to ``(-1, 28, 28, 1)``, divided by 255 and inverted
        (``1 - x``); ``y_categorical`` is the one-hot encoding of the labels;
        ``label_encoder`` is the fitted ``LabelEncoder``.

    Raises:
        ValueError: If no category file could be loaded.
    """
    print("📂 Loading data...")
    all_data, all_labels = [], []
    missing = []

    for category in tqdm(categories, desc="Loading"):
        filename = f"{category.replace(' ', '_')}.npy"
        filepath = os.path.join('data', filename)

        if not os.path.exists(filepath):
            # Remember the gap instead of dropping the class silently.
            missing.append(category)
            continue

        try:
            data = np.load(filepath)
            if samples_per_category and len(data) > samples_per_category:
                data = data[:samples_per_category]
            all_data.append(data)
            all_labels.extend([category] * len(data))
        except Exception as e:
            # Best-effort: an unreadable file skips that category only.
            print(f"❌ Error loading {category}: {e}")

    if missing:
        print(f"⚠️ Skipped {len(missing)} categories with no local file: "
              f"{', '.join(missing)}")
    if not all_data:
        # np.vstack([]) would fail with an unhelpful message; say what to do.
        raise ValueError(
            "No category data could be loaded from 'data/'; "
            "run the download step first."
        )

    X = np.vstack(all_data)
    y = np.array(all_labels)
    print(f"✅ Loaded {len(X):,} samples across {len(set(y))} categories")

    print("🖼️ Preprocessing...")
    X = X.reshape(-1, 28, 28, 1).astype('float32')
    # Scale and invert in place: same float32 results as
    # `1.0 - X / 255.0`, without two extra full-size temporaries.
    X /= 255.0
    np.subtract(1.0, X, out=X)

    print("🏷️ Encoding labels...")
    label_encoder = LabelEncoder()
    y_int = label_encoder.fit_transform(y)
    y_categorical = to_categorical(y_int)

    return X, y_categorical, label_encoder

X, y, label_encoder = load_and_preprocess_data(CATEGORIES, SAMPLES_PER_CATEGORY)

print("✂️ Splitting data...")
# Stratify on integer class ids rather than the one-hot matrix: scikit-learn
# handles a 2-D stratify target by converting every row to a string, which is
# slow and memory-hungry at this dataset size. The class balance is the same.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y.argmax(axis=1)
)
# 20% of the remaining 90% becomes validation (18% of all samples).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp.argmax(axis=1)
)

print(f"Training: {len(X_train):,}")
print(f"Validation: {len(X_val):,}")
print(f"Test: {len(X_test):,}")

## 🏗️ Step 5: Build Model

In [None]:
def build_model(num_classes):
    """Assemble the doodle CNN: three conv stages followed by a dense head.

    Args:
        num_classes: Width of the final softmax layer.

    Returns:
        An uncompiled ``Sequential`` model taking 28x28x1 inputs.
    """
    stack = [layers.Input(shape=(28, 28, 1))]

    # Every stage is conv -> batch norm -> 2x2 max-pool -> dropout; only the
    # filter count changes from one stage to the next.
    for n_filters in (32, 64, 128):
        stack.extend([
            layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
        ])

    # Classifier head on top of the flattened feature maps.
    stack.extend([
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ])

    return models.Sequential(stack)

# Size the output layer from the classes the label encoder actually saw.
num_classes = len(label_encoder.classes_)
model = build_model(num_classes)

adam = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
top3_metric = keras.metrics.TopKCategoricalAccuracy(k=3, name='top_3_accuracy')

# Labels are one-hot, hence the categorical (not sparse) cross-entropy.
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy', top3_metric],
)
model.summary()

## 🏋️ Step 6: Train Model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Keep the best-by-validation-accuracy snapshot on disk.
checkpoint_cb = ModelCheckpoint(
    'models/best_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)
# Stop after 10 epochs without a validation-accuracy improvement.
early_stop_cb = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
# Halve the learning rate after 5 epochs without a validation-loss improvement.
lr_decay_cb = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-7,
    verbose=1,
)
callbacks = [checkpoint_cb, early_stop_cb, lr_decay_cb]

print("🚀 Starting training...")
fit_options = {
    'batch_size': BATCH_SIZE,
    'epochs': EPOCHS,
    'validation_data': (X_val, y_val),
    'callbacks': callbacks,
    'verbose': 1,
}
history = model.fit(X_train, y_train, **fit_options)
print("✅ Training completed!")

## 📊 Step 7: Evaluate Model

In [None]:
print("📊 Evaluating...")
# evaluate() yields the loss followed by the compiled metrics, in order.
test_loss, test_accuracy, test_top3 = model.evaluate(X_test, y_test, verbose=0)

print("\n📈 Results:")
for metric_label, metric_value in (
    ("Accuracy", test_accuracy),
    ("Top-3", test_top3),
    ("Loss", test_loss),
):
    print(f"   {metric_label}: {metric_value:.4f}")

# Side-by-side training curves: accuracy on the left, loss on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
for panel, history_key, panel_title in (
    (ax1, 'accuracy', 'Accuracy'),
    (ax2, 'loss', 'Loss'),
):
    panel.plot(history.history[history_key], label='Train')
    panel.plot(history.history[f'val_{history_key}'], label='Val')
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.savefig('plots/training_history.png', dpi=300)
plt.show()

## 💾 Step 8: Save Model

In [None]:
model.save('models/doodle_classifier.h5')
print("✅ Model saved")

# Persist the fitted encoder so class indices can be mapped back to names.
with open('models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

class_names = label_encoder.classes_.tolist()

# Everything an inference service needs to know about the exported model.
metadata = {
    "model_name": "doodle_classifier",
    "class_names": class_names,
    "num_classes": len(class_names),
    "input_shape": [28, 28, 1],
    "test_accuracy": float(test_accuracy),
    "test_top3_accuracy": float(test_top3),
    "test_loss": float(test_loss),
    "model_parameters": int(model.count_params()),
    "training_samples": len(X_train),
    "preprocessing": {"normalize": True, "invert_colors": True, "resize_to": [28, 28]}
}

json_outputs = (
    ('models/doodle_classifier_metadata.json', metadata),
    ('models/class_names.json', class_names),
)
for json_path, payload in json_outputs:
    with open(json_path, 'w') as f:
        json.dump(payload, f, indent=2)

print("✅ Metadata saved")

## 📦 Step 9: Create Download Package

In [None]:
import shutil

export_dir = 'export_for_local'
os.makedirs(export_dir, exist_ok=True)

# Gather every artifact the local backend needs into one folder.
artifacts = (
    'models/doodle_classifier.h5',
    'models/doodle_classifier_metadata.json',
    'models/label_encoder.pkl',
    'models/class_names.json',
    'plots/training_history.png',
)
for artifact in artifacts:
    shutil.copy(artifact, export_dir)

print("✅ Files ready in export_for_local/")
print("\n📁 Files to download:")
bytes_per_mb = 1024 * 1024
for file in os.listdir(export_dir):
    exported_path = os.path.join(export_dir, file)
    size_mb = os.path.getsize(exported_path) / bytes_per_mb
    print(f"   - {file} ({size_mb:.2f} MB)")

## 🎯 Step 10: Test Predictions

In [None]:
# Draw up to ten distinct test samples (fewer if the test set is tiny).
n_preview = min(10, len(X_test))
indices = np.random.choice(len(X_test), n_preview, replace=False)
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
axes = axes.ravel()

# One batched predict() call instead of one call per image: each call has a
# fixed overhead, so the per-image loop was needlessly slow.
predictions = model.predict(X_test[indices], verbose=0)

for i, idx in enumerate(indices):
    image = X_test[idx]
    true_label = label_encoder.classes_[y_test[idx].argmax()]
    pred = predictions[i]
    pred_label = label_encoder.classes_[pred.argmax()]
    confidence = pred.max()

    axes[i].imshow(image.squeeze(), cmap='gray')
    # Title shows the true label on top and the prediction underneath;
    # green marks a correct prediction, red a wrong one.
    color = 'green' if true_label == pred_label else 'red'
    axes[i].set_title(f'{true_label}\n{pred_label} ({confidence:.2f})', fontsize=8, color=color)
    axes[i].axis('off')

# Hide any grid cells left empty when fewer than ten samples were drawn.
for unused_axis in axes[n_preview:]:
    unused_axis.axis('off')

plt.tight_layout()
plt.savefig('plots/predictions.png', dpi=300)
plt.show()

## 🎉 Training Complete!

### Download Instructions:

1. **In Kaggle:** Click Output tab → export_for_local → Download
2. **On Local Machine:** Copy files to `colab_backend/models/`
3. **Update Code:** Edit `colab_backend/recognizer.py`:

```python
def __init__(self):
    import os, json, tensorflow as tf
    model_path = "models/doodle_classifier.h5"
    if os.path.exists(model_path):
        self.model = tf.keras.models.load_model(model_path)
        with open("models/doodle_classifier_metadata.json") as f:
            self.classes = json.load(f)['class_names']
        print(f"✅ Loaded: {len(self.classes)} classes")
```

4. **Run:** `npm run dev`

Your model is ready! 🚀

In [None]:
# Closing banner that repeats the headline numbers from the evaluation cell.
banner = "=" * 60
summary_lines = [
    banner,
    "🎉 TRAINING COMPLETE!",
    banner,
    f"\nAccuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)",
    f"Top-3: {test_top3:.4f} ({test_top3*100:.2f}%)",
    f"Classes: {num_classes}",
    f"Parameters: {model.count_params():,}",
    "\n📥 Download 'export_for_local' folder from Output tab",
    banner,
]
print(*summary_lines, sep="\n")