## 🔧 Setup Environment

In [None]:
import sys
import os

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("‚úì Running on Google Colab")
    !nvidia-smi -L
    !nvcc --version
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
else:
    print("Running locally")
    print("Current directory:", os.getcwd())

## 📥 Clone Repository

In [None]:
if IN_COLAB:
    !git clone https://github.com/Ryen264/Parallel-Autoencoder-Unsupervised-Feature-Learning.git
    %cd Parallel-Autoencoder-Unsupervised-Feature-Learning
    print("\n‚úì Repository cloned")
    !ls -la src/
    !ls -la include/
else:
    print("Using local directory")

## 📦 Install Dependencies

In [None]:
!pip install numpy matplotlib scikit-learn seaborn pandas -q
if IN_COLAB:
    !apt-get install -y libsvm-dev > /dev/null 2>&1
print("‚úì Dependencies installed")

## 📊 Download CIFAR-10

In [None]:
import urllib.request
import tarfile

if not os.path.exists("data/cifar-10-batches-bin"):
    print("Downloading CIFAR-10...")
    urllib.request.urlretrieve(
        "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz",
        "cifar-10-binary.tar.gz"
    )
    os.makedirs("data", exist_ok=True)
    with tarfile.open("cifar-10-binary.tar.gz", 'r:gz') as tar:
        tar.extractall('data/')
    os.remove("cifar-10-binary.tar.gz")
    print("‚úì Downloaded")
else:
    print("‚úì CIFAR-10 exists")

!ls -lh data/cifar-10-batches-bin/

## 🔨 Compile CUDA Code

In [None]:
%%writefile src/main.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <fstream>
#include <memory>
#include "cpu/cpu_autoencoder.h"
#include "dataset.h"
#include "constants.h"
#include "data_loader.h"

// Copies `n` images (width * width * depth floats each) and their `n` labels into
// freshly allocated buffers and builds a Dataset from them.
// The input arrays are only read; they are not freed or modified here.
// NOTE(review): Dataset presumably takes over the two buffers - confirm in dataset.h.
Dataset convertToDataset(float* images, int* labels, int n, int width, int depth) {
    // Do the size arithmetic in size_t: the plain int product
    // n * width * width * depth can overflow for larger datasets.
    const size_t sample_count = static_cast<size_t>(n);
    const size_t value_count = sample_count * static_cast<size_t>(width)
                             * static_cast<size_t>(width) * static_cast<size_t>(depth);

    std::unique_ptr<float[]> data = std::make_unique<float[]>(value_count);
    std::unique_ptr<int[]> label_data = std::make_unique<int[]>(sample_count);
    memcpy(data.get(), images, value_count * sizeof(float));
    memcpy(label_data.get(), labels, sample_count * sizeof(int));
    return Dataset(data, label_data, n, width, depth);
}

// Writes `features` to `filename` as a raw binary file:
//   int n, int width, int depth,
//   n * width * width * depth floats (features.get_data()),
//   n ints (features.get_labels()).
// Everything is written with the machine's native byte order and type sizes.
// If the file cannot be opened or a write fails, a message is printed to stderr.
void saveFeatures(const char* filename, Dataset& features) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        fprintf(stderr, "saveFeatures: cannot open '%s' for writing\n", filename);
        return;
    }

    int n = features.n, width = features.width, depth = features.depth;
    // size_t arithmetic avoids overflowing int on large feature tensors.
    const size_t sample_count = static_cast<size_t>(n);
    const size_t value_count = sample_count * static_cast<size_t>(width)
                             * static_cast<size_t>(width) * static_cast<size_t>(depth);

    file.write((char*)&n, sizeof(int));
    file.write((char*)&width, sizeof(int));
    file.write((char*)&depth, sizeof(int));
    file.write((char*)features.get_data(), value_count * sizeof(float));
    file.write((char*)features.get_labels(), sample_count * sizeof(int));
    file.close();

    // The stream's error state covers every write above plus the final flush.
    if (!file) {
        fprintf(stderr, "saveFeatures: writing '%s' failed\n", filename);
    }
}

// Entry point of the feature-extraction pipeline:
//   1. load CIFAR-10 from ./data/cifar-10-batches-bin,
//   2. train the autoencoder on the training split,
//   3. encode the train and test splits and write them to
//      train_features.bin / test_features.bin.
// Returns 0 on completion, 1 if initCIFAR10Dataset returns a null pointer.
// NOTE(review): training and encoding use Cpu_Autoencoder even when a CUDA device
// is detected; `use_cuda` is only forwarded to initCIFAR10Dataset.
int main() {
    printf("==========================================================\n");
    printf("CIFAR-10 Autoencoder Pipeline - CUDA Implementation\n");
    printf("==========================================================\n");

    const char* data_dir = "./data/cifar-10-batches-bin";
    const int n_epochs = 20, batch_size = 128;
    const float learning_rate = 0.001f;
    const int checkpoint = 5;
    // Sizes passed to convertToDataset for the two splits.
    const int n_train = 50000, n_test = 10000;
    const int image_width = 32, image_depth = 3;

    int deviceCount = 0;
    bool use_cuda = (cudaGetDeviceCount(&deviceCount) == cudaSuccess && deviceCount > 0);
    if (use_cuda) {
        cudaDeviceProp prop;
        // Only print the name when the query succeeded; otherwise `prop` is unset.
        if (cudaGetDeviceProperties(&prop, 0) == cudaSuccess) {
            printf("\nCUDA Device: %s\n", prop.name);
        } else {
            printf("\nCUDA Device: %s\n", "(unknown)");
        }
    } else {
        printf("\nCPU mode\n");
    }

    printf("\n==========================================================\n");
    printf("Step 1: Loading CIFAR-10\n");
    printf("==========================================================\n");
    CIFAR10Dataset* cifar_data = initCIFAR10Dataset(data_dir, use_cuda);
    if (cifar_data == nullptr) {
        // Without this guard the accessors below would dereference a null dataset.
        fprintf(stderr, "Failed to load CIFAR-10 from %s\n", data_dir);
        return 1;
    }
    printf("‚úì Loaded\n");

    printf("\n==========================================================\n");
    printf("Step 2: Training Autoencoder\n");
    printf("==========================================================\n");
    printf("Epochs: %d, Batch: %d, LR: %.4f\n\n", n_epochs, batch_size, learning_rate);

    Cpu_Autoencoder autoencoder;
    Dataset train_dataset = convertToDataset(getTrainImages(cifar_data),
                                             getTrainLabels(cifar_data),
                                             n_train, image_width, image_depth);
    autoencoder.fit(train_dataset, n_epochs, batch_size, learning_rate, true, checkpoint);
    printf("\n‚úì Training completed\n");

    printf("\n==========================================================\n");
    printf("Step 3: Extracting Features\n");
    printf("==========================================================\n");

    Dataset train_features = autoencoder.encode(train_dataset);
    printf("Train features: (%d,%d,%d,%d)\n", train_features.n, train_features.width,
           train_features.width, train_features.depth);

    Dataset test_dataset = convertToDataset(getTestImages(cifar_data),
                                            getTestLabels(cifar_data),
                                            n_test, image_width, image_depth);
    Dataset test_features = autoencoder.encode(test_dataset);
    printf("Test features: (%d,%d,%d,%d)\n", test_features.n, test_features.width,
           test_features.width, test_features.depth);

    printf("\nSaving features...\n");
    saveFeatures("train_features.bin", train_features);
    saveFeatures("test_features.bin", test_features);
    printf("‚úì Saved\n");

    freeCIFAR10Dataset(cifar_data);
    printf("\n==========================================================\n");
    printf("C++ Pipeline Complete\n");
    printf("==========================================================\n");
    return 0;
}

In [None]:
print("Compiling CUDA code...\n")
!nvcc -std=c++14 -O3 -I./include \
      src/main_colab.cu \
      src/data_loader.cu \
      src/autoencoder.cu \
      src/dataset.cu \
      src/cpu/*.cu \
      -o pipeline_cuda

if os.path.exists('pipeline_cuda'):
    print("\n‚úì Compilation successful!")
    !ls -lh pipeline_cuda
else:
    print("\n‚úó Compilation failed")

## 🚀 Run CUDA Pipeline (Steps 1-3)

In [None]:
print("Running CUDA pipeline (15-30 minutes)...\n")
!./pipeline_cuda

print("\nChecking output files:")
if os.path.exists('train_features.bin') and os.path.exists('test_features.bin'):
    !ls -lh train_features.bin test_features.bin
    print("‚úì C++ pipeline completed!")
else:
    print("‚úó Feature files not found")

## 📥 Load Features

In [None]:
import numpy as np
import struct

def load_features(filename):
    """Load a binary feature file with a 3-int header, float32 features and int32 labels.

    File layout (native byte order): three int32 header fields ``n``, ``width``,
    ``depth``; then ``n * width * width * depth`` float32 values; then ``n``
    int32 labels.

    Args:
        filename: Path of the binary feature file.

    Returns:
        Tuple ``(features, labels)``: ``features`` is a float32 array of shape
        ``(n, width, width, depth)`` and ``labels`` an int32 array of shape ``(n,)``.

    Raises:
        ValueError: If the header is incomplete or holds a negative dimension, or
            if the file contains fewer feature/label bytes than the header announces.
    """
    header_format = '3i'
    header_size = struct.calcsize(header_format)

    with open(filename, 'rb') as f:
        header = f.read(header_size)
        if len(header) != header_size:
            raise ValueError(f"{filename}: incomplete header "
                             f"({len(header)} of {header_size} bytes)")
        n, width, depth = struct.unpack(header_format, header)
        if min(n, width, depth) < 0:
            raise ValueError(f"{filename}: invalid header n={n}, width={width}, depth={depth}")

        feature_size = n * width * width * depth
        feature_bytes = f.read(feature_size * 4)
        label_bytes = f.read(n * 4)

    # A short read means the writer was interrupted; fail loudly instead of
    # returning arrays with fewer samples than the header promises.
    if len(feature_bytes) != feature_size * 4:
        raise ValueError(f"{filename}: expected {feature_size * 4} feature bytes, "
                         f"found {len(feature_bytes)}")
    if len(label_bytes) != n * 4:
        raise ValueError(f"{filename}: expected {n * 4} label bytes, "
                         f"found {len(label_bytes)}")

    features = np.frombuffer(feature_bytes, dtype=np.float32).reshape(n, width, width, depth)
    labels = np.frombuffer(label_bytes, dtype=np.int32)
    return features, labels

# Read the train and test feature files.
train_features, train_labels = load_features('train_features.bin')
test_features, test_labels = load_features('test_features.bin')

print(f"‚úì Train features: {train_features.shape}")
print(f"‚úì Test features: {test_features.shape}")

# Collapse every sample's trailing axes into one feature vector,
# giving 2-D (samples, features) matrices.
train_features_flat = np.reshape(train_features, (train_features.shape[0], -1))
test_features_flat = np.reshape(test_features, (test_features.shape[0], -1))

print(f"\n‚úì Flattened train: {train_features_flat.shape}")
print(f"‚úì Flattened test: {test_features_flat.shape}")
print(f"‚úì Feature dimension: {train_features_flat.shape[1]:,}")

## 🎓 Step 4: Train SVM

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import time

# Step banner.
rule = "=" * 60
print(rule)
print("Step 4: Training SVM")
print(rule)
print("Kernel: RBF, C: 10.0, Gamma: auto\n")

# Standardise the features: scaling statistics are learned on the training
# matrix only and then applied unchanged to the test matrix.
scaler = StandardScaler()
scaler.fit(train_features_flat)
train_scaled = scaler.transform(train_features_flat)
test_scaled = scaler.transform(test_features_flat)

svm = SVC(
    C=10.0,
    kernel='rbf',
    gamma='auto',
    cache_size=1000,
    verbose=True,
)

print("Training SVM (10-20 minutes)...")
# Wall-clock timing of the fit only.
start = time.time()
svm.fit(train_scaled, train_labels)
duration = time.time() - start

print(f"\n‚úì Trained in {duration:.2f}s ({duration/60:.2f} min)")
print(f"‚úì Support vectors: {svm.n_support_.sum():,}")

## 📊 Step 5: Evaluate

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluate the trained SVM on the test split: overall accuracy, per-class report,
# confusion-matrix heatmap and a per-class accuracy bar chart.
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']
# Label ids are the positions in `class_names`. Passing them explicitly keeps the
# report and confusion matrix aligned with the names even if a class never
# occurs in `test_labels` or `y_pred`.
class_ids = list(range(len(class_names)))

y_pred = svm.predict(test_scaled)
accuracy = accuracy_score(test_labels, y_pred)

print(f"\n‚úì Test Accuracy: {accuracy*100:.2f}%\n")
print(classification_report(test_labels, y_pred, labels=class_ids,
                            target_names=class_names, digits=4))

cm = confusion_matrix(test_labels, y_pred, labels=class_ids)

plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix - CUDA Autoencoder + SVM', fontsize=16, fontweight='bold')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Per-class accuracy = correct predictions / true samples of that class.
# Classes without any true sample get 0.0 instead of a division-by-zero NaN.
row_totals = cm.sum(axis=1)
class_acc = np.divide(cm.diagonal(), row_totals,
                      out=np.zeros(len(class_ids), dtype=float),
                      where=row_totals > 0)

plt.figure(figsize=(12, 6))
bars = plt.bar(class_names, class_acc * 100)
plt.axhline(y=accuracy*100, color='r', linestyle='--', linewidth=2,
            label=f'Overall: {accuracy*100:.2f}%')
plt.title('Per-Class Accuracy', fontsize=16, fontweight='bold')
plt.xlabel('Class')
plt.ylabel('Accuracy (%)')
plt.ylim(0, 100)
plt.xticks(rotation=45, ha='right')
plt.legend()
plt.grid(axis='y', alpha=0.3)

# Green bars are at or above the overall accuracy, orange bars below it.
for bar, acc in zip(bars, class_acc):
    bar.set_color('green' if acc >= accuracy else 'orange')
    bar.set_alpha(0.7)

plt.tight_layout()
plt.show()

print("\nPer-Class Results:")
for name, acc in zip(class_names, class_acc):
    print(f"{name:12s}: {acc*100:5.2f}%")

## 📝 Final Summary

In [None]:
# Print a summary of the run and store the key metrics in cuda_results.json.
import json

print("\n" + "="*70)
print("CUDA AUTOENCODER + SVM PIPELINE - FINAL RESULTS")
print("="*70)

print("\nüîß Implementation:")
print("  ‚Ä¢ Data: data_loader.cu (CUDA)")
print("  ‚Ä¢ Autoencoder: autoencoder.cu + cpu_autoencoder.cu")
print("  ‚Ä¢ SVM: scikit-learn (Python)")

print("\nüìä Results:")
# Sample counts come from the loaded feature matrices rather than being
# hard-coded, so the summary stays correct if the dataset size changes.
print(f"  ‚Ä¢ Training samples: {train_features_flat.shape[0]:,}")
print(f"  ‚Ä¢ Test samples: {test_features_flat.shape[0]:,}")
print(f"  ‚Ä¢ Feature dimension: {train_features_flat.shape[1]:,}")
# 32*32*3 is the size of one input image as passed to the C++ pipeline.
print(f"  ‚Ä¢ Compression: {(32*32*3) / train_features_flat.shape[1]:.2f}x")
print(f"  ‚Ä¢ Test Accuracy: {accuracy*100:.2f}%")
print(f"  ‚Ä¢ Best class: {class_names[np.argmax(class_acc)]} ({class_acc.max()*100:.2f}%)")
print(f"  ‚Ä¢ Worst class: {class_names[np.argmin(class_acc)]} ({class_acc.min()*100:.2f}%)")

print("\n‚úÖ Pipeline completed! C++ (CUDA) ‚Üí Python (SVM) working!")
print("="*70)

# Convert NumPy values to plain Python types so they are JSON-serialisable.
with open('cuda_results.json', 'w') as f:
    json.dump({
        'accuracy': float(accuracy),
        'confusion_matrix': cm.tolist(),
        'class_accuracy': class_acc.tolist(),
        'implementation': 'CUDA C++ + Python'
    }, f, indent=2)

print("\nüíæ Results saved to cuda_results.json")