# In [None]:
import os
import shutil
import pandas as pd
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from functools import partial


def create_labeled_dataset(base_path, output_path, num_subjects=27, ignore_subjects=None,
                           test_size=0.2, valid_size=0.1, random_state=42, n_workers=4):
    """Build a train/valid/test image folder tree from per-user gaze recordings.

    For every kept subject and both eye ids, ``user<subject>/data_<eye>.csv``
    is read from ``base_path``. Whole subjects (not individual frames) are
    assigned to the validation, test and training splits, and every frame
    listed in the CSVs is copied to
    ``<output_path>/<split>/<gaze_type>/user<subject>_eye<eye>_<fname>``.
    The per-split tables and the combined table are also written to
    ``output_path`` as CSV files.

    Args:
        base_path: Directory containing the ``user<N>`` folders.
        output_path: Directory to create and populate.
        num_subjects: Subjects are numbered ``1..num_subjects``.
        ignore_subjects: Subject numbers to leave out. ``None`` (the default)
            means ``[1, 2, 3]``.
        test_size: Fraction of loaded subjects used for the test split
            (at least one subject is always taken).
        valid_size: Fraction of loaded subjects used for the validation split
            (at least one subject is always taken).
        random_state: Seed passed to ``np.random.seed`` before the splits
            are drawn.
        n_workers: Number of threads used to copy images.

    Raises:
        ValueError: If no CSV could be loaded, or if there are too few
            subjects to draw the validation and test splits.
    """
    # A list literal as a default would be shared between calls; use a
    # None sentinel and materialise the historical default here instead.
    if ignore_subjects is None:
        ignore_subjects = [1, 2, 3]
    ignored = set(ignore_subjects)

    os.makedirs(output_path, exist_ok=True)

    # Column names (the CSVs are read without a header row) and their dtypes.
    csv_headers = {
        'index': 'uint32',
        'horiz_coord': 'uint16',
        'vert_coord': 'uint16',
        'gaze_type': 'str',
        'timestamp_microseconds': 'uint64',
        'center0': 'float32',
        'center1': 'float32',
        'large_pupil_movement': 'bool',
        'fname': 'str'
    }

    subjects = [i for i in range(1, num_subjects + 1) if i not in ignored]
    eye_ids = [0, 1]

    print(f"Processing subjects: {subjects}")
    print(f"Ignoring subjects: {ignore_subjects}")

    print("Loading CSV files")
    dataframes = []

    for subject in subjects:
        for eye_id in eye_ids:
            csv_path = os.path.join(base_path, f'user{subject}/data_{eye_id}.csv')
            if not os.path.exists(csv_path):
                print(f"Warning: {csv_path} does not exist, skipping.")
                continue

            # Best effort: one unreadable file must not abort the whole run.
            try:
                df = pd.read_csv(csv_path, names=list(csv_headers.keys()), dtype=csv_headers)
            except Exception as e:
                print(f"Error loading {csv_path}: {e}")
                continue

            df['subject'] = subject
            df['eye_id'] = eye_id
            dataframes.append(df)
            print(f"Loaded data for user {subject}, eye {eye_id}")

    if not dataframes:
        raise ValueError("No data was loaded. Check paths and file formats.")

    combined_df = pd.concat(dataframes, ignore_index=True)

    gaze_types = combined_df['gaze_type'].unique()
    print(f"Found {len(gaze_types)} unique gaze types: {gaze_types}")

    # --- subject-level split -------------------------------------------
    np.random.seed(random_state)
    available_subjects = np.unique(combined_df['subject'])

    num_valid_subjects = max(1, int(len(available_subjects) * valid_size))
    num_test_subjects = max(1, int(len(available_subjects) * test_size))

    # Without this check np.random.choice fails with a message that does
    # not mention subjects or split sizes at all.
    if num_valid_subjects + num_test_subjects > len(available_subjects):
        raise ValueError(
            f"Cannot draw {num_valid_subjects} validation and {num_test_subjects} test "
            f"subjects from only {len(available_subjects)} loaded subjects."
        )

    valid_subjects = np.random.choice(available_subjects, size=num_valid_subjects, replace=False)
    remaining_subjects = np.setdiff1d(available_subjects, valid_subjects)
    test_subjects = np.random.choice(remaining_subjects, size=num_test_subjects, replace=False)
    train_subjects = np.setdiff1d(remaining_subjects, test_subjects)

    if len(train_subjects) == 0:
        print("Warning: no subjects are left for the training split.")

    print(f"Train subjects: {train_subjects}")
    print(f"Validation subjects: {valid_subjects}")
    print(f"Test subjects: {test_subjects}")

    train_df = combined_df[combined_df['subject'].isin(train_subjects)]
    valid_df = combined_df[combined_df['subject'].isin(valid_subjects)]
    test_df = combined_df[combined_df['subject'].isin(test_subjects)]

    print(f"Train set: {len(train_df)} samples")
    print(f"Validation set: {len(valid_df)} samples")
    print(f"Test set: {len(test_df)} samples")

    # One directory per (split, gaze type) pair.
    for split in ['train', 'valid', 'test']:
        for gaze_type in gaze_types:
            os.makedirs(os.path.join(output_path, split, gaze_type), exist_ok=True)

    def copy_single_image(row, split_name):
        """Copy the frame described by ``row`` into its split/class folder.

        Returns True when the file was copied, False when the source is
        missing or the copy failed (a message is printed in both cases).
        """
        subject, eye_id, fname, gaze_type = row['subject'], row['eye_id'], row['fname'], row['gaze_type']
        src_path = os.path.join(base_path, f'user{subject}/{eye_id}/frames/{fname}')
        # Prefix with subject and eye so frames from different recordings
        # cannot collide inside the shared class folder.
        new_fname = f"user{subject}_eye{eye_id}_{fname}"
        dst_path = os.path.join(output_path, split_name, gaze_type, new_fname)

        if not os.path.exists(src_path):
            print(f"Warning: Source image {src_path} does not exist")
            return False

        try:
            shutil.copy2(src_path, dst_path)
        except Exception as e:
            print(f"Error copying {src_path} to {dst_path}: {e}")
            return False
        return True

    def process_dataframe(df, split_name):
        """Copy every frame listed in ``df``; return (copied, failed) counts."""
        print(f"Processing {split_name} set with {len(df)} images...")
        copy_func = partial(copy_single_image, split_name=split_name)
        records = df.to_dict('records')
        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            results = list(tqdm(executor.map(copy_func, records), total=len(records), desc=f"{split_name} set"))
        successes = sum(results)
        return successes, len(results) - successes

    print("Organizing images by split and gaze type...")
    train_count, train_errors = process_dataframe(train_df, 'train')
    valid_count, valid_errors = process_dataframe(valid_df, 'valid')
    test_count, test_errors = process_dataframe(test_df, 'test')

    print("\nDataset organization complete!")
    print(f"Train set: {train_count} images copied, {train_errors} errors")
    print(f"Validation set: {valid_count} images copied, {valid_errors} errors")
    print(f"Test set: {test_count} images copied, {test_errors} errors")

    train_df.to_csv(os.path.join(output_path, 'train_data.csv'), index=False)
    valid_df.to_csv(os.path.join(output_path, 'valid_data.csv'), index=False)
    test_df.to_csv(os.path.join(output_path, 'test_data.csv'), index=False)
    combined_df.to_csv(os.path.join(output_path, 'all_data.csv'), index=False)

    print("\nSaved split data to CSV files")

    print("\nDataset Statistics:")
    for split_name, df in [('Train', train_df), ('Validation', valid_df), ('Test', test_df)]:
        print(f"\n{split_name} Set:")
        for gaze_type, count in df['gaze_type'].value_counts().items():
            print(f"  {gaze_type}: {count} images")


# Location of the raw per-user recordings and of the organised output tree.
base_path = "D:/Triya Belani/angelopoulos_eye_data"
output_path = "D:/Triya Belani/Output"

# Organise the frames into split/class folders, excluding subjects 1-3 and
# copying with two worker threads.
create_labeled_dataset(
    base_path=base_path,
    output_path=output_path,
    ignore_subjects=[1, 2, 3],
    test_size=0.2,
    valid_size=0.1,
    n_workers=2,
)



# In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
from tqdm import tqdm
import time


# Seed the NumPy and PyTorch global random number generators with a fixed value.
np.random.seed(42)
torch.manual_seed(42)

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"[INFO] Using device: {device}")

# Root directory of the organised dataset; the code below reads the split
# folders from it and writes the checkpoint and plots into it.
DATA_PATH = r"D:\Triya Belani\Output"

class GazeDataset(Dataset):
    """Folder-per-class image dataset for eye gaze classification.

    Expects ``<root_dir>/<split>/<class_name>/<image>``. Every sub-directory
    of the split folder is a class (indexed in sorted name order) and every
    ``.jpg``/``.jpeg``/``.png`` file inside it is one sample.

    Args:
        root_dir: Directory that contains the split folders.
        split: Name of the split folder to read (default ``'train'``).
        transform: Optional callable applied to the RGB ``PIL.Image``.

    Raises:
        FileNotFoundError: If the split directory does not exist.
        ValueError: If it has no class directories or no images.
    """

    # Lower-case file suffixes that are accepted as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, root_dir, split='train', transform=None):
        self.root_dir = os.path.join(root_dir, split)
        self.transform = transform
        print(f"[INFO] Initializing GazeDataset for split: {split} at {self.root_dir}")

        if not os.path.exists(self.root_dir):
            print(f"[ERROR] Directory not found: {self.root_dir}")
            raise FileNotFoundError(f"Directory not found: {self.root_dir}")

        # Class names are the sub-directories, sorted so that the
        # name -> index mapping is the same for every split and every run.
        self.classes = [d for d in sorted(os.listdir(self.root_dir))
                        if os.path.isdir(os.path.join(self.root_dir, d))]
        print(f"[DEBUG] Classes found: {self.classes}")

        if not self.classes:
            print(f"[ERROR] No class directories found in {self.root_dir}")
            raise ValueError(f"No class directories found in {self.root_dir}")

        self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}
        print(f"[DEBUG] Class to index mapping: {self.class_to_idx}")

        # Parallel lists: image_paths[i] is labelled labels[i].
        self.image_paths = []
        self.labels = []

        for class_name in self.classes:
            class_dir = os.path.join(self.root_dir, class_name)
            label = self.class_to_idx[class_name]
            # os.listdir returns entries in arbitrary order; sorting keeps
            # the index -> sample mapping reproducible across runs/machines.
            for img_name in sorted(os.listdir(class_dir)):
                if img_name.lower().endswith(self.IMAGE_EXTENSIONS):
                    self.image_paths.append(os.path.join(class_dir, img_name))
                    self.labels.append(label)
        print(f"[INFO] Number of images found: {len(self.image_paths)} for split: {split}")

        if not self.image_paths:
            print(f"[ERROR] No images found in {self.root_dir}")
            raise ValueError(f"No images found in {self.root_dir}")

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        If loading or transforming the image fails, the error is printed
        and a blank 64x64 placeholder is returned with the real label (a
        zero tensor of shape (3, 64, 64) when a transform is set, otherwise
        a blank RGB ``PIL.Image``), so one bad file does not abort an epoch.
        """
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        try:
            # The context manager releases the file handle as soon as the
            # pixel data has been converted into a separate RGB image.
            with Image.open(img_path) as img:
                image = img.convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image, label
        except Exception as e:
            print(f"[ERROR] Error loading image {img_path}: {e}")
            # NOTE: the placeholder size is fixed at 64x64, which matches the
            # Resize((64, 64)) used by the transforms defined in this file.
            placeholder = torch.zeros((3, 64, 64)) if self.transform else Image.new('RGB', (64, 64))
            return placeholder, label

# Preprocessing pipelines. Images are shrunk to a small fixed size for faster
# processing; both pipelines share the same resize and normalisation settings.
_INPUT_SIZE = (64, 64)
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Training: resize, light augmentation (random flip, mild brightness/contrast
# jitter), then tensor conversion and per-channel normalisation.
train_transforms = transforms.Compose([
    transforms.Resize(_INPUT_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=list(_NORMALIZE_MEAN), std=list(_NORMALIZE_STD)),
])

# Validation/test: the same pipeline without the random augmentation steps.
test_transforms = transforms.Compose([
    transforms.Resize(_INPUT_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=list(_NORMALIZE_MEAN), std=list(_NORMALIZE_STD)),
])

class LightweightGazeClassifier(nn.Module):
    """Small three-stage CNN that maps an RGB image to gaze-class logits.

    Each stage is Conv3x3 -> BatchNorm -> ReLU -> MaxPool(2), taking the
    channels 3 -> 16 -> 32 -> 64 while halving height and width three times.
    The classifier head expects ``64 * 8 * 8`` features, so inputs must be
    ``(N, 3, 64, 64)``.

    Args:
        num_classes: Number of output logits.
        debug: When True, ``forward`` prints the tensor shape after every
            stage. Off by default because ``forward`` runs once per batch
            and the prints would otherwise flood the output.
    """

    def __init__(self, num_classes, debug=False):
        super().__init__()
        print(f"[INFO] Initializing LightweightGazeClassifier with {num_classes} classes")
        self.debug = debug

        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # 64 channels x 8 x 8 spatial positions (64 / 2 / 2 / 2 = 8).
        self.classifier = nn.Sequential(
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def _trace(self, stage, tensor):
        """Print ``tensor``'s shape for ``stage`` when debugging is enabled."""
        if self.debug:
            print(f"[DEBUG] {stage}: {tensor.shape}")

    def forward(self, x):
        """Return class logits of shape ``(N, num_classes)`` for batch ``x``."""
        self._trace("Input shape", x)
        x = self.features(x)
        self._trace("Shape after features", x)
        x = torch.flatten(x, 1)
        self._trace("Shape after flatten", x)
        x = self.classifier(x)
        self._trace("Output shape", x)
        return x

def load_datasets():
    """Create the train/valid/test datasets and their data loaders.

    The datasets are read from ``DATA_PATH``; the training split uses
    ``train_transforms`` (with augmentation) and the other two use
    ``test_transforms``. Only the training loader shuffles.

    Returns:
        Tuple ``(train_dataset, valid_dataset, test_dataset,
        train_loader, valid_loader, test_loader)``.

    Raises:
        Exception: Any error raised while building the datasets is printed
            and re-raised unchanged.
    """
    try:
        print("[INFO] Loading datasets...")
        train_dataset = GazeDataset(root_dir=DATA_PATH, split='train', transform=train_transforms)
        valid_dataset = GazeDataset(root_dir=DATA_PATH, split='valid', transform=test_transforms)
        test_dataset = GazeDataset(root_dir=DATA_PATH, split='test', transform=test_transforms)

        # Small batches keep memory use low; num_workers=0 loads the data in
        # the main process. Pinned memory only benefits host-to-GPU copies,
        # and requesting it without CUDA just produces a warning.
        loader_kwargs = {
            'batch_size': 8,
            'num_workers': 0,
            'pin_memory': torch.cuda.is_available(),
        }
        train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
        valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)
        test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

        print(f"[INFO] Number of training samples: {len(train_dataset)}")
        print(f"[INFO] Number of validation samples: {len(valid_dataset)}")
        print(f"[INFO] Number of test samples: {len(test_dataset)}")
        print(f"[INFO] Number of classes: {len(train_dataset.classes)}")
        print(f"[INFO] Classes: {train_dataset.classes}")

        return train_dataset, valid_dataset, test_dataset, train_loader, valid_loader, test_loader

    except Exception as e:
        print(f"[ERROR] Error loading datasets: {e}")
        raise

def _step_scheduler(scheduler, validated, val_loss, val_acc):
    """Advance ``scheduler`` by one epoch, supplying a metric when it needs one."""
    if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
        # ReduceLROnPlateau.step() requires the monitored value. Only feed it
        # freshly measured metrics, so skipped-validation epochs do not count
        # towards its patience.
        if validated:
            monitored = val_acc if getattr(scheduler, 'mode', 'min') == 'max' else val_loss
            scheduler.step(monitored)
        return
    scheduler.step()


def train_model(model, train_loader, valid_loader, criterion, optimizer, scheduler, num_epochs=10,
                validate_every=2):
    """Train ``model`` and return per-epoch loss/accuracy history.

    Validation runs on epochs ``0, validate_every, 2*validate_every, ...`` and
    always on the last epoch. Whenever validation accuracy improves on the
    best seen so far, the model's ``state_dict`` is saved to
    ``<DATA_PATH>/best_model.pth``.

    Args:
        model: Network to train; assumed to already be on ``device``.
        train_loader: Loader yielding ``(inputs, labels)`` training batches.
        valid_loader: Loader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler stepped once per epoch.
            ``ReduceLROnPlateau`` is stepped with validation accuracy
            (``mode='max'``) or validation loss (``mode='min'``) on validated
            epochs only; any other scheduler is stepped without arguments.
        num_epochs: Number of passes over ``train_loader``.
        validate_every: Validate every this many epochs (must be >= 1).

    Returns:
        Dict with keys ``train_loss``, ``train_acc``, ``val_loss`` and
        ``val_acc``; each maps to a list with one entry per epoch. On epochs
        without validation the previous validation values are repeated so
        that all four lists stay aligned.

    Raises:
        ValueError: If ``validate_every`` is smaller than 1.
    """
    if validate_every < 1:
        raise ValueError("validate_every must be at least 1")

    history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': []
    }
    best_val_acc = 0.0

    for epoch in range(num_epochs):
        start_time = time.time()
        print(f"[INFO] Starting epoch {epoch+1}/{num_epochs}")

        # ---- training phase ----
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        train_loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for batch_idx, (inputs, labels) in enumerate(train_loop):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # loss.item() is the batch mean; weight it by the batch size so
            # that the epoch value is a per-sample average.
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Limit debug output to the first batch to save time
            if batch_idx < 1:
                print(f"[DEBUG] Batch {batch_idx}: Loss={loss.item():.4f}, Predicted={predicted[:3].cpu().numpy()}, Labels={labels[:3].cpu().numpy()}")

            train_loop.set_postfix(loss=f"{loss.item():.4f}", accuracy=f"{100 * correct / total:.2f}%")

        # Average over the samples actually seen; guard against an empty loader.
        epoch_train_loss = running_loss / total if total else 0.0
        epoch_train_acc = 100 * correct / total if total else 0.0

        history['train_loss'].append(epoch_train_loss)
        history['train_acc'].append(epoch_train_acc)

        # Epoch 0 always validates (0 % k == 0), so the history is never
        # empty when a later epoch has to repeat the previous values.
        do_validate = (epoch % validate_every == 0) or (epoch == num_epochs - 1)

        if do_validate:
            # ---- validation phase ----
            model.eval()
            running_loss = 0.0
            correct = 0
            total = 0

            with torch.no_grad():
                for batch_idx, (inputs, labels) in enumerate(valid_loader):
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    running_loss += loss.item() * inputs.size(0)
                    _, predicted = torch.max(outputs, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

                    if batch_idx < 1:
                        print(f"[DEBUG] [Validation] Batch {batch_idx}: Loss={loss.item():.4f}, Predicted={predicted[:3].cpu().numpy()}, Labels={labels[:3].cpu().numpy()}")

            epoch_val_loss = running_loss / total if total else 0.0
            epoch_val_acc = 100 * correct / total if total else 0.0

            history['val_loss'].append(epoch_val_loss)
            history['val_acc'].append(epoch_val_acc)

            if epoch_val_acc > best_val_acc:
                best_val_acc = epoch_val_acc
                torch.save(model.state_dict(), os.path.join(DATA_PATH, 'best_model.pth'))
                print(f"[INFO] Model saved with validation accuracy: {best_val_acc:.2f}%")

            print(f"[INFO] Epoch {epoch+1}/{num_epochs} - "
                f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.2f}% - "
                f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_acc:.2f}% - ")
        else:
            # Not validating: repeat the previous values to keep history aligned.
            history['val_loss'].append(history['val_loss'][-1])
            history['val_acc'].append(history['val_acc'][-1])

            print(f"[INFO] Epoch {epoch+1}/{num_epochs} - "
                f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.2f}% - "
                f"Skipping validation")

        _step_scheduler(scheduler, do_validate, history['val_loss'][-1], history['val_acc'][-1])
        epoch_time = time.time() - start_time
        print(f"[INFO] Epoch time: {epoch_time:.1f}s")

    return history

def test_model(model, test_loader, criterion, classes):
    """Evaluate ``model`` on ``test_loader`` and report the results.

    Prints the test loss and accuracy, saves a confusion-matrix heatmap to
    ``<DATA_PATH>/confusion_matrix.png`` and prints a classification report.

    Args:
        model: Trained network; assumed to already be on ``device``.
        test_loader: Loader yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        classes: Class names ordered by label index; used as axis tick
            labels and as the report's target names.

    Returns:
        Tuple ``(test_loss, test_acc)``: mean per-sample loss and accuracy
        in percent.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        test_loop = tqdm(test_loader, desc="Testing")
        for batch_idx, (inputs, labels) in enumerate(test_loop):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # loss.item() is the batch mean; weight it by the batch size.
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            if batch_idx < 1:
                print(f"[DEBUG] [Testing] Batch {batch_idx}: Loss={loss.item():.4f}, Predicted={predicted[:3].cpu().numpy()}, Labels={labels[:3].cpu().numpy()}")

            test_loop.set_postfix(loss=f"{loss.item():.4f}", accuracy=f"{100 * correct / total:.2f}%")

    # Average over the samples actually seen; guard against an empty loader.
    test_loss = running_loss / total if total else 0.0
    test_acc = 100 * correct / total if total else 0.0

    print(f"[RESULT] Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.2f}%")

    # Pass the full label set explicitly. Otherwise scikit-learn infers it
    # from the values that occur, and a class missing from both the test
    # labels and the predictions would leave the matrix and report out of
    # step with ``classes`` (mislabelled heatmap ticks, or an error from
    # ``target_names``).
    label_ids = list(range(len(classes)))

    cm = confusion_matrix(all_labels, all_predictions, labels=label_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(os.path.join(DATA_PATH, 'confusion_matrix.png'))

    print("\n[RESULT] Classification Report:")
    print(classification_report(all_labels, all_predictions, labels=label_ids, target_names=classes))

    return test_loss, test_acc

def plot_history(history):
    """Plot training/validation loss and accuracy curves side by side.

    ``history`` must provide the lists ``train_loss``, ``val_loss``,
    ``train_acc`` and ``val_acc``. The figure is written to
    ``<DATA_PATH>/training_history.png`` and then shown.
    """
    # (train key, train label, validation key, validation label, title, y label)
    panels = (
        ('train_loss', 'Train Loss', 'val_loss', 'Validation Loss',
         'Loss over epochs', 'Loss'),
        ('train_acc', 'Train Accuracy', 'val_acc', 'Validation Accuracy',
         'Accuracy over epochs', 'Accuracy (%)'),
    )

    plt.figure(figsize=(12, 5))
    for position, panel in enumerate(panels, start=1):
        train_key, train_label, val_key, val_label, title, y_label = panel
        plt.subplot(1, 2, position)
        plt.plot(history[train_key], label=train_label)
        plt.plot(history[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    figure_path = os.path.join(DATA_PATH, 'training_history.png')
    plt.savefig(figure_path)
    plt.show()


def main():
    """Run the full pipeline: load data, train, plot history, then test.

    Trains a ``LightweightGazeClassifier`` for five epochs with Adam and a
    ``ReduceLROnPlateau`` scheduler. If ``<DATA_PATH>/best_model.pth`` exists
    after training, it is loaded before the model is evaluated on the test
    split. Any exception is reported with a traceback instead of being
    propagated, since this is the top-level entry point.
    """
    try:
        print("[INFO] Starting main execution...")
        train_dataset, valid_dataset, test_dataset, train_loader, valid_loader, test_loader = load_datasets()
        num_classes = len(train_dataset.classes)
        print(f"[INFO] Number of classes for model: {num_classes}")
        model = LightweightGazeClassifier(num_classes=num_classes)
        model = model.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
        # mode='max': the learning rate is halved when the monitored value
        # stops increasing. NOTE: this scheduler's step() requires that
        # monitored value as an argument.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

        print("[INFO] Starting training...")

        history = train_model(
                model=model,
                train_loader=train_loader,
                valid_loader=valid_loader,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                num_epochs=5)

        print("[INFO] Training complete. Plotting history...")
        plot_history(history)

        best_model_path = os.path.join(DATA_PATH, 'best_model.pth')
        if os.path.exists(best_model_path):
            # map_location lets a checkpoint written on a CUDA device be
            # loaded when the current run only has a CPU (and vice versa).
            model.load_state_dict(torch.load(best_model_path, map_location=device))
            print("[INFO] Loaded best model for testing")

        print("[INFO] Starting testing...")
        test_loss, test_acc = test_model(model, test_loader, criterion, classes=test_dataset.classes)
        print(f"[RESULT] Final test accuracy: {test_acc:.2f}%")

    except Exception as e:
        print(f"[FATAL] An error occurred: {e}")
        import traceback
        traceback.print_exc()

# if __name__ == "__main__":
#     start_time = time.time()
#     main()
#     total_time = time.time() - start_time
#     print(f"[INFO] Total execution time: {total_time:.2f} seconds")


if __name__ == "__main__":
    # Quick smoke test: train for two epochs on a small sample of the data
    # to check that the whole pipeline runs end to end.
    train_dataset, valid_dataset, _, _, _, _ = load_datasets()

    # GazeDataset lists its samples one class after another, so taking the
    # first N indices would sample (almost) only the first class. Draw the
    # indices at random instead; slicing caps the size at the dataset length.
    quick_train_indices = torch.randperm(len(train_dataset))[:100].tolist()
    quick_valid_indices = torch.randperm(len(valid_dataset))[:50].tolist()

    quick_train_dataset = torch.utils.data.Subset(train_dataset, quick_train_indices)
    quick_valid_dataset = torch.utils.data.Subset(valid_dataset, quick_valid_indices)
    quick_train_loader = DataLoader(quick_train_dataset, batch_size=8, shuffle=True, num_workers=0)
    quick_valid_loader = DataLoader(quick_valid_dataset, batch_size=8, shuffle=False, num_workers=0)

    # Quick test training setup
    print("[INFO] Starting quick test training...")
    num_classes = len(train_dataset.classes)
    model = LightweightGazeClassifier(num_classes=num_classes).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    # Multiply the learning rate by 0.9 after every epoch.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

    # Run training for 2 epochs
    train_model(model, quick_train_loader, quick_valid_loader, criterion, optimizer, scheduler, num_epochs=2)
    print("[INFO] Quick test completed successfully!")

