<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/_SMOTE_Cross_Validation_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from transformers import get_scheduler
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
import numpy as np
import logging

# Logging Configuration
# The threshold is WARNING, so INFO-level records sent through `logger`
# (e.g. the per-fold progress message in train_with_cross_validation) are
# filtered out unless the level is lowered here.
logging.basicConfig(level=logging.WARNING)
# Module-level logger used by the functions defined in this file.
logger = logging.getLogger(__name__)

# Define Model Class
class DummyModel(nn.Module):
    """Minimal classifier: one fully connected layer from features to logits.

    Args:
        input_size: Number of input features per sample. The default (3082)
            equals a flattened 3x32x32 image plus 10 sensor values.
        output_size: Number of output logits (one per class).
    """

    def __init__(self, input_size=3082, output_size=2):
        super().__init__()
        # The attribute name `fc` determines the parameter names
        # ("fc.weight", "fc.bias") in the state dict.
        self.fc = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Return the raw (un-normalised) logits for the batch `x`."""
        logits = self.fc(x)
        return logits

# Function to Apply SMOTE
def apply_smote(data):
    """Build a flat feature matrix from a data split and oversample it with SMOTE.

    Each image and each sensor reading is flattened to a 1-D row and the two
    are concatenated column-wise into one feature vector per sample.

    SMOTE is applied only when there is more than one class and the smallest
    class has more than one sample; ``k_neighbors`` is then
    ``min(5, smallest_class_size - 1)``. Otherwise a warning is logged and the
    features are returned without resampling.

    Args:
        data: Mapping with the keys ``"images"``, ``"sensors"`` and
            ``"labels"``, each holding one entry per sample.

    Returns:
        A ``(features, labels)`` tuple of tensors with dtypes ``float32`` and
        ``long`` respectively.
    """
    def _flatten(samples):
        # One row per sample; every remaining dimension is collapsed.
        return np.array(samples).reshape(len(samples), -1)

    images, sensors, labels = (data[key] for key in ("images", "sensors", "labels"))
    feature_matrix = np.hstack([_flatten(images), _flatten(sensors)])

    classes, counts = np.unique(labels, return_counts=True)
    min_samples = counts.min()

    if len(classes) <= 1 or min_samples <= 1:
        # SMOTE needs at least two classes and a neighbour inside the
        # minority class to interpolate with.
        logger.warning(f"Skipping SMOTE: insufficient samples (min_samples={min_samples}).")
        features_out, labels_out = feature_matrix, labels
    else:
        # k_neighbors may not exceed the minority class size minus one.
        neighbours = min(5, min_samples - 1)
        sampler = SMOTE(random_state=42, k_neighbors=neighbours)
        features_out, labels_out = sampler.fit_resample(feature_matrix, labels)

    return (
        torch.tensor(features_out, dtype=torch.float32),
        torch.tensor(labels_out, dtype=torch.long),
    )

# Training Function with Cross-Validation
def _flatten_split(split):
    """Turn a split into (features, labels) tensors without any resampling."""
    images, sensors = split["images"], split["sensors"]
    image_features = np.array(images).reshape(len(images), -1)
    sensor_features = np.array(sensors).reshape(len(sensors), -1)
    combined_features = np.hstack((image_features, sensor_features))
    return (
        torch.tensor(combined_features, dtype=torch.float32),
        torch.tensor(split["labels"], dtype=torch.long),
    )


def train_with_cross_validation(model_class, data, criterion, optimizer_class, scheduler_class, epochs=5, device='cpu', k_folds=5, batch_size=8, lr=2e-5):
    """Train a fresh model on each K-fold split and report its validation loss.

    For every fold the training portion is oversampled with ``apply_smote``,
    while the validation portion is only flattened: validation loss is
    measured on the original, un-resampled samples so that it is not computed
    on synthetic data.

    Args:
        model_class: Callable accepting ``input_size=`` and returning an
            ``nn.Module``; a new instance is created for every fold.
        data: Mapping with the keys ``"images"``, ``"sensors"`` and
            ``"labels"``, each indexable by sample position.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer_class: Optimizer constructor called as
            ``optimizer_class(model.parameters(), lr=lr)``.
        scheduler_class: Scheduler factory called with ``name='linear'``,
            ``optimizer``, ``num_warmup_steps`` and ``num_training_steps``.
        epochs: Number of passes over the training data per fold.
        device: Device the model and batches are moved to.
        k_folds: Number of cross-validation folds.
        batch_size: Batch size of the training and validation loaders.
        lr: Learning rate passed to the optimizer.

    Returns:
        A list with one entry per fold: the mean of the per-batch validation
        losses of that fold.
    """
    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    val_losses = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(data['images'])):
        logger.info(f"Training Fold {fold + 1}/{k_folds}...")

        train_data = {key: [data[key][i] for i in train_idx] for key in data}
        val_data = {key: [data[key][i] for i in val_idx] for key in data}

        # Oversample the training fold only; the validation fold must keep
        # its original samples and class balance.
        train_features, train_labels = apply_smote(train_data)
        val_features, val_labels = _flatten_split(val_data)

        train_loader = DataLoader(TensorDataset(train_features, train_labels), batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(TensorDataset(val_features, val_labels), batch_size=batch_size)

        model = model_class(input_size=train_features.shape[1]).to(device)
        optimizer = optimizer_class(model.parameters(), lr=lr)
        # The schedule length is counted in optimizer steps (batches), so the
        # scheduler has to be advanced once per batch below.
        num_training_steps = len(train_loader) * epochs
        scheduler = scheduler_class(
            name='linear',
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )

        for epoch in range(epochs):
            model.train()
            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}, Fold {fold+1}"):
                inputs, labels = batch[0].to(device), batch[1].to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                # One scheduler step per optimizer step, matching
                # num_training_steps; stepping once per epoch would leave
                # the learning rate almost undecayed.
                scheduler.step()

        # Validation Loop
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs, labels = batch[0].to(device), batch[1].to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
        val_losses.append(val_loss / len(val_loader))

    return val_losses

# Simulated Data: random images, sensor readings and binary labels
num_samples = 100
data = {
    "images": np.random.rand(num_samples, 3, 32, 32),
    "sensors": np.random.rand(num_samples, 10),
    "labels": np.random.randint(0, 2, num_samples),
}

# Loss Function and Training
loss_fn = nn.CrossEntropyLoss()
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    run_device = torch.device('cuda')
else:
    run_device = torch.device('cpu')

val_losses = train_with_cross_validation(
    model_class=DummyModel,
    data=data,
    criterion=loss_fn,
    optimizer_class=optim.AdamW,
    scheduler_class=get_scheduler,
    device=run_device,
)

# One validation loss per cross-validation fold.
print("Validation losses:", val_losses)