<a href="https://colab.research.google.com/github/Ponczeks/image-classification-comparison/blob/main/Cifar_10_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# NOTE(review): the cells below import `cupy` and `cuml`; `cup` matches neither name —
# confirm which package this line is meant to install (and consider pinning its version).
!pip install cup



# Przygotowanie danych

In [None]:
import torch
import numpy as np
from torchvision import datasets, transforms
from sklearn.decomposition import PCA
from skimage.feature import hog
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import random
import time
import psutil
from cuml.svm import LinearSVC
from sklearn.kernel_approximation import RBFSampler
import cupy as cp
from cuml.ensemble import RandomForestClassifier

# Seed every random number generator used in this notebook (Python, NumPy, PyTorch).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Also seed the CUDA generator and switch cuDNN to its deterministic mode.
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Urządzenie:", device)

# Convert each image to a tensor and flatten it into a single feature vector.
transform_flat = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.view(-1))
])
dataset = datasets.CIFAR10(root='./data',
                           train=True,
                           download=True,
                           transform=transform_flat)

# Number of samples taken from the training split.
N=50000
print(N)
# Materialise the first N samples as one feature tensor and one label tensor.
# The dataset is indexed twice (once per line), so the transform runs twice per sample.
X_raw = torch.stack([dataset[i][0] for i in range(N)])
y_raw = torch.tensor([dataset[i][1] for i in range(N)])
print(f"Rozmiar danych (klasyczne): {X_raw.shape}")
# Independent copy used as the clean reference by all code below.
X_raw_clean = X_raw.clone()

def add_gaussian_noise(X: torch.Tensor, std: float = 0.2) -> torch.Tensor:
    """Return ``X`` plus zero-mean Gaussian noise, clipped to the range [0, 1].

    Args:
        X: input tensor; it is not modified.
        std: standard deviation of the additive noise.

    Returns:
        A new tensor with the same shape as ``X``.
    """
    noise = torch.randn_like(X) * std
    noisy = X + noise
    return noisy.clamp(0.0, 1.0)


def add_salt_and_pepper_noise(X: torch.Tensor, amount: float = 0.1) -> torch.Tensor:
    """Return a copy of ``X`` with salt-and-pepper noise applied row by row.

    For every row, ``int(amount * X.shape[1])`` distinct positions are drawn
    with NumPy's global RNG and overwritten with a random 0.0 or 1.0.

    Args:
        X: 2-D tensor (rows are samples); it is not modified.
        amount: fraction of the entries in each row to corrupt.

    Returns:
        The corrupted copy of ``X``.
    """
    n_features = X.shape[1]
    n_corrupt = int(amount * n_features)
    corrupted = X.clone()
    for row_idx in range(X.shape[0]):
        positions = np.random.choice(n_features, n_corrupt, replace=False)
        values = np.random.choice([0.0, 1.0], size=len(positions))
        row = corrupted[row_idx]  # view into `corrupted`, so the write below lands in it
        row[positions] = torch.tensor(values).float()
    return corrupted


def add_label_noise(y: torch.Tensor, noise_ratio: float = 0.1, num_classes: int = 10) -> torch.Tensor:
    """Return a copy of ``y`` in which a fraction of labels is flipped (symmetric label noise).

    ``int(len(y) * noise_ratio)`` distinct positions are drawn with NumPy's
    global RNG; each selected label is replaced by a different class chosen
    uniformly with Python's ``random`` module.

    Args:
        y: 1-D tensor of integer class labels; it is not modified.
        noise_ratio: fraction of labels to flip.
        num_classes: number of classes; labels are drawn from ``range(num_classes)``.

    Returns:
        The label tensor with flipped entries.
    """
    total = len(y)
    flip_positions = np.random.choice(total, int(total * noise_ratio), replace=False)
    flipped = y.clone()
    for pos in flip_positions:
        alternatives = [label for label in range(num_classes) if label != y[pos]]
        flipped[pos] = random.choice(alternatives)
    return flipped

def extract_hog(X: torch.Tensor) -> torch.Tensor:
    """Compute one HOG descriptor per row of ``X``.

    Each row is reshaped to (3, 32, 32), moved to channels-last layout and
    passed to ``skimage.feature.hog`` with 9 orientations, 8x8-pixel cells and
    2x2-cell blocks.

    Args:
        X: CPU tensor whose rows are flattened 3x32x32 images.

    Returns:
        float32 tensor with one descriptor per input row.
    """
    def _describe(flat_image):
        image_hwc = flat_image.reshape(3, 32, 32).permute(1, 2, 0).numpy()
        return hog(
            image_hwc,
            orientations=9,
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            channel_axis=-1,
        )

    descriptors = [_describe(X[row]) for row in range(X.shape[0])]
    return torch.tensor(np.array(descriptors), dtype=torch.float32)



# Pre-compute every noisy variant once, so that all classical models are
# evaluated on exactly the same perturbed data.
print("\n=== PREKOMPUTACJA SZUMÓW ===")
precomputed_noisy_features = {}
precomputed_noisy_labels   = {}

# Gaussian pixel noise at three strengths, stored under "RAW_Gaussian_<std>".
for std in [0.1, 0.2, 0.3]:
    key = f"RAW_Gaussian_{std}"
    precomputed_noisy_features[key] = add_gaussian_noise(X_raw_clean, std)
    print(f"{key}: obliczony.")

# Salt-and-pepper noise; note that f"{0.10}" formats as "0.1", so the keys are
# "RAW_SP_0.05", "RAW_SP_0.1" and "RAW_SP_0.2".
for amt in [0.05, 0.10, 0.20]:
    key = f"RAW_SP_{amt}"
    precomputed_noisy_features[key] = add_salt_and_pepper_noise(X_raw_clean, amt)
    print(f"{key}: obliczony.")

# Flipped labels, stored under "RAW_Label_<percent>".
for noise in [0.1, 0.2, 0.3]:
    key = f"RAW_Label_{int(noise*100)}"
    precomputed_noisy_labels[key] = add_label_noise(y_raw, noise)
    print(f"{key}: obliczony.")



print("\n=== PREKOMPUTACJA CECH ===")
# pca_time covers both fitting the 50-component PCA and projecting the clean data.
# NOTE(review): the PCA is fitted on all N samples, including the ones that the
# cross-validation functions later use as test folds — confirm this is intended.
start_pca = time.perf_counter()
pca = PCA(n_components=50)
pca.fit(X_raw_clean.numpy())
X_pca_clean = torch.tensor(pca.transform(X_raw_clean.numpy()), dtype=torch.float32)
pca_time = time.perf_counter() - start_pca

# hog_time covers HOG extraction for the clean data only.
start_hog = time.perf_counter()
X_hog_clean = extract_hog(X_raw_clean)
hog_time = time.perf_counter() - start_hog
print(f"PCA: czas = {pca_time:.3f}s")
print(f"HOG: czas = {hog_time:.3f}s")
precomputed_noisy_pca = {}
precomputed_noisy_hog = {}

# Project / describe every noisy feature set with the PCA fitted on clean data
# and with the same HOG settings; results reuse the keys of precomputed_noisy_features.
print("\n=== PREKOMPUTACJA CECH (dla zaszumionych danych testowych) ===")
for key, X_noisy in precomputed_noisy_features.items():
    print(f"{key} - PCA i HOG...")
    precomputed_noisy_pca[key] = torch.tensor(pca.transform(X_noisy.numpy()), dtype=torch.float32)
    precomputed_noisy_hog[key] = extract_hog(X_noisy)
print("Zakończono prekomputację cech zaszumionych.")



Urządzenie: cuda
50000
Rozmiar danych (klasyczne): torch.Size([50000, 3072])

=== PREKOMPUTACJA SZUMÓW ===
RAW_Gaussian_0.1: obliczony.
RAW_Gaussian_0.2: obliczony.
RAW_Gaussian_0.3: obliczony.
RAW_SP_0.05: obliczony.
RAW_SP_0.1: obliczony.
RAW_SP_0.2: obliczony.
RAW_Label_10: obliczony.
RAW_Label_20: obliczony.
RAW_Label_30: obliczony.

=== PREKOMPUTACJA CECH ===
PCA: czas = 4.537s
HOG: czas = 26.817s

=== PREKOMPUTACJA CECH (dla zaszumionych danych testowych) ===
RAW_Gaussian_0.1 - PCA i HOG...
RAW_Gaussian_0.2 - PCA i HOG...
RAW_Gaussian_0.3 - PCA i HOG...
RAW_SP_0.05 - PCA i HOG...
RAW_SP_0.1 - PCA i HOG...
RAW_SP_0.2 - PCA i HOG...
Zakończono prekomputację cech zaszumionych.


# KNN

In [None]:

def knn_classify(x_train: torch.Tensor,
                 y_train: torch.Tensor,
                 x_test:  torch.Tensor,
                 k: int = 3) -> torch.Tensor:
    """Brute-force k-nearest-neighbour classification with Euclidean distance.

    Args:
        x_train: (N_train, D) training features.
        y_train: (N_train,) training labels.
        x_test: (N_test, D) query features.
        k: number of neighbours that vote for each query.

    Returns:
        (N_test,) tensor with the most frequent label among the k nearest
        training samples of every query (as returned by ``torch.mode``).
    """
    pairwise = torch.cdist(x_test, x_train, p=2)
    nearest_idx = pairwise.topk(k, largest=False).indices
    votes = y_train[nearest_idx]
    majority = votes.mode(dim=1)
    return majority.values


def crossval_metrics_knn(modality, noise_key=None, label_noise_key=None, k=3):
    """Evaluate the brute-force k-NN classifier with 5-fold stratified cross-validation.

    Training always uses the clean features and clean labels; the optional
    noise keys only replace the *test* features / test labels of each fold.

    Args:
        modality: "RAW", "PCA" or "HOG" – which pre-computed feature set to use.
        noise_key: key of a pre-computed noisy feature set, or None for clean test features.
        label_noise_key: key into ``precomputed_noisy_labels``, or None for clean test labels.
        k: number of neighbours.

    Returns:
        ``(metrics, timing)`` where ``metrics`` maps "accuracy",
        "precision_macro", "recall_macro" and "f1_macro" to a list with one
        value per fold, and ``timing`` holds "train_time", "pred_time" and
        "total_time" in seconds, summed over the folds.
        NOTE(review): the SVM and Random Forest helpers in this notebook report
        the per-fold *mean* instead of the sum — confirm which one the
        comparison tables should use.

    Raises:
        ValueError: if ``modality`` is not one of the supported names.
    """
    if modality == "RAW":
        clean_features, noisy_lookup = X_raw_clean, precomputed_noisy_features
    elif modality == "PCA":
        clean_features, noisy_lookup = X_pca_clean, precomputed_noisy_pca
    elif modality == "HOG":
        clean_features, noisy_lookup = X_hog_clean, precomputed_noisy_hog
    else:
        raise ValueError("Nieznany typ ekstrakcji cech")

    test_features = noisy_lookup[noise_key] if noise_key else clean_features
    test_labels = precomputed_noisy_labels[label_noise_key] if label_noise_key else y_raw

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = {m: [] for m in ["accuracy", "precision_macro", "recall_macro", "f1_macro"]}
    # k-NN has no fitting step, so the training time is zero by definition.
    total_train_time = 0.0
    total_pred_time = 0.0
    X_np = X_raw_clean.cpu().numpy()
    y_np = y_raw.cpu().numpy()

    def _wait_for_gpu():
        # CUDA kernels run asynchronously; without this barrier the timer would
        # stop before the distance computation has actually finished.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    for train_idx, test_idx in skf.split(X_np, y_np):
        x_train = clean_features[train_idx]
        y_train = y_raw[train_idx]
        x_test = test_features[test_idx]
        y_test = test_labels[test_idx]

        _wait_for_gpu()
        t_start = time.perf_counter()
        y_pred = knn_classify(x_train.to(device), y_train.to(device), x_test.to(device), k)
        _wait_for_gpu()
        total_pred_time += time.perf_counter() - t_start

        yt = y_test.cpu().numpy()
        yp = y_pred.cpu().numpy()
        metrics["accuracy"].append(accuracy_score(yt, yp))
        metrics["precision_macro"].append(precision_score(yt, yp, average="macro", zero_division=0))
        metrics["recall_macro"].append(recall_score(yt, yp, average="macro", zero_division=0))
        metrics["f1_macro"].append(f1_score(yt, yp, average="macro", zero_division=0))

    timing = {
        "train_time": total_train_time,
        "pred_time": total_pred_time,
        "total_time": total_train_time + total_pred_time
    }
    return metrics, timing

def summarize_metrics(metrics):
    """Format per-fold metric values as ``"mean ± std"`` strings with four decimals.

    Args:
        metrics: mapping from metric name to a sequence of per-fold values.

    Returns:
        Dict with the same keys, each mapped to its formatted summary string.
    """
    summary = {}
    for metric_name, fold_values in metrics.items():
        mean_value = np.mean(fold_values)
        spread = np.std(fold_values)
        summary[metric_name] = f"{mean_value:.4f} ± {spread:.4f}"
    return summary

def run_gridsearch_knn(modality, noise_key, label_noise_key, name):
    """Cross-validate k-NN for k in {1, 3, 5, 7} and keep the k with the best macro F1.

    Args:
        modality: "RAW", "PCA" or "HOG" feature set.
        noise_key: key of the noisy test features, or None.
        label_noise_key: key of the noisy test labels, or None.
        name: human-readable experiment name, printed and stored in the result.

    Returns:
        Dict with "name", "params" (``"k=<best>"``), "metrics" (formatted
        summary of the best run), "timing" (timing dict of the best run) and
        "precomp_time" (feature pre-computation time of the modality).
    """
    print(f"\n{name}")
    best_f1 = -1
    best_k = None
    best_summary = None
    best_timing = None

    for k in [1, 3, 5, 7]:
        print(f"  k={k}")
        metrics, timing = crossval_metrics_knn(modality, noise_key, label_noise_key, k)
        summary = summarize_metrics(metrics)
        # Rank candidates by the exact mean F1 instead of parsing the 4-decimal
        # display string, which loses precision and depends on the text format.
        f1_val = float(np.mean(metrics["f1_macro"]))
        print(f"    F1_macro: {summary['f1_macro']} | Recall_macro: {summary['recall_macro']} | Precision_macro: {summary['precision_macro']} | Accuracy: {summary['accuracy']} | Train: {timing['train_time']:.3f}s | Pred: {timing['pred_time']:.3f}s")

        if f1_val > best_f1:
            best_f1 = f1_val
            best_k = k
            best_summary = summary
            best_timing = timing

    # Same lookup as the SVM / Random Forest grid-search helpers in this notebook.
    precomp_time = {"RAW": 0.0, "PCA": pca_time, "HOG": hog_time}.get(modality, 0.0)

    print(f"Najlepsze k={best_k}")
    return {
        "name": name,
        "params": f"k={best_k}",
        "metrics": best_summary,
        "timing": best_timing,
        "precomp_time": precomp_time
    }

# Experiment tuples: (display name, modality, noisy-feature key, noisy-label key).
# Every modality gets a clean baseline, three Gaussian levels, three
# salt-and-pepper levels and three label-noise levels, in that order.
knn_experiments = []
for modality in ("RAW", "PCA", "HOG"):
    knn_experiments.append((modality, modality, None, None))
    for std in [0.1, 0.2, 0.3]:
        knn_experiments.append((f"{modality} + Gaussian (std={std})", modality, f"RAW_Gaussian_{std}", None))
    for amt in [0.05, 0.1, 0.2]:
        knn_experiments.append((f"{modality} + S&P (amt={amt})", modality, f"RAW_SP_{amt}", None))
    for noise in [0.1, 0.2, 0.3]:
        knn_experiments.append(
            (f"{modality} + LabelNoise ({int(noise*100)}%)", modality, None, f"RAW_Label_{int(noise*100)}")
        )

# Run the grid search for every experiment, keeping the order of the list.
knn_results = [
    run_gridsearch_knn(exp_modality, exp_noise_key, exp_label_key, exp_name)
    for exp_name, exp_modality, exp_noise_key, exp_label_key in knn_experiments
]

# Fixed-width results table.
print("\nTabela wyników kNN:")
print(" | ".join([
    "Metoda".ljust(35),
    "Parametry".ljust(10),
    "Accuracy".ljust(20),
    "Precision_macro".ljust(20),
    "Recall_macro".ljust(20),
    "F1_macro".ljust(20),
    "Train [s]".ljust(10),
    "Pred [s]".ljust(10),
]))
print("-" * 160)
for r in knn_results:
    print(" | ".join([
        f"{r['name']:<35}",
        f"{r['params']:<10}",
        f"{r['metrics']['accuracy']:<20}",
        f"{r['metrics']['precision_macro']:<20}",
        f"{r['metrics']['recall_macro']:<20}",
        f"{r['metrics']['f1_macro']:<20}",
        f"{r['timing']['train_time']:<10.3f}",
        f"{r['timing']['pred_time']:<10.3f}",
    ]))




RAW
  k=1
    F1_macro: 0.3345 ± 0.0025 | Recall_macro: 0.3405 ± 0.0018 | Precision_macro: 0.3970 ± 0.0026 | Accuracy: 0.3405 ± 0.0018 | Train: 0.000s | Pred: 0.650s
  k=3
    F1_macro: 0.3111 ± 0.0025 | Recall_macro: 0.3236 ± 0.0024 | Precision_macro: 0.4232 ± 0.0066 | Accuracy: 0.3236 ± 0.0024 | Train: 0.000s | Pred: 0.662s
  k=5
    F1_macro: 0.3256 ± 0.0022 | Recall_macro: 0.3384 ± 0.0019 | Precision_macro: 0.4236 ± 0.0051 | Accuracy: 0.3384 ± 0.0019 | Train: 0.000s | Pred: 0.659s
  k=7
    F1_macro: 0.3158 ± 0.0025 | Recall_macro: 0.3316 ± 0.0025 | Precision_macro: 0.4304 ± 0.0022 | Accuracy: 0.3316 ± 0.0025 | Train: 0.000s | Pred: 0.654s
Najlepsze k=1

RAW + Gaussian (std=0.1)
  k=1
    F1_macro: 0.3270 ± 0.0030 | Recall_macro: 0.3340 ± 0.0020 | Precision_macro: 0.3938 ± 0.0035 | Accuracy: 0.3340 ± 0.0020 | Train: 0.000s | Pred: 0.650s
  k=3
    F1_macro: 0.3053 ± 0.0036 | Recall_macro: 0.3198 ± 0.0026 | Precision_macro: 0.4240 ± 0.0027 | Accuracy: 0.3198 ± 0.0026 | Train: 0.000

# SVM


In [None]:

def crossval_metrics_svm(modality, noise_key=None, label_noise_key=None, svm_params={"C": 1, "gamma": 0.01}):
    """Evaluate an approximate RBF-kernel SVM with 5-fold stratified cross-validation.

    In every fold an ``RBFSampler`` (2000 components) is fitted on the clean
    training features and a cuML ``LinearSVC`` is trained on the mapped
    features. Only the ``LinearSVC`` fit and predict calls are timed. The
    optional noise keys replace the test features / test labels only.

    Args:
        modality: "RAW", "PCA" or "HOG" feature set.
        noise_key: key of the noisy test features, or None.
        label_noise_key: key of the noisy test labels, or None.
        svm_params: dict with "C" (LinearSVC) and "gamma" (RBFSampler).

    Returns:
        ``(metrics, timing)``: ``metrics`` maps each metric name to a
        ``(mean, std)`` tuple over the folds; ``timing`` holds the per-fold
        mean "train_time", "pred_time" and their sum as "total_time".

    Raises:
        ValueError: if ``modality`` is not one of the supported names.
    """
    # Resolve the feature sources once; the fold loop below only slices them.
    if modality == "RAW":
        clean_source, noisy_source = X_raw_clean, precomputed_noisy_features
    elif modality == "PCA":
        clean_source, noisy_source = X_pca_clean, precomputed_noisy_pca
    elif modality == "HOG":
        clean_source, noisy_source = X_hog_clean, precomputed_noisy_hog
    else:
        raise ValueError("Nieznany typ ekstrakcji cech")
    test_source = noisy_source[noise_key] if noise_key else clean_source
    test_labels = precomputed_noisy_labels[label_noise_key] if label_noise_key else y_raw

    scores = {"accuracy": [], "precision_macro": [], "recall_macro": [], "f1_macro": []}
    fit_durations, predict_durations = [], []
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_idx, test_idx in splitter.split(X_raw_clean.cpu().numpy(), y_raw.cpu().numpy()):
        train_np = clean_source[train_idx].cpu().numpy()
        test_np = test_source[test_idx].cpu().numpy()

        # Random Fourier features approximate the RBF kernel.
        rff = RBFSampler(gamma=svm_params["gamma"], n_components=2000, random_state=42)
        train_mapped = cp.asarray(rff.fit_transform(train_np))
        test_mapped = cp.asarray(rff.transform(test_np))
        train_targets = cp.asarray(y_raw[train_idx].cpu().numpy())

        fit_start = time.perf_counter()
        classifier = LinearSVC(C=svm_params["C"])
        classifier.fit(train_mapped, train_targets)
        fit_durations.append(time.perf_counter() - fit_start)

        predict_start = time.perf_counter()
        predicted_cp = classifier.predict(test_mapped)
        predict_durations.append(time.perf_counter() - predict_start)

        predicted = cp.asnumpy(predicted_cp)
        expected = test_labels[test_idx].cpu().numpy()

        scores["accuracy"].append(accuracy_score(expected, predicted))
        scores["precision_macro"].append(precision_score(expected, predicted, average="macro", zero_division=0))
        scores["recall_macro"].append(recall_score(expected, predicted, average="macro", zero_division=0))
        scores["f1_macro"].append(f1_score(expected, predicted, average="macro", zero_division=0))

    mean_fit = np.mean(fit_durations)
    mean_predict = np.mean(predict_durations)
    timing = {
        "train_time": mean_fit,
        "pred_time": mean_predict,
        "total_time": mean_fit + mean_predict,
    }
    metrics = {metric: (np.mean(values), np.std(values)) for metric, values in scores.items()}
    return metrics, timing

def run_gridsearch_svm(modality, noise_key, label_noise_key, name, param_grid):
    """Cross-validate every SVM parameter set and keep the one with the best macro F1.

    Args:
        modality: "RAW", "PCA" or "HOG" feature set.
        noise_key: key of the noisy test features, or None.
        label_noise_key: key of the noisy test labels, or None.
        name: human-readable experiment name, printed and stored in the result.
        param_grid: iterable of parameter dicts passed to ``crossval_metrics_svm``.

    Returns:
        Dict with "name", "params" (best parameter dict), "metrics" (formatted
        ``"mean ± std"`` strings of the best run), "timing" and "precomp_time".
    """
    print(f"\n{name}")
    best_f1 = -1
    best_params = None
    best_metrics = None
    best_timing = None

    for params in param_grid:
        print(f"  Params: {params}")
        metrics, timing = crossval_metrics_svm(modality, noise_key, label_noise_key, svm_params=params)
        summary = {k: f"{v[0]:.4f} ± {v[1]:.4f}" for k, v in metrics.items()}
        # Compare the exact mean F1; parsing the 4-decimal display string back
        # into a float would lose precision and depend on the text format.
        f1_val = float(metrics["f1_macro"][0])
        print(f"    F1_macro: {summary['f1_macro']} | Recall_macro: {summary['recall_macro']} | Accuracy: {summary['accuracy']} | Train: {timing['train_time']:.3f}s | Pred: {timing['pred_time']:.3f}s")

        if f1_val > best_f1:
            best_f1 = f1_val
            best_params = params
            best_metrics = summary
            best_timing = timing

    # Time spent pre-computing the features of this modality (0 for raw pixels).
    precomp_time = {"RAW": 0.0, "PCA": pca_time, "HOG": hog_time}.get(modality, 0.0)

    print(f"Najlepsze parametry: {best_params}")
    return {
        "name": name,
        "params": best_params,
        "metrics": best_metrics,
        "timing": best_timing,
        "precomp_time": precomp_time
    }

# Experiment tuples: (display name, modality, noisy-feature key, noisy-label key).
# Every modality gets a clean baseline, three Gaussian levels, three
# salt-and-pepper levels and three label-noise levels, in that order.
svm_experiments = []
for modality in ("RAW", "PCA", "HOG"):
    svm_experiments.append((modality, modality, None, None))
    for std in [0.1, 0.2, 0.3]:
        svm_experiments.append((f"{modality} + Gaussian (std={std})", modality, f"RAW_Gaussian_{std}", None))
    for amt in [0.05, 0.1, 0.2]:
        svm_experiments.append((f"{modality} + S&P (amt={amt})", modality, f"RAW_SP_{amt}", None))
    for noise in [0.1, 0.2, 0.3]:
        svm_experiments.append(
            (f"{modality} + LabelNoise ({int(noise*100)}%)", modality, None, f"RAW_Label_{int(noise*100)}")
        )

# Candidate hyper-parameters: C is fixed, only the RBF gamma varies.
svm_param_grid = [{"C": 1, "gamma": gamma_value} for gamma_value in (0.01, 0.001, 0.1)]

# Run the grid search for every experiment, keeping the order of the list.
svm_results = [
    run_gridsearch_svm(exp_modality, exp_noise_key, exp_label_key, exp_name, svm_param_grid)
    for exp_name, exp_modality, exp_noise_key, exp_label_key in svm_experiments
]

# Fixed-width results table.
print("\nTabela wyników SVM:")
print(" | ".join([
    "Metoda".ljust(35),
    "Parametry".ljust(25),
    "Accuracy".ljust(20),
    "Precision_macro".ljust(20),
    "Recall_macro".ljust(20),
    "F1_macro".ljust(20),
    "Train [s]".ljust(10),
    "Pred [s]".ljust(10),
]))
print("-" * 160)
for r in svm_results:
    print(" | ".join([
        f"{r['name']:<35}",
        f"{str(r['params']):<25}",
        f"{r['metrics']['accuracy']:<20}",
        f"{r['metrics']['precision_macro']:<20}",
        f"{r['metrics']['recall_macro']:<20}",
        f"{r['metrics']['f1_macro']:<20}",
        f"{r['timing']['train_time']:<10.3f}",
        f"{r['timing']['pred_time']:<10.3f}",
    ]))



RAW
  Params: {'C': 1, 'gamma': 0.01}
    F1_macro: 0.3923 ± 0.0067 | Recall_macro: 0.4013 ± 0.0070 | Accuracy: 0.4013 ± 0.0070 | Train: 0.502s | Pred: 0.005s
  Params: {'C': 1, 'gamma': 0.001}
    F1_macro: 0.3678 ± 0.0075 | Recall_macro: 0.3777 ± 0.0062 | Accuracy: 0.3777 ± 0.0062 | Train: 0.550s | Pred: 0.004s
  Params: {'C': 1, 'gamma': 0.1}
    F1_macro: 0.1060 ± 0.0040 | Recall_macro: 0.1063 ± 0.0039 | Accuracy: 0.1063 ± 0.0039 | Train: 0.098s | Pred: 0.009s
Najlepsze parametry: {'C': 1, 'gamma': 0.01}

RAW + Gaussian (std=0.1)
  Params: {'C': 1, 'gamma': 0.01}
    F1_macro: 0.3736 ± 0.0049 | Recall_macro: 0.3820 ± 0.0051 | Accuracy: 0.3820 ± 0.0051 | Train: 0.500s | Pred: 0.004s
  Params: {'C': 1, 'gamma': 0.001}
    F1_macro: 0.3655 ± 0.0071 | Recall_macro: 0.3753 ± 0.0057 | Accuracy: 0.3753 ± 0.0057 | Train: 0.554s | Pred: 0.004s
  Params: {'C': 1, 'gamma': 0.1}
    F1_macro: 0.1012 ± 0.0028 | Recall_macro: 0.1013 ± 0.0027 | Accuracy: 0.1013 ± 0.0027 | Train: 0.098s | Pred: 0

# Random Forest

In [None]:
def crossval_metrics_rf(modality, noise_key=None, label_noise_key=None, rf_params={"n_estimators": 100, "max_depth": 20}):
    """Evaluate a cuML random forest with 5-fold stratified cross-validation.

    The forest is always trained on clean features and clean labels; the
    optional noise keys replace the test features / test labels only. The
    timers cover model construction + ``fit`` and ``predict`` respectively.

    Args:
        modality: "RAW", "PCA" or "HOG" feature set.
        noise_key: key of the noisy test features, or None.
        label_noise_key: key of the noisy test labels, or None.
        rf_params: keyword arguments forwarded to ``RandomForestClassifier``.

    Returns:
        ``(metrics, timing)``: ``metrics`` maps each metric name to a
        ``(mean, std)`` tuple over the folds; ``timing`` holds the per-fold
        mean "train_time", "pred_time" and their sum as "total_time".

    Raises:
        ValueError: if ``modality`` is not one of the supported names.
    """
    # Resolve the feature sources once; the fold loop below only slices them.
    if modality == "RAW":
        clean_source, noisy_source = X_raw_clean, precomputed_noisy_features
    elif modality == "PCA":
        clean_source, noisy_source = X_pca_clean, precomputed_noisy_pca
    elif modality == "HOG":
        clean_source, noisy_source = X_hog_clean, precomputed_noisy_hog
    else:
        raise ValueError("Nieznany typ ekstrakcji cech")
    test_source = noisy_source[noise_key] if noise_key else clean_source
    test_labels = precomputed_noisy_labels[label_noise_key] if label_noise_key else y_raw

    scores = {"accuracy": [], "precision_macro": [], "recall_macro": [], "f1_macro": []}
    fit_durations, predict_durations = [], []
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_idx, test_idx in splitter.split(X_raw_clean.cpu().numpy(), y_raw.cpu().numpy()):
        # Move the fold to the GPU as CuPy arrays.
        train_features = cp.asarray(clean_source[train_idx].cpu().numpy())
        test_features = cp.asarray(test_source[test_idx].cpu().numpy())
        train_targets = cp.asarray(y_raw[train_idx].cpu().numpy())

        fit_start = time.perf_counter()
        forest = RandomForestClassifier(**rf_params)
        forest.fit(train_features, train_targets)
        fit_durations.append(time.perf_counter() - fit_start)

        predict_start = time.perf_counter()
        predicted_cp = forest.predict(test_features)
        predict_durations.append(time.perf_counter() - predict_start)

        predicted = cp.asnumpy(predicted_cp)
        expected = test_labels[test_idx].cpu().numpy()

        scores["accuracy"].append(accuracy_score(expected, predicted))
        scores["precision_macro"].append(precision_score(expected, predicted, average="macro", zero_division=0))
        scores["recall_macro"].append(recall_score(expected, predicted, average="macro", zero_division=0))
        scores["f1_macro"].append(f1_score(expected, predicted, average="macro", zero_division=0))

    mean_fit = np.mean(fit_durations)
    mean_predict = np.mean(predict_durations)
    timing = {
        "train_time": mean_fit,
        "pred_time": mean_predict,
        "total_time": mean_fit + mean_predict,
    }
    metrics = {metric: (np.mean(values), np.std(values)) for metric, values in scores.items()}
    return metrics, timing


def run_gridsearch_rf(modality, noise_key, label_noise_key, name, param_grid):
    """Cross-validate every random-forest parameter set and keep the one with the best macro F1.

    Args:
        modality: "RAW", "PCA" or "HOG" feature set.
        noise_key: key of the noisy test features, or None.
        label_noise_key: key of the noisy test labels, or None.
        name: human-readable experiment name, printed and stored in the result.
        param_grid: iterable of parameter dicts passed to ``crossval_metrics_rf``.

    Returns:
        Dict with "name", "params" (best parameter dict), "metrics" (formatted
        ``"mean ± std"`` strings of the best run), "timing" and "precomp_time".
    """
    print(f"\n{name}")
    best_f1 = -1
    best_params = None
    best_metrics = None
    best_timing = None

    for params in param_grid:
        print(f"  Params: {params}")
        metrics, timing = crossval_metrics_rf(modality, noise_key, label_noise_key, rf_params=params)
        summary = {k: f"{v[0]:.4f} ± {v[1]:.4f}" for k, v in metrics.items()}
        # Compare the exact mean F1; parsing the 4-decimal display string back
        # into a float would lose precision and depend on the text format.
        f1_val = float(metrics["f1_macro"][0])
        print(f"    Accuracy: {summary['accuracy']} | Precision: {summary['precision_macro']} | Recall: {summary['recall_macro']} | F1_macro: {summary['f1_macro']} | Train: {timing['train_time']:.3f}s | Pred: {timing['pred_time']:.3f}s")

        if f1_val > best_f1:
            best_f1 = f1_val
            best_params = params
            best_metrics = summary
            best_timing = timing

    # Time spent pre-computing the features of this modality (0 for raw pixels).
    precomp_time = {"RAW": 0.0, "PCA": pca_time, "HOG": hog_time}.get(modality, 0.0)

    print(f"Najlepsze parametry: {best_params}")
    return {
        "name": name,
        "params": best_params,
        "metrics": best_metrics,
        "timing": best_timing,
        "precomp_time": precomp_time
    }

# Scenarios: (display name, modality, noisy-feature key, noisy-label key).
# Every modality gets a clean baseline, three Gaussian levels, three
# salt-and-pepper levels and three label-noise levels, in that order.
rf_experiments = []
for modality in ["RAW", "PCA", "HOG"]:
    rf_experiments.append((modality, modality, None, None))
    for std in [0.1, 0.2, 0.3]:
        rf_experiments.append((f"{modality} + Gaussian (std={std})", modality, f"RAW_Gaussian_{std}", None))
    for amt in [0.05, 0.1, 0.2]:
        rf_experiments.append((f"{modality} + S&P (amt={amt})", modality, f"RAW_SP_{amt}", None))
    for noise in [0.1, 0.2, 0.3]:
        rf_experiments.append(
            (f"{modality} + LabelNoise ({int(noise*100)}%)", modality, None, f"RAW_Label_{int(noise*100)}")
        )

# Random-forest parameter grid: two depths x two feature-subsampling rules.
rf_param_grid = [
    {"n_estimators": 100, "max_depth": depth, "max_features": feature_rule}
    for depth in [20, 30]
    for feature_rule in ["sqrt", "log2"]
]

# Run the grid search for every scenario, keeping the order of the list.
rf_results = [
    run_gridsearch_rf(exp_modality, exp_noise_key, exp_label_key, exp_name, rf_param_grid)
    for exp_name, exp_modality, exp_noise_key, exp_label_key in rf_experiments
]

# Fixed-width results table.
print("\nTabela wyników Random Forest:")
print(" | ".join([
    "Metoda".ljust(35),
    "Parametry".ljust(25),
    "Accuracy".ljust(20),
    "Precision_macro".ljust(20),
    "Recall_macro".ljust(20),
    "F1_macro".ljust(20),
    "Train [s]".ljust(10),
    "Pred [s]".ljust(10),
]))
print("-" * 160)
for r in rf_results:
    print(" | ".join([
        f"{r['name']:<35}",
        f"{str(r['params']):<25}",
        f"{r['metrics']['accuracy']:<20}",
        f"{r['metrics']['precision_macro']:<20}",
        f"{r['metrics']['recall_macro']:<20}",
        f"{r['metrics']['f1_macro']:<20}",
        f"{r['timing']['train_time']:<10.3f}",
        f"{r['timing']['pred_time']:<10.3f}",
    ]))



RAW
  Params: {'n_estimators': 100, 'max_depth': 20, 'max_features': 'sqrt'}
    Accuracy: 0.4552 ± 0.0061 | Precision: 0.4517 ± 0.0064 | Recall: 0.4552 ± 0.0061 | F1_macro: 0.4505 ± 0.0061 | Train: 2.532s | Pred: 0.127s
  Params: {'n_estimators': 100, 'max_depth': 20, 'max_features': 'log2'}
    Accuracy: 0.4448 ± 0.0024 | Precision: 0.4416 ± 0.0024 | Recall: 0.4448 ± 0.0024 | F1_macro: 0.4396 ± 0.0025 | Train: 1.030s | Pred: 0.132s
  Params: {'n_estimators': 100, 'max_depth': 30, 'max_features': 'sqrt'}
    Accuracy: 0.4559 ± 0.0047 | Precision: 0.4523 ± 0.0049 | Recall: 0.4559 ± 0.0047 | F1_macro: 0.4519 ± 0.0047 | Train: 2.840s | Pred: 0.136s
  Params: {'n_estimators': 100, 'max_depth': 30, 'max_features': 'log2'}
    Accuracy: 0.4463 ± 0.0035 | Precision: 0.4429 ± 0.0035 | Recall: 0.4463 ± 0.0035 | F1_macro: 0.4421 ± 0.0034 | Train: 1.144s | Pred: 0.156s
Najlepsze parametry: {'n_estimators': 100, 'max_depth': 30, 'max_features': 'sqrt'}

RAW + Gaussian (std=0.1)
  Params: {'n_est

# CNN

In [None]:
import torch, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
import numpy as np, random, time

# Training configuration for the CNN experiments.
SEED = 42
BATCH_SIZE = 256
EPOCHS = 60      # upper bound on epochs per fold
PATIENCE = 10    # epochs without F1 improvement before training stops early
N = 50000        # number of samples taken from the training split

# Re-seed all random number generators and make cuDNN deterministic.
random.seed(SEED); np.random.seed(SEED)
torch.manual_seed(SEED); torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True; torch.backends.cudnn.benchmark = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU:", device)


# Per-channel mean / std used to normalise the images.
_CIFAR_MEAN, _CIFAR_STD = (0.4914,0.4822,0.4465), (0.2470,0.2435,0.2616)
normalize_tf = transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD)
full_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
# NOTE: these two lines rebind X_raw / y_raw from the first cell; here X_raw holds
# normalised image tensors (not flattened), so values are no longer limited to [0, 1].
X_raw = torch.stack([normalize_tf(full_dataset[i][0]) for i in range(N)])
y_raw = torch.tensor([full_dataset[i][1] for i in range(N)])


def add_label_noise(y, ratio, n_cls=10):
    """Return a copy of ``y`` with ``int(ratio * len(y))`` labels flipped to a different class.

    Positions are drawn with NumPy's global RNG, replacement classes with
    Python's ``random`` module.

    NOTE(review): this definition shadows the earlier ``add_label_noise`` from
    the data-preparation cell and uses different parameter names
    (``ratio`` / ``n_cls``); keyword calls written for the first version stop
    working once this cell has run.

    Args:
        y: 1-D tensor of integer class labels; it is not modified.
        ratio: fraction of labels to flip.
        n_cls: number of classes; labels are drawn from ``range(n_cls)``.

    Returns:
        The label tensor with flipped entries.
    """
    n_flip = int(ratio * len(y))
    flip_positions = np.random.choice(len(y), n_flip, replace=False)
    flipped = y.clone()
    for pos in flip_positions:
        alternatives = [label for label in range(n_cls) if label != y[pos]]
        flipped[pos] = random.choice(alternatives)
    return flipped

def _cifar_channel_stats(x):
    """Return the normalisation mean / std shaped (C, 1, 1) so they broadcast over (..., C, H, W)."""
    mean = torch.tensor(_CIFAR_MEAN, dtype=x.dtype, device=x.device).view(-1, 1, 1)
    std = torch.tensor(_CIFAR_STD, dtype=x.dtype, device=x.device).view(-1, 1, 1)
    return mean, std


def add_gauss(x, std):
    """Add Gaussian pixel noise to a normalised image tensor.

    The images in this cell are normalised with ``_CIFAR_MEAN`` / ``_CIFAR_STD``,
    so clamping them directly to [0, 1] would zero every below-mean pixel.
    The noise is therefore applied in pixel space: de-normalise, add noise,
    clip to the valid intensity range [0, 1], normalise again.

    Args:
        x: normalised image tensor of shape (C, H, W) or (N, C, H, W).
        std: noise standard deviation in pixel units (images scaled to [0, 1]).

    Returns:
        New normalised tensor with the same shape as ``x``.
    """
    mean_c, std_c = _cifar_channel_stats(x)
    pixels = x * std_c + mean_c
    noisy = torch.clamp(pixels + torch.randn_like(pixels) * std, 0.0, 1.0)
    return (noisy - mean_c) / std_c


def add_sp(x, amt):
    """Apply salt-and-pepper noise to a normalised image tensor.

    A fraction ``amt`` of all entries is set to pixel intensity 0 or 1
    (chosen at random with NumPy's global RNG). The replacement happens in
    pixel space so that "salt" and "pepper" are the true extreme intensities
    rather than the literal values 0 / 1 of the normalised tensor.

    Args:
        x: normalised image tensor of shape (C, H, W) or (N, C, H, W).
        amt: fraction of entries to corrupt.

    Returns:
        New normalised tensor with the same shape as ``x``.
    """
    mean_c, std_c = _cifar_channel_stats(x)
    pixels = (x * std_c + mean_c).contiguous()
    flat = pixels.view(-1)  # view: writes below modify `pixels`
    idx = np.random.choice(len(flat), int(amt * len(flat)), replace=False)
    flat[idx] = torch.tensor(np.random.choice([0., 1.], len(idx))).to(flat.dtype)
    return (pixels - mean_c) / std_c


def apply_noise(x_tensor, kind, param):
    """Dispatch to the requested image-noise function.

    Args:
        x_tensor: normalised image tensor.
        kind: "gaussian", "s&p", or anything else for no noise.
        param: noise strength forwarded to the selected function.

    Returns:
        The noisy tensor, or ``x_tensor`` itself when ``kind`` is not recognised.
    """
    if kind == "gaussian":
        return add_gauss(x_tensor, param)
    if kind == "s&p":
        return add_sp(x_tensor, param)
    return x_tensor


class SimpleCNN_BN(nn.Module):
    """Small CNN: two conv + batch-norm + max-pool stages followed by two linear layers.

    Expects inputs of shape (N, 3, 32, 32): two 2x2 poolings reduce 32x32 to
    8x8, which matches the ``128*8*8`` input size of the first linear layer.
    The output has 10 logits per sample.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # initialisation consumes the RNG identically.
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 8 * 8, 256)
        self.fc2 = nn.Linear(256, 10)
        self.drop = nn.Dropout(.3)

    def forward(self, x):
        """Return class logits for a batch of images."""
        stage1 = self.pool(F.relu(self.bn1(self.conv1(x))))
        stage2 = self.pool(F.relu(self.bn2(self.conv2(stage1))))
        hidden = F.relu(self.fc1(stage2.flatten(1)))
        return self.fc2(self.drop(hidden))


def compute_metrics(y_true, y_pred):
    """Return ``(accuracy, macro precision, macro recall, macro F1)`` for the predictions.

    Classes without predictions contribute 0 instead of raising a warning
    (``zero_division=0``).
    """
    macro = {"average": "macro", "zero_division": 0}
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **macro)
    recall = recall_score(y_true, y_pred, **macro)
    f1 = f1_score(y_true, y_pred, **macro)
    return (accuracy, precision, recall, f1)

def evaluate_model(model, loader):
    """Run ``model`` over ``loader`` in eval mode and return the metrics of ``compute_metrics``.

    Args:
        model: network producing class logits; it is switched to eval mode.
        loader: iterable of ``(inputs, labels)`` batches with labels on the CPU.

    Returns:
        ``(accuracy, macro precision, macro recall, macro F1)``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch_x, batch_y in loader:
            logits = model(batch_x.to(device))
            predicted.extend(logits.argmax(1).cpu().numpy())
            expected.extend(batch_y.numpy())
    return compute_metrics(expected, predicted)

def crossval_metrics_cnn(noise_cfg=None):
    """Train and evaluate ``SimpleCNN_BN`` with 5-fold stratified cross-validation.

    Each fold trains on clean data for at most ``EPOCHS`` epochs with early
    stopping (``PATIENCE``) on the macro F1 of the fold's test set, restores
    the best weights and evaluates them on that test set.

    NOTE(review): the checkpoint is selected on the same fold that is used for
    the final score, so the reported metrics are optimistic — consider holding
    out a validation split from the training indices.

    Args:
        noise_cfg: optional dict with "img_noise" / "img_param" (test-image
            noise kind and strength) and "lbl_noise" / "lbl_param" (whether and
            how strongly to flip test labels). ``None`` means clean test data.

    Returns:
        ``(metrics, timing)``: ``metrics`` maps each metric name to a
        ``(mean, std)`` tuple over the folds; ``timing`` holds the per-fold mean
        "train_time" (training loop including the per-epoch evaluations),
        "pred_time" (final evaluation pass) and their sum as "total_time".
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    acc_list, prec_list, rec_list, f1_list = [], [], [], []
    train_times, pred_times = [], []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_raw, y_raw)):
        print(f"\nFold {fold+1}/5")
        X_train, y_train = X_raw[train_idx], y_raw[train_idx]
        X_test, y_test = X_raw[test_idx], y_raw[test_idx]

        # Noise is only ever applied to the test fold.
        if noise_cfg:
            if noise_cfg.get("img_noise"):
                X_test = torch.stack([apply_noise(x.clone(), noise_cfg["img_noise"], noise_cfg["img_param"]) for x in X_test])
            if noise_cfg.get("lbl_noise"):
                y_test = add_label_noise(y_test, noise_cfg["lbl_param"])

        train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
        test_loader  = DataLoader(TensorDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False)

        model = SimpleCNN_BN().to(device)
        opt = optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
        sch = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=EPOCHS)
        crit = nn.CrossEntropyLoss(label_smoothing=0.1)

        # Start below any reachable F1 so the first epoch always stores a checkpoint.
        best_f1, best_state, wait = -1.0, None, 0
        t0 = time.perf_counter()
        for ep in range(EPOCHS):
            model.train()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                opt.zero_grad()
                crit(model(xb), yb).backward()
                opt.step()
            sch.step()

            _, _, _, f1 = evaluate_model(model, test_loader)
            if f1 > best_f1:
                best_f1, wait = f1, 0
                # state_dict() returns references to the live parameters, which
                # later epochs overwrite in place; clone them to keep a real snapshot.
                best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
            else:
                wait += 1
                if wait >= PATIENCE:
                    break

        train_time = time.perf_counter() - t0
        if best_state is not None:
            model.load_state_dict(best_state)

        # evaluate_model copies every batch of predictions to the CPU, which
        # waits for the GPU, so the wall-clock measurement is complete.
        t_pred = time.perf_counter()
        acc, prec, rec, f1 = evaluate_model(model, test_loader)
        pred_time = time.perf_counter() - t_pred

        acc_list.append(acc)
        prec_list.append(prec)
        rec_list.append(rec)
        f1_list.append(f1)
        train_times.append(train_time)
        pred_times.append(pred_time)
        print(f"Fold {fold+1} - ACC={acc:.4f} F1={f1:.4f} Train={train_time:.1f}s")

    return {
        "accuracy": (np.mean(acc_list), np.std(acc_list)),
        "precision_macro": (np.mean(prec_list), np.std(prec_list)),
        "recall_macro": (np.mean(rec_list), np.std(rec_list)),
        "f1_macro": (np.mean(f1_list), np.std(f1_list))
    }, {
        "train_time": np.mean(train_times),
        "pred_time": np.mean(pred_times),
        "total_time": np.mean(train_times) + np.mean(pred_times)
    }

def format_cnn_results(cnn_raw_results):
    """Convert raw CNN result tuples into result dictionaries.

    Args:
        cnn_raw_results: iterable of ``(name, summary, timing)`` tuples.

    Returns:
        A list with one dict per input tuple, holding the keys ``"name"``,
        ``"params"`` (always ``"-"``), ``"metrics"`` (the summary object),
        ``"timing"`` (the timing object) and ``"precomp_time"`` (always 0.0).
    """
    return [
        {
            "name": label,
            "params": "-",
            "metrics": metric_summary,
            "timing": time_info,
            "precomp_time": 0.0,
        }
        for label, metric_summary, time_info in cnn_raw_results
    ]

# Scenarios evaluated for the CNN, as (display name, config) pairs.
# Each config is passed as-is to crossval_metrics_cnn below.
# NOTE(review): presumably img_param is the Gaussian std / salt-and-pepper
# amount and lbl_param the fraction of corrupted labels — confirm against
# crossval_metrics_cnn.
experiments = [
    ("RAW",       {"img_noise": None,       "img_param": None, "lbl_noise": False, "lbl_param": None}),
    ("Gauss 0.1", {"img_noise": "gaussian", "img_param": 0.1,  "lbl_noise": False, "lbl_param": None}),
    ("Gauss 0.2", {"img_noise": "gaussian", "img_param": 0.2,  "lbl_noise": False, "lbl_param": None}),
    ("Gauss 0.3", {"img_noise": "gaussian", "img_param": 0.3,  "lbl_noise": False, "lbl_param": None}),
    ("S&P 0.05",  {"img_noise": "s&p",      "img_param": 0.05, "lbl_noise": False, "lbl_param": None}),
    ("S&P 0.10",  {"img_noise": "s&p",      "img_param": 0.10, "lbl_noise": False, "lbl_param": None}),
    ("S&P 0.20",  {"img_noise": "s&p",      "img_param": 0.20, "lbl_noise": False, "lbl_param": None}),
    ("Label 10%", {"img_noise": None,       "img_param": None, "lbl_noise": True,  "lbl_param": 0.1}),
    ("Label 20%", {"img_noise": None,       "img_param": None, "lbl_noise": True,  "lbl_param": 0.2}),
    ("Label 30%", {"img_noise": None,       "img_param": None, "lbl_noise": True,  "lbl_param": 0.3}),
]

# Run the cross-validated CNN once per scenario in `experiments`, print a
# one-line summary for each, and collect (name, summary, timing) tuples.
print("\n=== Testowanie CNN ===")
cnn_raw_results = []
for name, cfg in experiments:
    print(f"\n{name}")
    metrics, timing = crossval_metrics_cnn(cfg)
    # Render every (mean, std) pair as a "mean ± std" string with 4 decimals.
    summary = {
        metric: f"{mean:.4f} ± {std:.4f}"
        for metric, (mean, std) in metrics.items()
    }
    print(" | ".join((
        f"Acc: {summary['accuracy']}",
        f"Prec: {summary['precision_macro']}",
        f"Rec: {summary['recall_macro']}",
        f"F1: {summary['f1_macro']}",
        f"Train: {timing['train_time']:.1f}s",
    )))
    cnn_raw_results.append((name, summary, timing))

# Bring the raw tuples into the dictionary layout used for the results table.
cnn_results = format_cnn_results(cnn_raw_results)

# Fixed-width text table: one header line, a separator, one line per scenario.
print("\nTabela wyników CNN:")
print(" | ".join((
    f"{'Metoda':<35}",
    f"{'Parametry':<10}",
    f"{'Accuracy':<20}",
    f"{'Precision_macro':<20}",
    f"{'Recall_macro':<20}",
    f"{'F1_macro':<20}",
    f"{'Train [s]':<10}",
    f"{'Pred [s]':<10}",
)))
print("-" * 160)
for r in cnn_results:
    print(" | ".join((
        f"{r['name']:<35}",
        f"{r['params']:<10}",
        f"{r['metrics']['accuracy']:<20}",
        f"{r['metrics']['precision_macro']:<20}",
        f"{r['metrics']['recall_macro']:<20}",
        f"{r['metrics']['f1_macro']:<20}",
        f"{r['timing']['train_time']:<10.2f}",
        f"{r['timing']['pred_time']:<10.2f}",
    )))



GPU: cuda

=== Testowanie CNN ===

RAW

Fold 1/5
Fold 1 - ACC=0.7798 F1=0.7791 Train=192.5s

Fold 2/5
Fold 2 - ACC=0.7612 F1=0.7638 Train=105.6s

Fold 3/5
Fold 3 - ACC=0.7613 F1=0.7626 Train=107.9s

Fold 4/5
Fold 4 - ACC=0.7691 F1=0.7691 Train=141.0s

Fold 5/5
Fold 5 - ACC=0.7782 F1=0.7780 Train=191.3s
Acc: 0.7699 ± 0.0080 | Prec: 0.7743 ± 0.0035 | Rec: 0.7699 ± 0.0080 | F1: 0.7705 ± 0.0069 | Train: 147.7s

Gauss 0.1

Fold 1/5
Fold 1 - ACC=0.3703 F1=0.3571 Train=134.7s

Fold 2/5
Fold 2 - ACC=0.3565 F1=0.3233 Train=57.1s

Fold 3/5
Fold 3 - ACC=0.3359 F1=0.3072 Train=67.7s

Fold 4/5
Fold 4 - ACC=0.3669 F1=0.3482 Train=124.7s

Fold 5/5
Fold 5 - ACC=0.3540 F1=0.3284 Train=69.9s
Acc: 0.3567 ± 0.0121 | Prec: 0.5561 ± 0.0212 | Rec: 0.3567 ± 0.0121 | F1: 0.3328 ± 0.0178 | Train: 90.8s

Gauss 0.2

Fold 1/5
Fold 1 - ACC=0.2779 F1=0.2370 Train=77.3s

Fold 2/5
Fold 2 - ACC=0.3113 F1=0.2990 Train=82.4s

Fold 3/5
Fold 3 - ACC=0.2521 F1=0.2292 Train=83.6s

Fold 4/5
Fold 4 - ACC=0.3336 F1=0.3238 Train

# Zapis wyników

In [None]:
import pandas as pd

def save_results_to_excel(knn_results, rf_results, svm_results, cnn_results, filename="results.xlsx"):
    """Flatten the per-model result lists into one table and write it to Excel.

    Each entry of every result list must be a dict with the keys ``"name"``,
    ``"metrics"`` (containing ``"accuracy"``, ``"precision_macro"``,
    ``"recall_macro"``, ``"f1_macro"``) and ``"timing"`` (containing
    ``"train_time"`` and ``"pred_time"``). ``"params"`` and ``"precomp_time"``
    are optional and default to ``"-"`` and ``0.0``.

    Args:
        knn_results: result dicts reported under the model name "kNN".
        rf_results: result dicts reported under "Random Forest".
        svm_results: result dicts reported under "SVM".
        cnn_results: result dicts reported under "CNN".
        filename: target path passed to ``DataFrame.to_excel``.

    Returns:
        None. The table is written to ``filename`` without the index column
        and a confirmation message is printed.
    """
    labelled_groups = (
        ("kNN", knn_results),
        ("Random Forest", rf_results),
        ("SVM", svm_results),
        ("CNN", cnn_results),
    )

    # One flat record per (model, experiment) pair, in the order given above.
    records = [
        {
            "Model": model_label,
            "Metoda": entry["name"],
            "Parametry": entry.get("params", "-"),
            "Accuracy": entry["metrics"]["accuracy"],
            "Precision_macro": entry["metrics"]["precision_macro"],
            "Recall_macro": entry["metrics"]["recall_macro"],
            "F1_macro": entry["metrics"]["f1_macro"],
            "Train_time": entry["timing"]["train_time"],
            "Pred_time": entry["timing"]["pred_time"],
            "Precomp_time": entry.get("precomp_time", 0.0),
        }
        for model_label, entries in labelled_groups
        for entry in entries
    ]

    table = pd.DataFrame(records)
    table.to_excel(filename, index=False)
    print(f"\n Wyniki zostały zapisane do pliku: {filename}")


In [None]:

# Export the results of all four model families to the default file, results.xlsx.
# NOTE(review): knn_results, rf_results and svm_results are not defined in this
# part of the notebook — presumably created by earlier cells; confirm they exist
# after a fresh Restart & Run All.
save_results_to_excel(knn_results, rf_results, svm_results, cnn_results)



 Wyniki zostały zapisane do pliku: results.xlsx
