# **Pipeline**

## **1. Introduction**

This section imports all the libraries needed by the project and defines the
paths and parameters (including which data split to use) that the rest of the
notebook relies on.

In [1]:
import os
import cv2
import sys
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make the project's helper modules importable. The path is built with
# os.path.join (instead of a hard-coded "..\\Scripts") so that the notebook
# also runs on systems whose path separator is not a backslash.
sys.path.insert(0, os.path.join("..", "Scripts"))

from tqdm import tqdm
from torchvision import transforms
from torch.utils.data import DataLoader
from ImagePipeline import ImagePipeline
from ImageDatastore import ImageDatastore
from Utils import create_or_clear_directory
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from NeuralFeatureExtractor import MobileNetFeatureExtractor
from sklearn.metrics import accuracy_score, top_k_accuracy_score

### **Params**

In [11]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Use half of the logical cores for data loading. os.cpu_count() returns None
# when the count cannot be determined, so fall back to one core in that case
# (which gives 0 workers, i.e. loading happens in the main process).
num_workers = (os.cpu_count() or 1) // 2

print("Device:", device)
print("Number of workers:", num_workers)

Device: cpu
Number of workers: 4


In [12]:
# Number of images per batch handed to the feature extractor.
batch_size = 512

# Name of the training split passed to ImageDatastore.
type_of_train = "train_unlabeled"

# True: recompute the features and overwrite the cached arrays.
# False: load the arrays saved by a previous run.
extract_features = True

# Preprocessing steps, applied to every image in the order listed.
preprocessing_steps = [
    # An int size rescales the shorter side to 232 px and keeps the aspect ratio.
    transforms.Resize(232, interpolation=transforms.InterpolationMode.BILINEAR),
    # With zero padding on all four sides this step adds no pixels as configured.
    transforms.Pad(padding=(0, 0, 0, 0), fill=0),
    # Square 232x232 crop around the image centre.
    transforms.CenterCrop((232, 232)),
    # Per-channel standardisation; the values match the usual ImageNet statistics.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)

In [13]:
# Feature extractor shared by all splits below (project class imported from
# NeuralFeatureExtractor); its compute_features() is called once per DataLoader.
extractor = MobileNetFeatureExtractor()

In [14]:
# When True, the "Clean the dataset" cell rebuilds the preprocessed copies of
# the degraded validation images and rewrites val_deg_process.csv.
preprocess_image = False
# NOTE(review): not read anywhere in the visible notebook — presumably selects
# the preprocessing variant; confirm it is still needed.
type_of_preprocess = "neural"

### **Utils**

In [15]:
def save_features(x, y, filename):
    """Store a feature array and its label array as two ``.npy`` files.

    The features are written to ``../Features/features/<filename>`` and the
    labels to ``../Features/labels/<filename>``. Both folders are created when
    they do not exist yet, so the first run on a fresh checkout does not fail
    with ``FileNotFoundError``.

    Args:
        x: Array with the extracted features.
        y: Array with the labels belonging to ``x``.
        filename: File name used inside both folders (e.g. ``"val_features.npy"``).

    Raises:
        ValueError: Propagated from ``np.save`` when an array contains Python
            objects, because pickling is disabled on purpose.
    """
    for subfolder, array in (("features", x), ("labels", y)):
        target_dir = os.path.join("..", "Features", subfolder)
        # np.save does not create missing parent directories itself.
        os.makedirs(target_dir, exist_ok=True)
        np.save(os.path.join(target_dir, filename), array, allow_pickle=False)


def load_features(filename):
    """Load a feature array and its label array saved under ``../Features``.

    Args:
        filename: File name shared by ``../Features/features/<filename>`` and
            ``../Features/labels/<filename>``.

    Returns:
        Tuple ``(x, y)`` with the feature array and the label array.
    """
    base_dir = os.path.join("..", "Features")
    features_path = os.path.join(base_dir, "features", filename)
    labels_path = os.path.join(base_dir, "labels", filename)

    # Pickle support stays disabled: only plain numeric/string arrays are accepted.
    features = np.load(features_path, allow_pickle=False)
    labels = np.load(labels_path, allow_pickle=False)
    return features, labels

## **2. Dataset**

Create the dataloaders and the dataset that will be used in the project.

In [16]:
# Training split, preprocessed with the shared `transform`.
train_set = ImageDatastore(type_of_train, transform=transform)
train_set_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    # Page-locked host memory only speeds up host-to-GPU copies; on a CPU-only
    # run requesting it is useless and makes PyTorch emit a warning.
    pin_memory=(device == "cuda"),
)

In [17]:
# Clean validation split, preprocessed with the shared `transform`.
val_set = ImageDatastore("val_set", transform=transform)
val_set_loader = DataLoader(
    val_set,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    # Page-locked host memory only speeds up host-to-GPU copies; on a CPU-only
    # run requesting it is useless and makes PyTorch emit a warning.
    pin_memory=(device == "cuda"),
)

In [18]:
# Degraded validation split ("val_degradate_clear"), preprocessed with the
# shared `transform`.
val_deg = ImageDatastore("val_degradate_clear", transform=transform)
val_deg_loader = DataLoader(
    val_deg,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    # Page-locked host memory only speeds up host-to-GPU copies; on a CPU-only
    # run requesting it is useless and makes PyTorch emit a warning.
    pin_memory=(device == "cuda"),
)

## **3. Feature extraction**

Extract the features of every split with the feature extractor created in the params.
When `extract_features` is `True` the features are recomputed and saved to disk;
otherwise the arrays saved by a previous run are loaded.

In [19]:
# File name of the cached training features/labels.
train_features_file = "train_features_retrieval.npy"

if not extract_features:
    # Reuse the arrays stored by an earlier run.
    x_train, y_train = load_features(train_features_file)
else:
    # Recompute from the images and refresh the cache.
    x_train, y_train = extractor.compute_features(train_set_loader)
    save_features(x_train, y_train, train_features_file)

x_train.shape, y_train.shape

100%|██████████| 19/19 [03:56<00:00, 12.47s/it]


((9244, 1280), (9244,))

In [20]:
# File name of the cached validation features/labels.
val_features_file = "val_features.npy"

if not extract_features:
    # Reuse the arrays stored by an earlier run.
    x_val, y_val = load_features(val_features_file)
else:
    # Recompute from the images and refresh the cache.
    x_val, y_val = extractor.compute_features(val_set_loader)
    save_features(x_val, y_val, val_features_file)

x_val.shape, y_val.shape

100%|██████████| 24/24 [05:28<00:00, 13.71s/it]


((11994, 1280), (11994,))

In [21]:
# File name of the cached degraded-validation features/labels.
val_deg_features_file = "val_deg_features.npy"

if not extract_features:
    # Reuse the arrays stored by an earlier run.
    x_val_deg, y_val_deg = load_features(val_deg_features_file)
else:
    # Recompute from the images and refresh the cache.
    x_val_deg, y_val_deg = extractor.compute_features(val_deg_loader)
    save_features(x_val_deg, y_val_deg, val_deg_features_file)

x_val_deg.shape, y_val_deg.shape

100%|██████████| 20/20 [04:18<00:00, 12.91s/it]


((9945, 1280), (9945,))

## **4. Models**

Use the features extracted in the previous step to train the models.

### **4.1 k-NN Classifier**

In [22]:
# k-NN on the extracted features: 51 neighbours under the cosine metric, each
# vote weighted by the inverse of its distance; n_jobs=-1 uses every core.
knn_settings = {
    "n_neighbors": 51,
    "weights": "distance",
    "metric": "cosine",
    "n_jobs": -1,
}
knn = KNeighborsClassifier(**knn_settings)
knn.fit(x_train, y_train)

### **4.2 Random Forest Classifier**

In [23]:
# A fixed seed makes the forest (bootstrap samples and feature sub-sampling)
# identical on every "Restart & Run All", so the scores reported below can be
# reproduced. Without it every run trains a different forest.
rf = RandomForestClassifier(
    criterion="gini", n_estimators=100, n_jobs=-1, random_state=42
)
rf.fit(x_train, y_train)

## **5. Evaluation**

Evaluate the models on the validation set using top-1 accuracy and top-5 accuracy.

### **5.1 k-NN Classifier**

In [24]:
# Top-1 accuracy of the k-NN classifier on the clean validation set.
knn_pred = knn.predict(x_val)
print(f"Val: {accuracy_score(y_val, knn_pred)}")

# Top-5 accuracy. The columns of predict_proba follow knn.classes_, so that
# ordering is passed explicitly; otherwise the metric infers the classes from
# y_val and breaks when a class seen in training is absent from this split.
knn_score = knn.predict_proba(x_val)
knn_top5 = top_k_accuracy_score(y_val, knn_score, k=5, labels=knn.classes_)
print(f"Val top 5: {knn_top5}")

Val: 0.29389694847423714
Val top 5: 0.5610305152576288


### **5.2 Random Forest Classifier**

In [25]:
# Top-1 accuracy of the random forest on the clean validation set.
rf_pred = rf.predict(x_val)
print(f"Val: {accuracy_score(y_val, rf_pred)}")

# Top-5 accuracy. The columns of predict_proba follow rf.classes_, so that
# ordering is passed explicitly; otherwise the metric infers the classes from
# y_val and breaks when a class seen in training is absent from this split.
rf_score = rf.predict_proba(x_val)
rf_top5 = top_k_accuracy_score(y_val, rf_score, k=5, labels=rf.classes_)
print(f"Val top 5: {rf_top5}")

Val: 0.21677505419376356
Val top 5: 0.4093713523428381


## **6. Evaluation Degraded**

In this section, we will evaluate the models using the degraded dataset.

### **6.1 Clean the dataset**

In [17]:
if preprocess_image:

    # Project helper; judging by its name it leaves an empty output folder
    # behind — TODO confirm against Utils.
    create_or_clear_directory(os.path.join("..", "Dataset", "Preprocessed"))

    # (image name, label) of every image that was actually written to disk.
    records = []

    pipe = ImagePipeline(knn, extractor)

    for row in tqdm(
        val_deg.labels.itertuples(index=False),
        total=len(val_deg.labels),
        file=sys.stdout,
    ):
        img_path = os.path.join("..", "Dataset", "val_set_degraded", row.Image)

        if not os.path.exists(img_path):
            print(f"Warning: File {img_path} not found!", file=sys.stderr)
            continue

        # cv2.imread reports an unreadable or corrupt file by returning None
        # instead of raising, so check before handing the image to the pipeline.
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: File {img_path} could not be read!", file=sys.stderr)
            continue

        # NOTE(review): the meaning of the (20, 70) argument is defined by
        # ImagePipeline.preprocess and is not visible here.
        post_img = pipe.preprocess(img, (20, 70))

        # Anything that is not an image array is left out of the cleaned set.
        if not isinstance(post_img, np.ndarray):
            continue

        # Save the preprocessed image; cv2.imwrite returns False on failure, and
        # an image that was not written must not be listed in the CSV.
        save_path = os.path.join("..", "Dataset", "Preprocessed", row.Image)
        if not cv2.imwrite(save_path, post_img):
            print(f"Warning: Could not write {save_path}!", file=sys.stderr)
            continue

        # Collect rows in a list and build the DataFrame once after the loop.
        records.append((row.Image, row.Label))

    # NOTE(review): the column is written as "label" although the source table
    # exposes "Label" — confirm which spelling the reader of this CSV expects.
    df = pd.DataFrame(records, columns=["Image", "label"])
    df.to_csv(os.path.join("..", "Dataset", "val_deg_process.csv"), index=False)

### **6.2 k-NN Classifier**

In [26]:
# Top-1 accuracy of the k-NN classifier on the degraded validation set.
knn_pred_deg = knn.predict(x_val_deg)
print(f"Deg: {accuracy_score(y_val_deg, knn_pred_deg)}")

# Top-5 accuracy. The columns of predict_proba follow knn.classes_, so that
# ordering is passed explicitly; otherwise the metric infers the classes from
# y_val_deg and breaks when a class seen in training is absent from this split.
knn_score_deg = knn.predict_proba(x_val_deg)
knn_top5_deg = top_k_accuracy_score(
    y_val_deg, knn_score_deg, k=5, labels=knn.classes_
)
print(f"Deg top 5: {knn_top5_deg}")

Deg: 0.24012066365007542
Deg top 5: 0.4772247360482655


### **6.3 Random Forest Classifier**

In [27]:
# Top-1 accuracy of the random forest on the degraded validation set. The
# output is labelled like the k-NN cell so the numbers can be told apart.
rf_pred_deg = rf.predict(x_val_deg)
print(f"Deg: {accuracy_score(y_val_deg, rf_pred_deg)}")

# Top-5 accuracy. The columns of predict_proba follow rf.classes_, so that
# ordering is passed explicitly; otherwise the metric infers the classes from
# y_val_deg and breaks when a class seen in training is absent from this split.
rf_score_deg = rf.predict_proba(x_val_deg)
rf_top5_deg = top_k_accuracy_score(
    y_val_deg, rf_score_deg, k=5, labels=rf.classes_
)
print(f"Deg top 5: {rf_top5_deg}")

0.16520864756158873
0.3207642031171443
