In [1]:
import numpy as np
import pandas as pd
from torchvision.models import resnet50
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import shutil
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit


# Classification using KNN small training + 10% unlabeled training

In [30]:
def fit_knn(n_neighbors, dataset, validation, standardize = True, print_conf_matrix=True, print_class_report = True, weights='uniform'):
    std_pipeline = Pipeline([
        ('scaler', StandardScaler()),              # Standardization step
        ('knn', KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)) 
    ])

    pipeline = Pipeline([
        ('knn', KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights))  
    ])

    if standardize:
        std_pipeline.fit(dataset[0], dataset[1])
        predictions = std_pipeline.predict(validation[0])
    else:
        pipeline.fit(dataset[0], dataset[1])
        predictions = pipeline.predict(validation[0])

    if print_conf_matrix:
        cm = confusion_matrix(validation[1], predictions, labels=range(251))

        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        # Set the figure size
        plt.figure(figsize=(20, 15))

        # Plot using Seaborn's heatmap
        sns.heatmap(
            cm_normalized,
            cmap='viridis',
            cbar=True,
            square=False,  # Use rectangular cells for better fit
            xticklabels=25,  # Show only 25 x-axis labels to avoid clutter
            yticklabels=25  # Show only 25 y-axis labels to avoid clutter
        )

        # Rotate and set axis labels
        plt.xlabel('Predicted Labels', fontsize=14)
        plt.ylabel('True Labels', fontsize=14)
        plt.title('Normalized Confusion Matrix (250 Classes)', fontsize=18)
        plt.xticks(rotation=45, fontsize=10)
        plt.yticks(fontsize=10)

        # Display the plot
        plt.show()
    
    if print_class_report:
        cr = classification_report(validation[1], predictions, labels=range(251), zero_division=0)
        print("\nClassification Report:")
        print(cr)
    
    if standardize:
        std_pipeline.fit(np.concatenate([dataset[0], validation[0]]), 
                         np.concatenate([dataset[1], validation[1]]))
        return std_pipeline
    else:
        pipeline.fit(np.concatenate([dataset[0], validation[0]]), 
                     np.concatenate([dataset[1], validation[1]]))
        return pipeline
    

In [2]:
dataset = np.load('./dataset/features_extended_10.npz')

In [None]:
knn = fit_knn(9, (dataset['X_train'], dataset['y_train']), (dataset['X_val'], dataset['y_val']), standardize=False, print_conf_matrix=False)

In [3]:
test_dataset = np.load("./dataset/test_features_resnet50.npz")
X_test = test_dataset['X']
y_test = test_dataset['y']

In [41]:
y_pred = knn.predict(X_test)

In [None]:
cr = classification_report(y_test, y_pred, labels=range(251), zero_division=0)
print("\nClassification Report:")
print(cr)

# SVM NO

In [4]:




def fit_svm(dataset, validation, print_conf_matrix=True, print_class_report = True):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),      # Step 1: Standardize the data
        ('svc', SVC())                     # Step 2: Apply SVC model
    ])

    # 4. Set up the parameter grid for GridSearchCV
    param_grid = {
        'scaler': [StandardScaler(), None],          # Include 'None' for no scaling
        'svc__kernel': ['linear', 'rbf'],             # Testing linear and RBF kernels
        'svc__C': [0.1, 1, 10],                       # Regularization parameter
        'svc__gamma': ['scale', 'auto']          # Kernel coefficient
    }

    X_grid = np.concatenate([dataset[0], validation[0]])
    y_grid = np.concatenate([dataset[1], validation[1]])

    predefined_split = PredefinedSplit(test_fold=np.concatenate([np.zeros(len(dataset[0])), np.ones(len(validation[0]))]))

    grid_search = GridSearchCV(pipeline, param_grid, cv=predefined_split, verbose=1, n_jobs=-1)
    grid_search.fit(X_grid, y_grid)


    # Best parameters found by GridSearchCV
    print("Best Parameters:", grid_search.best_params_)

    # Use the best model from grid search
    best_svc_model = grid_search.best_estimator_

    best_svc_model.fit(dataset[0], dataset[1])
    predictions = best_svc_model.predict(validation[0])

    if print_conf_matrix:
        cm = confusion_matrix(validation[1], predictions, labels=range(251))

        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        # Set the figure size
        plt.figure(figsize=(20, 15))

        # Plot using Seaborn's heatmap
        sns.heatmap(
            cm_normalized,
            cmap='viridis',
            cbar=True,
            square=False,  # Use rectangular cells for better fit
            xticklabels=25,  # Show only 25 x-axis labels to avoid clutter
            yticklabels=25  # Show only 25 y-axis labels to avoid clutter
        )

        # Rotate and set axis labels
        plt.xlabel('Predicted Labels', fontsize=14)
        plt.ylabel('True Labels', fontsize=14)
        plt.title('Normalized Confusion Matrix (250 Classes)', fontsize=18)
        plt.xticks(rotation=45, fontsize=10)
        plt.yticks(fontsize=10)

        # Display the plot
        plt.show()
    
    if print_class_report:
        cr = classification_report(validation[1], predictions, labels=range(251), zero_division=0)
        print("\nClassification Report:")
        print(cr)
    
    best_svc_model.fit(np.concatenate([dataset[0], validation[0]]), 
                        np.concatenate([dataset[1], validation[1]]))
    return best_svc_model
    

In [5]:
svm = fit_svm((dataset['X_train'], dataset['y_train']), (dataset['X_val'], dataset['y_val']), print_conf_matrix=False)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


: 

In [53]:
y_pred = svm.predict(X_test)
cr = classification_report(y_test, y_pred, labels=range(251), zero_division=0)
print("\nClassification Report:")
print(cr)

KeyboardInterrupt: 