# Information Retrieval Project - Baseline System

The code below implements a baseline retrieval system: for each query image it returns randomly selected training images. It serves as a reference point against which the other implementations are compared.


# Installation of Requirements

This part of the code installs the dependencies listed in the `requirements.txt` file.
It runs the `!pip install -r requirements.txt` command so that every required package is available before the rest of the notebook is executed.


In [None]:
!pip install -r requirements.txt

# Importing Required Libraries

This part of the code imports necessary libraries.

In [1]:
# Import
import os
import kaggle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split


# Definition of constants

This part of the code defines useful constants and paths.


In [2]:
# ************** CONSTANTS ************* #


# DATA RELATED
CLASSES = ["COVID", "non-COVID"]  # Class names; each one is also the name of its dataset sub-folder
BASELINE = "baseline_system"  # Name of this system, used as prefix of the evaluation file

# PATHS
DATASET_PATH = "archives"  # Folder holding the dataset
DATASET_ID = "plameneduardo/sarscov2-ctscan-dataset"  # ID for Kaggle API. Format: dataset_owner_name/dataset_name
# One sub-folder per class, in the same order as CLASSES (COVID first, non-COVID second)
COVID_PATH, NON_COVID_PATH = (os.path.join(DATASET_PATH, class_name) for class_name in CLASSES)
DATA_PATH = "data"  # Folder for the generated data files
PLOT_PATH = "plot"  # Folder for the generated plots

# SIMILARITY RELATED
TOP_N = 100  # Number of images retrieved
K_VALUES = [5, *range(10, TOP_N + 1, 10)]  # K ranking values: 5, then every multiple of 10 up to TOP_N


**Dataset Download Function**:
    - `download_dataset_from_kaggle()` checks if the dataset exists locally; if not, it downloads and unzips it using the Kaggle API.

In [None]:
# Fetch the dataset through the Kaggle API unless it is already on disk
def download_dataset_from_kaggle(dataset_id: str, dataset_path: str) -> None:
    """
    Download and unzip a Kaggle dataset, skipping the download when the target path exists.

    :param dataset_id: Kaggle identifier of the dataset.
        Format: dataset_owner_name/dataset_name
    :param dataset_path: location where the dataset is stored

    :return: None
    """
    # Any existing path is taken as proof of a previous download
    if os.path.exists(dataset_path):
        print("\n> Dataset already downloaded.")
        return

    print("\n> Download the dataset from Kaggle...")
    # Ask the Kaggle API for the files and have them unzipped into dataset_path
    kaggle.api.dataset_download_files(dataset=dataset_id, path=dataset_path, quiet=False, unzip=True)
        
        
## ************************************ DOWNLOAD THE DATASET ********************************** ##


# Run the download for the dataset identified by DATASET_ID, storing it under DATASET_PATH.
# The function prints a message and does nothing else when DATASET_PATH already exists.
download_dataset_from_kaggle(dataset_id=DATASET_ID, dataset_path=DATASET_PATH)


**DataFrame Construction**:
- `build_dataframe()` creates a DataFrame from the dataset, with file paths and corresponding class labels.

**Random Index Selection**:
- `get_random_indices()` returns a specified number of random indices from the training dataset.

**Evaluation Metrics Calculation**:
- `compute_evaluation_metrics()` computes precision, recall, F1-score, and Discounted Cumulative Gain (DCG) based on the labels of the retrieved images compared to the query image's label.

**Average Model Performance Evaluation**:
- `get_average_model_performance()` iterates over the test dataset, retrieves random images from the training set, and computes the average precision, recall, F1-score, and DCG for various values of `k`.

**Execution**:
- The dataset is downloaded and a DataFrame is built from the images.
- The dataset is split into training and test sets.
- The baseline system's performance is evaluated by comparing the retrieved images with the query images.
- The average performance metrics are printed.

In [4]:
# Define a pandas dataframe
def build_dataframe(dataset_path: str) -> pd.DataFrame:
    """
    Build a dataframe from dataset.
    
    :param dataset_path: Path to the dataset directory.
    
    :return: DataFrame containing file paths and corresponding class labels.
    """
    # Generate a list of tuples containing file paths and their corresponding class labels
    data = [
        (os.path.join(dataset_path, directory_name, file), class_label)
        for class_label, directory_name in enumerate(CLASSES)
        for file in os.listdir(os.path.join(dataset_path, directory_name))
    ]
    # Create a DataFrame from the list of tuples with appropriate column names
    data_df = pd.DataFrame(data, columns=["file_paths", "labels"])
    
    return data_df


# Draw n random row indices from the train dataset
def get_random_indices(train_dataframe: pd.DataFrame, top_n: int=TOP_N) -> list:
    """
    Pick N rows of the train set at random and return their index labels.

    No random_state is passed to the sampling call, so the selection is not tied to a fixed seed here.

    :param train_dataframe: DataFrame containing the data.
    :param top_n: Number of random indices to return.

    :return: List with the index labels of the N sampled rows
    """
    # Sample the rows first, then keep only their index labels
    sampled_rows = train_dataframe.sample(n=top_n)
    sampled_index = sampled_rows.index
    return sampled_index.tolist()


# Evaluation of the model performance
def compute_evaluation_metrics(query_img_label: int, image_labels_retrieved: np.ndarray, k: int) -> tuple[float, float, float, float]:
    """
    Score the first k retrieved images against the label of the query image.

    A retrieved image is relevant when its label equals the query label.
    Recall is computed relative to the relevant items found in the whole retrieved list
    (not in the whole collection), so it reaches 1.0 once k covers the full list.

    :param query_img_label: Label of the query image.
    :param image_labels_retrieved: Labels of the Top-N retrieved images, in ranking order.
        Any sequence convertible to a NumPy array is accepted.
    :param k: The number of top items to consider. Must be positive.

    :return: precision, recall, F1-score and Discounted Cumulative Gain (DCG) at k.
    :raises ValueError: if k is not positive.
    """
    # k == 0 would divide by zero and a negative k would slice from the end of the ranking
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Convert first: comparing a plain list with a scalar is not element-wise and would yield all-zero metrics
    retrieved_labels = np.asarray(image_labels_retrieved)

    # Relevance (True for relevant, False for non-relevant) of every retrieved item, computed once
    relevance_mask = retrieved_labels == query_img_label
    top_k_relevance = relevance_mask[:k]

    # Count of the relevant items among the first k
    k_relevant_retrieved = int(np.sum(top_k_relevance))
    # Compute Precision score
    precision_score = k_relevant_retrieved / k

    # Count of all the relevant items in the Top N images retrieved
    total_relevant_items = int(np.sum(relevance_mask))
    # Compute Recall score
    recall_score = k_relevant_retrieved / total_relevant_items if total_relevant_items else 0.0

    # Compute F1-score
    if precision_score + recall_score > 0.0:
        f1_score = 2 * (precision_score * recall_score) / (precision_score + recall_score)
    else:
        f1_score = 0.0

    # Compute DCG: the item at rank i (1-based) is discounted by log2(i + 1)
    if top_k_relevance.size:
        discounts = np.log2(np.arange(2, top_k_relevance.size + 2))
        dcg = float(np.sum(top_k_relevance / discounts))
    else:
        dcg = 0.0

    return precision_score, recall_score, f1_score, dcg


# Evaluation performed on all the element of the test set
def get_average_model_performance(query_dataset: pd.DataFrame, training_dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Evaluate the baseline on every query and average the metrics for each K in K_VALUES.

    For each row of the query dataset a random set of training rows is retrieved, and
    precision, recall, F1-score and DCG are computed at every K. The averaged values are
    also written to "<BASELINE>_evaluation.csv" inside DATA_PATH.

    :param query_dataset: DataFrame containing the test dataset ("file_paths" and "labels" columns).
    :param training_dataset: DataFrame containing the training dataset ("file_paths" and "labels" columns).

    :return: DataFrame indexed by K with columns avg_precision, avg_recall, avg_f1 and avg_dcg.
    """
    metric_names = ("precision", "recall", "f1", "dcg")

    # Dictionary to collect data: one list of per-query scores for every K and metric
    metrics = {k: {name: [] for name in metric_names} for k in K_VALUES}

    print("\n> Computing evaluation metrics for each test sample...\n")
    # Iterate through each query (file path and label) in the test dataset
    for query_file_path, query_label in zip(query_dataset["file_paths"], query_dataset["labels"]):
        # Print query name
        query_file_name = os.path.splitext(os.path.basename(query_file_path))[0]
        print(f"\n-- Processing: {query_file_name} - label: {query_label}")

        # Retrieve random images; look the sampled rows up once and reuse them below
        similar_index = get_random_indices(train_dataframe=training_dataset)
        retrieved_rows = training_dataset.loc[similar_index]

        # Print retrieved random files name.
        # The join is done outside the f-string: reusing the enclosing quote inside a
        # replacement field is a SyntaxError before Python 3.12
        retrieved_file_names = [
            os.path.splitext(os.path.basename(file_path))[0]
            for file_path in retrieved_rows["file_paths"]
        ]
        joined_file_names = ", ".join(retrieved_file_names)
        print(f"-- Images retrieved: {joined_file_names}")

        # Extract labels of the retrieved images
        similar_label = retrieved_rows["labels"].to_numpy()
        print(f"-- Labels: {similar_label}")

        # Loop through K
        for k in K_VALUES:
            # Compute precision, recall, F1-score and DCG (returned in the order of metric_names)
            scores = compute_evaluation_metrics(query_img_label=query_label, image_labels_retrieved=similar_label, k=k)
            # Append scores to Dictionary
            for name, score in zip(metric_names, scores):
                metrics[k][name].append(score)

    print("\n> Process completed!")

    # Compute metrics average and build a dataframe
    avg_metrics = {
        k: {f"avg_{name}": np.mean(metrics[k][name]) for name in metric_names}
        for k in K_VALUES
    }
    df_avg_metrics = pd.DataFrame.from_dict(avg_metrics, orient="index")
    df_avg_metrics.index.name = "K"

    # Save the Dataframe as CSV; make sure the target folder exists, since to_csv does not create it
    os.makedirs(DATA_PATH, exist_ok=True)
    metrics_path = os.path.join(DATA_PATH, f"{BASELINE}_evaluation.csv")
    df_avg_metrics.to_csv(path_or_buf=metrics_path, float_format="%.4f")

    return df_avg_metrics


In [None]:
# ******************************* BASELINE SYSTEM - PERFORMANCE EVALUATION ***************************** ##


# Single seed used for both the split and the random retrieval, so that the reported
# numbers can be reproduced from one run to the next
RANDOM_SEED = 4

# Build the dataframe
dataset_df = build_dataframe(dataset_path=DATASET_PATH)

# Splitting dataset into Training and Test
train_df, test_df = train_test_split(dataset_df, test_size=0.2, shuffle=True, random_state=RANDOM_SEED)

# The baseline samples rows through pandas without an explicit random_state, which falls back
# to NumPy's global random generator: seed it before the evaluation starts
np.random.seed(RANDOM_SEED)

# Compute Average Classification metrics values
df = get_average_model_performance(
    query_dataset=test_df,
    training_dataset=train_df
)


In [7]:
# Show the averaged metrics of the baseline, one row per K
print("\n> BASELINE SYSTEM AVERAGE PERFORMANCE:")
performance_table = df.to_string()
print(performance_table)



> BASELINE SYSTEM AVERAGE PERFORMANCE:
     avg_precision  avg_recall    avg_f1    avg_dcg
K                                                  
5         0.471630    0.047543  0.086305   1.394764
10        0.481288    0.097100  0.161371   2.176542
20        0.494668    0.199354  0.283620   3.445352
30        0.493159    0.297944  0.370630   4.485812
40        0.494416    0.398063  0.439998   5.446419
50        0.497304    0.500408  0.497664   6.365891
60        0.496244    0.599352  0.541657   7.209804
70        0.496465    0.699727  0.579482   8.032058
80        0.496756    0.799953  0.611526   8.829484
90        0.496982    0.900452  0.639063   9.604945
100       0.496821    1.000000  0.662421  10.356471
