In [131]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances



In [113]:
# Define the paths to your CSV files
base_path = r'C:\Users\Admin\Documents\Cinderella\csvs'

csv_file_1 = base_path + '\\patient_info.csv'
csv_file_2 = base_path + '\\catalogue_info.csv'

# Read the CSV file into a Pandas DataFrame
patient_info = pd.read_csv(csv_file_1)
catalogue_info = pd.read_csv(csv_file_2)

In [114]:
# Drop the specified columns
columns_to_drop = ['Surgery Date','Date Posted', 'Owner', 'Surgery Type','Patient Status','Radiotherapy Start Date','Radiotherapy End Date']
patient_info = patient_info.drop(columns=columns_to_drop)

catalogue_info=catalogue_info[catalogue_info['Ordered Exc Good Catalogue'].notna()]
# Rename the 'query patient' column in the second DataFrame to 'Patient ID'
catalogue_info.rename(columns={'Query Patient': 'Patient ID'}, inplace=True)

In [115]:
# Custom date parser to handle out-of-bounds dates
def custom_date_parser(date_str):
    try:
        return pd.to_datetime(date_str, format='%Y-%m-%d', errors='coerce')
    except:
        return pd.NaT

# Convert "Patient Birthday" to datetime using the custom parser
patient_info['Patient Birthday'] = patient_info['Patient Birthday'].apply(custom_date_parser)

# Function to calculate age from a datetime object
def calculate_age(birthday):
    if pd.isna(birthday):
        return None
    today = datetime.today()
    return today.year - birthday.year - ((today.month, today.day) < (birthday.month, birthday.day))

bra_cup_mapping = {'AA': 1, 'A': 2, 'B': 3, 'C': 4, 'c': 5, 'D': 6, 'DD': 7, 'E': 8, 'F': 9, 'FF': 10, 'G': 11}
patient_info['Bra Cup Numeric'] = patient_info['Bra Cup'].map(bra_cup_mapping)

# Apply the age calculation function
patient_info['Age'] = patient_info['Patient Birthday'].apply(calculate_age)

# Get the unique values of 'Bra Size' and sort them
unique_bra_sizes = sorted(patient_info['Bra Size'].dropna().unique(), key=lambda x: (isinstance(x, str), x))

# Create a mapping from each unique bra size to a unique integer starting from 1
bra_size_mapping = {size: idx + 1 for idx, size in enumerate(unique_bra_sizes)}

# Map the 'Bra Size' values to the new integer values
patient_info['Bra Size Mapped'] = patient_info['Bra Size'].map(bra_size_mapping)

patient_info = patient_info.drop(columns=['Patient Birthday','Bra Cup','Bra Size' ])


In [116]:
patient_info.dropna()

Unnamed: 0,Patient Institutional Number,Patient ID,Patient Height,Patient Weight,Had Radiotherapy,Radiotherapy Gray Total Dose,Had Radiotherapy Boost,Cancer Side,Bra Cup Numeric,Age,Bra Size Mapped
2,100032,1812,160.0,54.0,True,48.0,True,Right,8.0,54.0,9.0
3,100131,1810,153.0,73.0,True,48.0,True,Right,8.0,73.0,16.0
4,100176,1809,167.0,57.0,True,50.0,False,Left,4.0,50.0,13.0
5,100209,1808,158.0,56.0,True,48.0,True,Right,6.0,56.0,13.0
6,100278,1807,172.0,63.0,True,48.0,True,Right,1.0,52.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...
1735,99759,1266,163.0,78.0,True,48.0,True,Right,4.0,38.0,13.0
1736,99767,670,170.0,55.0,True,28.0,False,Right,1.0,54.0,10.0
1737,99874,1815,171.0,53.0,True,48.0,True,Right,1.0,60.0,9.0
1738,99910,1814,160.0,70.0,True,48.0,True,Left,4.0,49.0,15.0


In [145]:
# Only select the 'patient_id' and 'image' columns from patient_images
merged_df = pd.merge(catalogue_info, patient_info, on='Patient ID')
merged_df= merged_df.dropna()
# Display the resulting DataFrame
print(merged_df)

     Catalogue ID  Patient ID            Retrieved Exc Good Patients  \
0             152        1744       911,437,646,776,23,1209,1538,647   
1             140         648  1014,1886,1597,734,2071,1710,1892,474   
2             141         480       480,739,1815,433,949,1162,65,439   
3             142         590  1643,1130,1292,309,1161,1025,646,1754   
4             151         590  1643,1130,1292,309,1161,1025,646,1754   
..            ...         ...                                    ...   
196           393        1202  1213,1715,1144,1249,1773,582,464,1895   
198           395          84       84,281,250,1862,553,631,1306,942   
199           396        1438   1559,1158,2127,391,828,1643,856,1087   
200           402          51    51,624,574,2050,1895,2060,1776,1695   
201           404        1641        1646,583,608,78,1733,86,138,377   

               Retrieved Fair Poor Patients Random Patients Exc Good  \
0     1962,1648,1778,1243,1145,599,1312,592       739,1607,1203

In [106]:
merged_df

Unnamed: 0,Catalogue ID,Patient ID,Retrieved Exc Good Patients,Retrieved Fair Poor Patients,Random Patients Exc Good,Random Patients Fair Poor,Ordered Exc Good Catalogue,Ordered Fair Poor Catalogue,Patient Institutional Number,Patient Height,Patient Weight,Had Radiotherapy,Radiotherapy Gray Total Dose,Had Radiotherapy Boost,Cancer Side,Bra Cup Numeric,Age,Bra Size Mapped
0,152,1744,9114376467762312091538647,196216481778124311455991312592,739160712031240,160075714451411,"1607,1538,1209,739,646,647,911,1240,776,23,437...","1411,592,1312,1600,1962,1243,1648,1445,1145,59...",102758,150.0,64.0,True,48.0,True,Right,2.0,64.0,15.0
1,140,648,101418861597734207117101892474,648119332720117579201376604,18691615179564,1662176819671758,"474,2071,1597,1014,1886,1795,734,1892,1869,171...","201,648,604,327,1376,1193,1967,1662,1757,1768,...",96840,150.0,43.0,True,40.5,True,Left,3.0,56.0,10.0
2,141,480,4807391815433949116265439,92437012431312138012541734592,6711381138535,1480148044412811281,7396511624804334396753511381815949,"1281,924,1380,1312,1243,592,1254,444,370,1480,...",77064,160.0,64.0,True,40.5,True,Right,11.0,60.0,14.0
3,142,590,164311301292309116110256461754,590148085926918851561445552,73973910251643,17451778587616,309646116112927391643102517541130,"1445,1745,156,1778,587,552,859,1480,269,616,59...",92857,154.0,65.0,True,28.5,False,Right,4.0,79.0,16.0
4,151,590,164311301292309116110256461754,590148085926918851561445552,595116511651026,11591159757344,"1026,1161,309,1165,646,1292,1025,1130,1643,175...",1480757590156552144518851159269859344,92857,154.0,65.0,True,28.5,False,Right,4.0,79.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,393,1202,121317151144124917735824641895,1234114063660911891418364660,169159971778,40013232255515,"169,582,464,1213,1895,1249,1773,1144,1599,1715...","1189,609,1234,1418,2255,1140,364,660,636,1323,...",208675,158.0,82.0,True,48.0,True,Right,4.0,66.0,16.0
198,395,84,8428125018625536311306942,171632663114060914236361234,123619962461553,33662587,"553,250,942,1862,84,631,1236,1996,1306,1553,28...",3123460911406631423662636632171587,18852,59.0,156.0,True,40.5,False,Right,3.0,77.0,13.0
199,396,1438,15591158212739182816438561087,14651800159917781226737337445,3379090269,132351584662,3911087337164321271158828155985690269,"737,445,1800,1778,662,1465,337,1226,1323,1599,...",110096,168.0,52.0,True,50.5,True,Left,2.0,62.0,9.0
200,402,51,5162457420501895206017761695,859182188437098241417341242,43526918841734,1137132340284,"51,574,1895,1776,2050,624,2060,1695,435,269,17...","859,1242,982,1884,1137,1734,182,370,402,1323,8...",13055,153.0,62.0,True,48.0,True,Left,6.0,48.0,15.0


In [107]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 180 entries, 0 to 201
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Catalogue ID                  180 non-null    int64  
 1   Patient ID                    180 non-null    int64  
 2   Retrieved Exc Good Patients   180 non-null    object 
 3   Retrieved Fair Poor Patients  180 non-null    object 
 4   Random Patients Exc Good      180 non-null    object 
 5   Random Patients Fair Poor     180 non-null    object 
 6   Ordered Exc Good Catalogue    180 non-null    object 
 7   Ordered Fair Poor Catalogue   180 non-null    object 
 8   Patient Institutional Number  180 non-null    object 
 9   Patient Height                180 non-null    float64
 10  Patient Weight                180 non-null    float64
 11  Had Radiotherapy              180 non-null    object 
 12  Radiotherapy Gray Total Dose  180 non-null    float64
 13  Had Radiot

In [156]:
retrieval_df = merged_df.drop(columns=['Had Radiotherapy','Radiotherapy Gray Total Dose', 'Had Radiotherapy Boost','Retrieved Exc Good Patients','Retrieved Fair Poor Patients','Random Patients Exc Good','Random Patients Fair Poor','Patient Institutional Number','Catalogue ID'])

In [157]:
Cancer_Side = {'Right': 0, 'Left': 1, 'Bilateral':2}
retrieval_df['cancer_side'] = retrieval_df['Cancer Side'].map(Cancer_Side)
retrieval_df = retrieval_df.drop(columns='Cancer Side')

In [158]:
# Columns to normalize
columns_to_normalize = [
    'Patient Height', 'Patient Weight',
    'Bra Cup Numeric', 'Age', 'Bra Size Mapped', 'cancer_side'
]

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Normalize the specified columns
retrieval_df[columns_to_normalize] = scaler.fit_transform(retrieval_df[columns_to_normalize])
retrieval_df

Unnamed: 0,Patient ID,Ordered Exc Good Catalogue,Ordered Fair Poor Catalogue,Patient Height,Patient Weight,Bra Cup Numeric,Age,Bra Size Mapped,cancer_side
0,1744,"1607,1538,1209,739,646,647,911,1240,776,23,437...","1411,592,1312,1600,1962,1243,1648,1445,1145,59...",0.771186,0.186047,0.1,0.534483,0.933333,0.0
1,648,"474,2071,1597,1014,1886,1795,734,1892,1869,171...","201,648,604,327,1376,1193,1967,1662,1757,1768,...",0.771186,0.023256,0.2,0.396552,0.600000,0.5
2,480,7396511624804334396753511381815949,"1281,924,1380,1312,1243,592,1254,444,370,1480,...",0.855932,0.186047,1.0,0.465517,0.866667,0.0
3,590,309646116112927391643102517541130,"1445,1745,156,1778,587,552,859,1480,269,616,59...",0.805085,0.193798,0.3,0.793103,1.000000,0.0
4,590,"1026,1161,309,1165,646,1292,1025,1130,1643,175...",1480757590156552144518851159269859344,0.805085,0.193798,0.3,0.793103,1.000000,0.0
...,...,...,...,...,...,...,...,...,...
196,1202,"169,582,464,1213,1895,1249,1773,1144,1599,1715...","1189,609,1234,1418,2255,1140,364,660,636,1323,...",0.838983,0.325581,0.3,0.568966,1.000000,0.0
198,84,"553,250,942,1862,84,631,1236,1996,1306,1553,28...",3123460911406631423662636632171587,0.000000,0.899225,0.2,0.758621,0.800000,0.0
199,1438,3911087337164321271158828155985690269,"737,445,1800,1778,662,1465,337,1226,1323,1599,...",0.923729,0.093023,0.1,0.500000,0.533333,0.5
200,51,"51,574,1895,1776,2050,624,2060,1695,435,269,17...","859,1242,982,1884,1137,1734,182,370,402,1323,8...",0.796610,0.170543,0.5,0.258621,0.933333,0.5


In [159]:
retrieval_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 180 entries, 0 to 201
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Patient ID                   180 non-null    int64  
 1   Ordered Exc Good Catalogue   180 non-null    object 
 2   Ordered Fair Poor Catalogue  180 non-null    object 
 3   Patient Height               180 non-null    float64
 4   Patient Weight               180 non-null    float64
 5   Bra Cup Numeric              180 non-null    float64
 6   Age                          180 non-null    float64
 7   Bra Size Mapped              180 non-null    float64
 8   cancer_side                  180 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 18.1+ KB


In [160]:
# Drop specific columns
retrieval_similarity = retrieval_df.drop(columns=['Ordered Exc Good Catalogue', 'Ordered Fair Poor Catalogue'])

# Reset the index and drop the old index
retrieval_similarity.reset_index(drop=True, inplace=True)

In [161]:
retrieval_similarity

Unnamed: 0,Patient ID,Patient Height,Patient Weight,Bra Cup Numeric,Age,Bra Size Mapped,cancer_side
0,1744,0.771186,0.186047,0.1,0.534483,0.933333,0.0
1,648,0.771186,0.023256,0.2,0.396552,0.600000,0.5
2,480,0.855932,0.186047,1.0,0.465517,0.866667,0.0
3,590,0.805085,0.193798,0.3,0.793103,1.000000,0.0
4,590,0.805085,0.193798,0.3,0.793103,1.000000,0.0
...,...,...,...,...,...,...,...
175,1202,0.838983,0.325581,0.3,0.568966,1.000000,0.0
176,84,0.000000,0.899225,0.2,0.758621,0.800000,0.0
177,1438,0.923729,0.093023,0.1,0.500000,0.533333,0.5
178,51,0.796610,0.170543,0.5,0.258621,0.933333,0.5


In [162]:
# Normalize the data
columns_to_calculate= ['Patient Height', 'Patient Weight','cancer_side', 'Bra Cup Numeric', 'Age', 'Bra Size Mapped']

retrieval_similarity_df = retrieval_similarity[columns_to_calculate]

In [135]:
retrieval_similarity_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 180 entries, 0 to 201
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Patient Height   180 non-null    float64
 1   Patient Weight   180 non-null    float64
 2   cancer_side      176 non-null    float64
 3   Bra Cup Numeric  180 non-null    float64
 4   Age              180 non-null    float64
 5   Bra Size Mapped  180 non-null    float64
dtypes: float64(6)
memory usage: 9.8 KB


In [140]:
cancer_side_counts = retrieval_similarity_df['cancer_side'].value_counts()
cancer_side_counts

cancer_side
1.0    89
0.0    87
Name: count, dtype: int64

In [129]:
retrieval_similarity_df

Unnamed: 0,Patient Height,Patient Weight,cancer_side,Bra Cup Numeric,Age,Bra Size Mapped
0,0.771186,0.186047,0.0,0.1,0.534483,0.933333
1,0.771186,0.023256,1.0,0.2,0.396552,0.600000
2,0.855932,0.186047,0.0,1.0,0.465517,0.866667
3,0.805085,0.193798,0.0,0.3,0.793103,1.000000
4,0.805085,0.193798,0.0,0.3,0.793103,1.000000
...,...,...,...,...,...,...
196,0.838983,0.325581,0.0,0.3,0.568966,1.000000
198,0.000000,0.899225,0.0,0.2,0.758621,0.800000
199,0.923729,0.093023,1.0,0.1,0.500000,0.533333
200,0.796610,0.170543,1.0,0.5,0.258621,0.933333


In [163]:

# Calculate the Euclidean distance matrix
distance_matrix = euclidean_distances(retrieval_similarity_df)

# Function to find the most similar and dissimilar patients
def find_similar_dissimilar(patient_index, n=8):
    distances = distance_matrix[patient_index]
    
    # Get the indices of the most similar and dissimilar patients
    similar_indices = np.argsort(distances)[1:n+1]
    dissimilar_indices = np.argsort(distances)[-n:]
    
    # Get the patient IDs of the most similar and dissimilar patients
    most_similar = retrieval_similarity.iloc[similar_indices]['Patient ID'].values
    most_dissimilar = retrieval_similarity.iloc[dissimilar_indices]['Patient ID'].values
    
    return most_similar, most_dissimilar

# Add columns for the most similar and dissimilar patients
retrieval_similarity['Most Similar Patients'] = ''
retrieval_similarity['Most Dissimilar Patients'] = ''

for index in retrieval_similarity.index:
    most_similar, most_dissimilar = find_similar_dissimilar(index, n=8)
    retrieval_similarity.at[index, 'Most Similar Patients'] = ','.join(map(str, most_similar))
    retrieval_similarity.at[index, 'Most Dissimilar Patients'] = ','.join(map(str, most_dissimilar))

# Display the updated DataFrame
print(retrieval_similarity)

     Patient ID  Patient Height  Patient Weight  Bra Cup Numeric       Age  \
0          1744        0.771186        0.186047              0.1  0.534483   
1           648        0.771186        0.023256              0.2  0.396552   
2           480        0.855932        0.186047              1.0  0.465517   
3           590        0.805085        0.193798              0.3  0.793103   
4           590        0.805085        0.193798              0.3  0.793103   
..          ...             ...             ...              ...       ...   
175        1202        0.838983        0.325581              0.3  0.568966   
176          84        0.000000        0.899225              0.2  0.758621   
177        1438        0.923729        0.093023              0.1  0.500000   
178          51        0.796610        0.170543              0.5  0.258621   
179        1641        0.822034        0.224806              0.1  0.327586   

     Bra Size Mapped  cancer_side                  Most Similar

In [164]:
retrieval_similarity

Unnamed: 0,Patient ID,Patient Height,Patient Weight,Bra Cup Numeric,Age,Bra Size Mapped,cancer_side,Most Similar Patients,Most Dissimilar Patients
0,1744,0.771186,0.186047,0.1,0.534483,0.933333,0.0,52603100910251183185918591678,44749728117117376251655631
1,648,0.771186,0.023256,0.2,0.396552,0.600000,0.5,756143862117551981198121403,4804611737165563117184281
2,480,0.855932,0.186047,1.0,0.465517,0.866667,0.0,461169617661758193339511481772,15624471877168449784171281
3,590,0.805085,0.193798,0.3,0.793103,1.000000,0.0,590100962921315174217421458,625281168444715621711655631
4,590,0.805085,0.193798,0.3,0.793103,1.000000,0.0,590100962921315174217421458,625281168444715621711655631
...,...,...,...,...,...,...,...,...,...
175,1202,0.838983,0.325581,0.3,0.568966,1.000000,0.0,174217421458195621194818591859,168417149744715626251655631
176,84,0.000000,0.899225,0.2,0.758621,0.800000,0.0,2812112021181174217421744590,3683241877373631168419681655
177,1438,0.923729,0.093023,0.1,0.500000,0.533333,0.5,17551981198175664818063371072,949165517148046163184281
178,51,0.796610,0.170543,0.5,0.258621,0.933333,0.5,969938413413132085821272127,1251861625173763184171281


In [165]:
# Ensure list format function
def ensure_list_format(value):
    if isinstance(value, str):
        return list(map(int, value.split(',')))
    elif isinstance(value, list):
        return value
    else:
        return [value]

# Precision at k
def precision_at_k(y_true, y_pred, k):
    y_pred = y_pred[:k]
    if len(y_true) == 0:
        return 0.0
    return len(set(y_true) & set(y_pred)) / len(y_pred)

# Recall at k
def recall_at_k(y_true, y_pred, k):
    y_pred = y_pred[:k]
    if len(y_true) == 0:
        return 0.0
    return len(set(y_true) & set(y_pred)) / len(y_true)

# NDCG at k
def ndcg_at_k(y_true, y_pred, k):
    y_pred = y_pred[:k]
    if len(y_true) == 0:
        return 0.0
    idcg = sum([1.0 / np.log2(i + 2) for i in range(min(len(y_true), k))])
    dcg = sum([1.0 / np.log2(i + 2) for i, p in enumerate(y_pred) if p in y_true])
    return dcg / idcg

# Prepare the merged DataFrame for evaluation
merged_df = retrieval_df.copy()
merged_df['Most Similar Patients'] = retrieval_similarity['Most Similar Patients']
merged_df['Most Dissimilar Patients'] = retrieval_similarity['Most Dissimilar Patients']

# Similar images metrics
k = 8
precision_list = []
recall_list = []
ndcg_list = []

for i, row in merged_df.iterrows():
    y_true = ensure_list_format(row['Ordered Exc Good Catalogue'])
    y_pred = ensure_list_format(row['Most Similar Patients'])

    precision = precision_at_k(y_true, y_pred, k)
    recall = recall_at_k(y_true, y_pred, k)
    ndcg = ndcg_at_k(y_true, y_pred, k)

    precision_list.append(precision)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

# Create a DataFrame to store the results
evaluation_df = pd.DataFrame({
    'Patient ID': merged_df['Patient ID'],
    'precision@k': precision_list,
    'recall@k': recall_list,
    'ndcg@k': ndcg_list
})

# Calculate the average metrics
average_precision = np.mean(precision_list)
average_recall = np.mean(recall_list)
average_ndcg = np.mean(ndcg_list)

print(f'Similar images average Precision@{k}: {average_precision:.4f}')
print(f'Similar images average Recall@{k}: {average_recall:.4f}')
print(f'Similar images average NDCG@{k}: {average_ndcg:.4f}')

# For Dissimilar images, assuming we have a similar ground truth column for dissimilar patients
dissimilar_precision_list = []
dissimilar_recall_list = []
dissimilar_ndcg_list = []

for i, row in merged_df.iterrows():
    diss_y_true = ensure_list_format(row['Ordered Exc Good Catalogue'])  # Use appropriate ground truth column for dissimilar images
    diss_y_pred = ensure_list_format(row['Most Dissimilar Patients'])

    precision = precision_at_k(diss_y_true, diss_y_pred, k)
    recall = recall_at_k(diss_y_true, diss_y_pred, k)
    ndcg = ndcg_at_k(diss_y_true, diss_y_pred, k)

    dissimilar_precision_list.append(precision)
    dissimilar_recall_list.append(recall)
    dissimilar_ndcg_list.append(ndcg)

# Calculate the average metrics for dissimilar images
average_dissimilar_precision = np.mean(dissimilar_precision_list)
average_dissimilar_recall = np.mean(dissimilar_recall_list)
average_dissimilar_ndcg = np.mean(dissimilar_ndcg_list)

print(f'Dissimilar images average Precision@{k}: {average_dissimilar_precision:.4f}')
print(f'Dissimilar images average Recall@{k}: {average_dissimilar_recall:.4f}')
print(f'Dissimilar images average NDCG@{k}: {average_dissimilar_ndcg:.4f}')

Similar images average Precision@8: 0.0125
Similar images average Recall@8: 0.0084
Similar images average NDCG@8: 0.0104
Dissimilar images average Precision@8: 0.0132
Dissimilar images average Recall@8: 0.0089
Dissimilar images average NDCG@8: 0.0140


In [166]:
# Ensure list format function
def ensure_list_format(value):
    if isinstance(value, str):
        return list(map(int, value.split(',')))
    elif isinstance(value, list):
        return value
    else:
        return [value]

# Precision at k
def precision_at_k(y_true, y_pred, k):
    y_pred = y_pred[:k]
    if len(y_true) == 0:
        return 0.0
    return len(set(y_true) & set(y_pred)) / len(y_pred)

# Recall at k
def recall_at_k(y_true, y_pred, k):
    y_pred = y_pred[:k]
    if len(y_true) == 0:
        return 0.0
    return len(set(y_true) & set(y_pred)) / len(y_true)

# NDCG at k
def ndcg_at_k(y_true, y_pred, k):
    y_pred = y_pred[:k]
    if len(y_true) == 0:
        return 0.0
    idcg = sum([1.0 / np.log2(i + 2) for i in range(min(len(y_true), k))])
    dcg = sum([1.0 / np.log2(i + 2) for i, p in enumerate(y_pred) if p in y_true])
    return dcg / idcg

# Prepare the merged DataFrame for evaluation
merged_df = retrieval_df.copy()
merged_df['Most Similar Patients'] = retrieval_similarity['Most Similar Patients']
merged_df['Most Dissimilar Patients'] = retrieval_similarity['Most Dissimilar Patients']

# Similar images metrics
k = 8
precision_list = []
recall_list = []
ndcg_list = []

for i, row in merged_df.iterrows():
    y_true = ensure_list_format(row['Ordered Fair Poor Catalogue'])
    y_pred = ensure_list_format(row['Most Similar Patients'])

    precision = precision_at_k(y_true, y_pred, k)
    recall = recall_at_k(y_true, y_pred, k)
    ndcg = ndcg_at_k(y_true, y_pred, k)

    precision_list.append(precision)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

# Create a DataFrame to store the results
evaluation_df = pd.DataFrame({
    'Patient ID': merged_df['Patient ID'],
    'precision@k': precision_list,
    'recall@k': recall_list,
    'ndcg@k': ndcg_list
})

# Calculate the average metrics
average_precision = np.mean(precision_list)
average_recall = np.mean(recall_list)
average_ndcg = np.mean(ndcg_list)

print(f'Similar images average Precision@{k}: {average_precision:.4f}')
print(f'Similar images average Recall@{k}: {average_recall:.4f}')
print(f'Similar images average NDCG@{k}: {average_ndcg:.4f}')

# For Dissimilar images, assuming we have a similar ground truth column for dissimilar patients
dissimilar_precision_list = []
dissimilar_recall_list = []
dissimilar_ndcg_list = []

for i, row in merged_df.iterrows():
    diss_y_true = ensure_list_format(row['Ordered Fair Poor Catalogue'])  # Use appropriate ground truth column for dissimilar images
    diss_y_pred = ensure_list_format(row['Most Dissimilar Patients'])

    precision = precision_at_k(diss_y_true, diss_y_pred, k)
    recall = recall_at_k(diss_y_true, diss_y_pred, k)
    ndcg = ndcg_at_k(diss_y_true, diss_y_pred, k)

    dissimilar_precision_list.append(precision)
    dissimilar_recall_list.append(recall)
    dissimilar_ndcg_list.append(ndcg)

# Calculate the average metrics for dissimilar images
average_dissimilar_precision = np.mean(dissimilar_precision_list)
average_dissimilar_recall = np.mean(dissimilar_recall_list)
average_dissimilar_ndcg = np.mean(dissimilar_ndcg_list)

print(f'Dissimilar images average Precision@{k}: {average_dissimilar_precision:.4f}')
print(f'Dissimilar images average Recall@{k}: {average_dissimilar_recall:.4f}')
print(f'Dissimilar images average NDCG@{k}: {average_dissimilar_ndcg:.4f}')

Similar images average Precision@8: 0.0160
Similar images average Recall@8: 0.0111
Similar images average NDCG@8: 0.0181
Dissimilar images average Precision@8: 0.0257
Dissimilar images average Recall@8: 0.0172
Dissimilar images average NDCG@8: 0.0213
