In [4]:
import pandas as pd 
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score, confusion_matrix


In [5]:
data=pd.read_excel('data_lineC_2.xlsx')

In [6]:
data_full=pd.read_parquet('full_data.parquet')

In [14]:
data_full

Unnamed: 0,SESSION_DATE,SniffTime,SniffThreshold,age,Cumulative Sniff Time,ReadTotalSnifftime,id_rat,rat_name,birth_date,gender,weight,temperature,id_sample,tb,hit,match_flag,evaluation_sniff_threshold,session_sniff_threshold
0,2016-11-03 00:00:00,0,0.0,2.0,,,98.0,Stephen,2014-07-21,M,,30.5,515562,False,False,,,
1,2016-11-03 00:00:00,0,0.0,2.0,,,98.0,Stephen,2014-07-21,M,,30.5,515546,False,False,,,
2,2016-11-03 00:00:00,0,0.0,2.0,,,98.0,Stephen,2014-07-21,M,,30.5,515422,False,False,,,
3,2016-11-03 00:00:00,0,0.0,2.0,,,98.0,Stephen,2014-07-21,M,,30.5,515559,False,False,,,
4,2016-11-03 00:00:00,0,0.0,2.0,,,98.0,Stephen,2014-07-21,M,,30.5,515553,False,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240218,45565,220,,,,True,,Malaika,NaT,,,21.0,694768,False,False,,0.0,2000.0
240219,45565,442,,,,True,,Malaika,NaT,,,21.0,694740,False,False,,0.0,2000.0
240220,45565,151,,,,True,,Malaika,NaT,,,21.0,694628,False,False,,0.0,2000.0
240221,45565,517,,,,True,,Malaika,NaT,,,21.0,694832,False,False,,0.0,2000.0


In [None]:
labels = sorted(data['TB'].unique())  # Ensure consistent label order

# Compute confusion matrix
cm = confusion_matrix(data['TB'], data['HIT'], labels=labels)


cm_df = pd.DataFrame(cm, index=[f"Actual: {label}" for label in labels],
                         columns=[f"Predicted: {label}" for label in labels])
print(cm_df)
print(cm)


               Predicted: False  Predicted: True
Actual: False             26561             3603
Actual: True               2077             2612
[[26561  3603]
 [ 2077  2612]]


In [16]:
cm = confusion_matrix(data_full['tb'], data_full['hit'])
cm

array([[193352,  14353],
       [ 11017,  21501]])

In [20]:

def categorize_individuals(df):
    categories = {
        'high_recall': [],
        'high_specificity': [],
        'high_precision': [],
        'low_false_negative_rate': []
    }

    individuals = df['rat_name'].unique()

    for person in individuals:
        sub_df = df[df['rat_name'] == person]
        y_true = sub_df['tb']
        y_pred = sub_df['hit']

        # Compute confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        # Compute metrics
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 1

        # Categorize based on thresholds
        if recall >= 0.8:
            categories['high_recall'].append(person)
        if specificity >= 0.8:
            categories['high_specificity'].append(person)
        if precision >= 0.8:
            categories['high_precision'].append(person)
        if false_negative_rate <= 0.2:
            categories['low_false_negative_rate'].append(person)

    return categories


In [21]:
categorize_individuals(data_full)

{'high_recall': ['Daudi', 'Doreen', 'Maui', 'Moureen', 'Rui'],
 'high_specificity': ['Stephen',
  'Catia',
  'Hakim',
  'Happy',
  'Pink',
  'Petrobas',
  'Mkuta',
  'Riziwani',
  'Julius',
  'Genovive',
  'Nala',
  'Maliwaza',
  'Daudi',
  'Doreen',
  'Kahu',
  'Maui',
  'Moana',
  'Moureen',
  'Ngaio',
  'Rui',
  'Samiry',
  'Ella',
  'Chilleta',
  'Tamasha',
  'Bertha',
  'Malaika',
  'Splinter',
  'Bieber',
  'Tivane',
  'Mayele',
  'Chamy',
  'Tirunesh',
  'Salvina',
  'Gea',
  'Orpheus',
  'Kenenisa'],
 'high_precision': [],
 'low_false_negative_rate': ['Daudi', 'Doreen', 'Maui', 'Moureen', 'Rui']}

In [24]:
data_full['rat_name'].unique()

array(['Stephen', 'Catia', 'Hakim', 'Happy', 'Pink', 'Petrobas', 'Mkuta',
       'Riziwani', 'Julius', 'Genovive', 'Nala', 'Maliwaza', 'Daudi',
       'Doreen', 'Kahu', 'Maui', 'Moana', 'Moureen', 'Ngaio', 'Rui',
       'Samiry', 'Ella', 'Chilleta', 'Tamasha', 'Bertha', 'Malaika',
       'Splinter', 'Bieber', 'Tivane', 'Mayele', 'Chamy', 'Tirunesh',
       'Salvina', 'Gea', 'Orpheus', 'Kenenisa'], dtype=object)

In [25]:

def compute_metrics_per_individual(df, ground_truth_col='tb', prediction_col='hit', individual_col='rat_name'):

    # Get unique individuals
    individuals = df[individual_col].unique()
    results = []

    for person in individuals:
        # Filter data for the individual
        person_df = df[df[individual_col] == person]

        # Extract ground truth and predictions
        y_true = person_df[ground_truth_col]
        y_pred = person_df[prediction_col]

        # Compute metrics
        precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
        accuracy = accuracy_score(y_true, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

        # Append results
        results.append({
            'Individual': person,
            'Precision': round(precision, 3),
            'Recall': round(recall, 3),
            'F1 Score': round(f1, 3),
            'Accuracy': round(accuracy, 3),
            'Balanced Accuracy': round(balanced_accuracy, 3)
        })

    # Create a DataFrame for display
    metrics_df = pd.DataFrame(results)
    return metrics_df



In [26]:
compute_metrics_per_individual(data_full)    

Unnamed: 0,Individual,Precision,Recall,F1 Score,Accuracy,Balanced Accuracy
0,Stephen,0.854,0.835,0.843,0.835,0.702
1,Catia,0.867,0.83,0.844,0.83,0.744
2,Hakim,0.86,0.853,0.856,0.853,0.704
3,Happy,0.869,0.83,0.844,0.83,0.751
4,Pink,0.861,0.868,0.864,0.868,0.689
5,Petrobas,0.809,0.8,0.804,0.8,0.653
6,Mkuta,0.868,0.829,0.844,0.829,0.748
7,Riziwani,0.884,0.816,0.838,0.816,0.795
8,Julius,0.828,0.825,0.826,0.825,0.629
9,Genovive,0.865,0.847,0.855,0.847,0.728


In [27]:

from sklearn.preprocessing import LabelEncoder

def cluster_individuals(df, ground_truth_col='tb', prediction_col='hit', individual_col='rat_name'):
    # Encode labels if they are not binary
    le = LabelEncoder()
    df[ground_truth_col] = le.fit_transform(df[ground_truth_col])
    df[prediction_col] = le.transform(df[prediction_col])

    # Initialize metrics storage
    metrics = []

    # Compute metrics per individual
    for name, group in df.groupby(individual_col):
        y_true = group[ground_truth_col]
        y_pred = group[prediction_col]

        # Confusion matrix: TN, FP, FN, TP
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        # Avoid division by zero
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        fn_rate = fn / (fn + tp) if (fn + tp) > 0 else 0
        fp_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

        metrics.append({
            'individual': name,
            'false_negative_rate': fn_rate,
            'false_positive_rate': fp_rate,
            'precision': precision
        })

    # Create DataFrame
    metrics_df = pd.DataFrame(metrics)

    # Normalize metrics for clustering
    norm_df = metrics_df[['false_negative_rate', 'false_positive_rate', 'precision']]
    norm_df = (norm_df - norm_df.min()) / (norm_df.max() - norm_df.min())

    # Assign clusters based on best performance
    clusters = []
    for _, row in norm_df.iterrows():
        if row['false_negative_rate'] <= row['false_positive_rate'] and row['false_negative_rate'] <= (1 - row['precision']):
            clusters.append('Minimize False Negative')
        elif row['false_positive_rate'] <= row['false_negative_rate'] and row['false_positive_rate'] <= (1 - row['precision']):
            clusters.append('Minimize False Positive')
        else:
            clusters.append('Maximize Precision')

    metrics_df['cluster'] = clusters

    # Display results
    print("Individual Clustering Based on Prediction Performance:\n")
    print(metrics_df[['individual', 'cluster']].to_string(index=False))


In [28]:
cluster_individuals(data_full)

Individual Clustering Based on Prediction Performance:

individual                 cluster
    Bertha Minimize False Negative
    Bieber Minimize False Negative
     Catia Minimize False Negative
     Chamy      Maximize Precision
  Chilleta Minimize False Negative
     Daudi Minimize False Negative
    Doreen Minimize False Negative
      Ella Minimize False Negative
       Gea      Maximize Precision
  Genovive Minimize False Negative
     Hakim Minimize False Negative
     Happy Minimize False Negative
    Julius      Maximize Precision
      Kahu Minimize False Negative
  Kenenisa Minimize False Negative
   Malaika Minimize False Negative
  Maliwaza Minimize False Negative
      Maui Minimize False Negative
    Mayele      Maximize Precision
     Mkuta Minimize False Negative
     Moana Minimize False Positive
   Moureen Minimize False Negative
      Nala Minimize False Negative
     Ngaio Minimize False Positive
   Orpheus      Maximize Precision
  Petrobas Minimize False Negative