# Naive Classifier (One-VS-Rest) on English clinical cases

1. **Réaliser un TF-IDF** sur l'ensemble des `cases_texte` du `df_final`.
2. **Tenter de prédire** la colonne `major_mesh_terms` à partir du TF-IDF.

## Import data

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # Barre de progression pour Jupyter
pd.set_option('display.max_colwidth', None)
import s3fs


In [2]:
if os.path.exists("../data/df_train.pkl"):
    data_train = pd.read_pickle('../data/df_train.pkl')
    print(data_train.shape)

if os.path.exists("../data/df_test.pkl"):
    data_test = pd.read_pickle('../data/df_test.pkl')
    print(data_test.shape)

(9646, 12)
(1072, 12)


## Naive Bayes Classifier

### Bag-of-word representation

In [10]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    data_train,
    test_size=0.1/0.9,
    random_state=42,
    shuffle=True
)

X_train = train_df['case_text']
y_train = train_df['target']

X_val   = val_df['case_text']
y_val   = val_df['target']

X_test  = data_test['case_text']
y_test  = data_test['target']


print('Train size :', X_train.shape)
print('Validation size :', X_val.shape)
print('Test size :', X_test.shape)

Train size : (8574,)
Validation size : (1072,)
Test size : (1072,)


In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = CountVectorizer(max_features=1000)
#vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)
# fit on train data
vectorizer.fit(X_train)

# Apply it on train and dev data
X_train_counts = vectorizer.transform(X_train)
X_val_counts = vectorizer.transform(X_val)
X_test_counts = vectorizer.transform(X_test)

In [12]:
vectorizer

### OneVsRestClassifier avec CountVectorizer

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, hamming_loss, classification_report

# Initialize and train a One-vs-Rest logistic regression model for multi-label classification
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_train_counts, np.vstack(y_train.values))



#### Determining the best threshold to maximize the micro F1 score


In [20]:
# Get raw prediction scores (probabilities) for each label
#    predict_proba returns a list of arrays (one per class), so we stack them
y_val_score = np.vstack([est.predict_proba(X_val_counts)[:,1] 
                         for est in model.estimators_]).T

# Find the best threshold to maximize the micro F1 score
thresholds = np.linspace(0,1,101)
best_thr, best_f1 = 0.0, 0.0
for thr in thresholds:
    y_val_pred = (y_val_score >= thr).astype(int)
    f1 = f1_score(np.vstack(y_val.values), y_val_pred, average='micro')
    if f1 > best_f1:
        best_f1, best_thr = f1, thr
print(f"Best threshold on VAL : {best_thr:.2f} => micro-F1={best_f1:.4f}")

# Apply the optimal threshold to obtain the final binary predictions
y_test_score = np.vstack([est.predict_proba(X_test_counts)[:,1] 
                          for est in model.estimators_]).T
y_pred_opt = (y_test_score >= best_thr).astype(int)

Best threshold on VAL : 0.31 => micro-F1=0.5486


In [21]:
from sklearn.metrics import f1_score, precision_score, recall_score,  hamming_loss, classification_report

# Compute global metrics on the optimal predictions
y_test_array = np.vstack(y_test.values)

# Compute global metrics on the optimal predictions
micro_precision = precision_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
micro_recall    = recall_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
f1_micro        = f1_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
f1_macro        = f1_score(y_test_array, y_pred_opt, average='macro', zero_division=0)
hamming         = hamming_loss(y_test_array, y_pred_opt)

print(f"F1 micro        : {f1_micro:.4f}")
print(f"F1 macro        : {f1_macro:.4f}")
print(f"Precision micro : {micro_precision:.4f}")
print(f"Recall micro    : {micro_recall:.4f}")
print(f"Hamming loss    : {hamming:.4f}")

# Exact match (subset accuracy): proportion of samples where all labels match exactly
exact_matches = np.all(y_pred_opt == y_test_array, axis=1)
n_exact = exact_matches.sum()
prop_exact = exact_matches.mean()
print(f"Exact match    : {n_exact} samples ({prop_exact:.4f} of the dataset)")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_array, y_pred_opt, zero_division=0))

F1 micro        : 0.5590
F1 macro        : 0.3780
Precision micro : 0.5233
Recall micro    : 0.6000
Hamming loss    : 0.0953
Exact match    : 96 samples (0.0896 of the dataset)

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.71      0.67       233
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.74      0.80      0.77       333
           4       0.34      0.48      0.40        89
           5       0.56      0.66      0.60       159
           6       0.33      0.38      0.36        47
           7       0.55      0.61      0.58       155
           8       0.26      0.31      0.29        32
           9       0.61      0.63      0.62       213
          10       0.67      0.61      0.64        79
          11       0.48      0.64      0.54       129
          12       0.00      0.00      0.00         0
          13       0.67      0.66      0.6

### OneVsRestClassifier avec TF-IDF ngram = 2

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer_2gram = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
# fit on train data
vectorizer_2gram.fit(X_train)

# Apply it on train and dev data
X_train_2g_counts = vectorizer_2gram.transform(X_train)
X_val_2g_counts = vectorizer_2gram.transform(X_val)
X_test_2g_counts = vectorizer_2gram.transform(X_test)

In [28]:
# train on the 2-gram features
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_train_2g_counts, np.vstack(y_train.values))

# get prediction scores on the validation set (using the same 2-gram features)
y_val_score_2g = np.vstack([
    est.predict_proba(X_val_2g_counts)[:, 1]
    for est in model.estimators_
]).T

# search for the threshold that maximizes micro-F1 on validation
thresholds = np.linspace(0, 1, 101)
best_thr, best_f1 = 0.0, 0.0

for thr in thresholds:
    y_val_pred = (y_val_score_2g >= thr).astype(int)
    f1 = f1_score(np.vstack(y_val.values), y_val_pred, average='micro')
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"Best threshold on VAL : {best_thr:.2f} -> micro-F1 = {best_f1:.4f}")

# apply that threshold to your test-set 2-gram scores
y_test_score_2g = np.vstack([
    est.predict_proba(X_test_2g_counts)[:, 1]
    for est in model.estimators_
]).T

y_test_pred_opt = (y_test_score_2g >= best_thr).astype(int)



Best threshold on VAL : 0.25 -> micro-F1 = 0.6634


In [29]:
from sklearn.metrics import f1_score, precision_score, recall_score,  hamming_loss, classification_report

# Compute global metrics on the optimal predictions
y_test_array = np.vstack(y_test.values)

# Compute global metrics on the optimal predictions
micro_precision = precision_score(y_test_pred_opt, y_pred_opt, average='micro', zero_division=0)
micro_recall    = recall_score(y_test_pred_opt, y_pred_opt, average='micro', zero_division=0)
f1_micro        = f1_score(y_test_pred_opt, y_pred_opt, average='micro', zero_division=0)
f1_macro        = f1_score(y_test_pred_opt, y_pred_opt, average='macro', zero_division=0)
hamming         = hamming_loss(y_test_pred_opt, y_pred_opt)

print(f"F1 micro        : {f1_micro:.4f}")
print(f"F1 macro        : {f1_macro:.4f}")
print(f"Precision micro : {micro_precision:.4f}")
print(f"Recall micro    : {micro_recall:.4f}")
print(f"Hamming loss    : {hamming:.4f}")

# Exact match (subset accuracy): proportion of samples where all labels match exactly
exact_matches = np.all(y_test_pred_opt == y_test_array, axis=1)
n_exact = exact_matches.sum()
prop_exact = exact_matches.mean()
print(f"Exact match    : {n_exact} samples ({prop_exact:.4f} of the dataset)")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_array, y_pred_opt, zero_division=0))

F1 micro        : 0.6988
F1 macro        : 0.4557
Precision micro : 0.6536
Recall micro    : 0.7507
Hamming loss    : 0.0650
Exact match    : 220 samples (0.2052 of the dataset)

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.71      0.67       233
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.74      0.80      0.77       333
           4       0.34      0.48      0.40        89
           5       0.56      0.66      0.60       159
           6       0.33      0.38      0.36        47
           7       0.55      0.61      0.58       155
           8       0.26      0.31      0.29        32
           9       0.61      0.63      0.62       213
          10       0.67      0.61      0.64        79
          11       0.48      0.64      0.54       129
          12       0.00      0.00      0.00         0
          13       0.67      0.66      0.

In [None]:
# 9. Extract top-k tokens per class, skipping constant predictors
feature_names = vectorizer.get_feature_names_out()
top_k = 2

for idx, clf in enumerate(model.estimators_):
    # skip any estimator with no coef_ attribute (i.e., all-negative or all-positive during training)
    if not hasattr(clf, "coef_"):
        print(f"Category '{idx}' has no learned coefficients (constant predictor), skipping.")
        continue

    coefs = clf.coef_.ravel()  # shape (n_features,)
    top_indices = np.argsort(coefs)[-top_k:][::-1]  # indices of the top k positive weights
    top_tokens = [(feature_names[i], round(coefs[i],3)) for i in top_indices]
    print(f"Category '{idx}' top tokens: {top_tokens}")


Category '0' top tokens: [('covid', 1.203), ('abscess', 0.93)]
Category '1' has no learned coefficients (constant predictor), skipping.
Category '2' has no learned coefficients (constant predictor), skipping.
Category '3' top tokens: [('metastatic', 1.684), ('metastases', 1.111)]
Category '4' top tokens: [('consent', 0.671), ('short', 0.624)]
Category '5' top tokens: [('hepatitis', 1.051), ('gastrointestinal', 0.811)]
Category '6' top tokens: [('facial', 1.257), ('having', 1.239)]
Category '7' top tokens: [('sars', 1.855), ('covid', 1.813)]
Category '8' top tokens: [('nasal', 1.7), ('external', 1.245)]
Category '9' top tokens: [('temporal', 0.692), ('brain', 0.658)]
Category '10' top tokens: [('acuity', 1.367), ('replacement', 1.363)]
Category '11' top tokens: [('hiv', 1.356), ('pelvic', 0.837)]
Category '12' has no learned coefficients (constant predictor), skipping.
Category '13' top tokens: [('aneurysm', 1.269), ('echocardiography', 1.153)]
Category '14' top tokens: [('lymphoma', 1.