# Naive Classifier on French clinical cases

1. **Réaliser un TF-IDF** sur l'ensemble des `cases_texte` du `df_final`.
2. **Tenter de prédire** la colonne `major_mesh_terms` à partir du TF-IDF.

## Import data

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # Barre de progression pour Jupyter
pd.set_option('display.max_colwidth', None)
import s3fs


In [3]:
if os.path.exists("../data/df_train_fr.pkl"):
    train_set = pd.read_pickle('../data/df_train_fr.pkl')
    print(train_set.shape)

if os.path.exists("../data/test_Vtranslated_fr.pkl"):
    test_set = pd.read_pickle('../data/test_Vtranslated_fr.pkl')
    print(test_set.shape)

(300, 2)
(300, 14)


In [5]:
data_train = train_set.sample(167, random_state=42)
data_test = test_set

## Naive Bayes Classifier

### Bag-of-word representation

In [6]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    data_train,
    test_size=0.1/0.9,
    random_state=42,
    shuffle=True
)

X_train = train_df['case_text']
y_train = train_df['target']

X_val   = val_df['case_text']
y_val   = val_df['target']

X_test  = data_test['case_text']
y_test  = data_test['target']


print('Train size :', X_train.shape)
print('Validation size :', X_val.shape)
print('Test size :', X_test.shape)

Train size : (148,)
Validation size : (19,)
Test size : (300,)


In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = CountVectorizer(max_features=1000)
#vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)
# fit on train data
vectorizer.fit(X_train)

# Apply it on train and dev data
X_train_counts = vectorizer.transform(X_train)
X_val_counts = vectorizer.transform(X_val)
X_test_counts = vectorizer.transform(X_test)

In [8]:
vectorizer

### OneVsRestClassifier avec CountVectorizer

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import numpy as np

# Initialize and train a One-vs-Rest logistic regression model for multi-label classification
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_train_counts, np.vstack(y_train.values))

# Get raw prediction scores (probabilities) for each label
#    predict_proba returns a list of arrays (one per class), so we stack them
y_score = np.vstack([estimator.predict_proba(X_test_counts)[:, 1] 
                     for estimator in model.estimators_]).T



#### Find the best threshold to maximize the micro F1 score

In [11]:
from sklearn.metrics import f1_score, precision_score, recall_score,  hamming_loss, classification_report

# Get raw prediction scores (probabilities) for each label
#    predict_proba returns a list of arrays (one per class), so we stack them
y_val_score = np.vstack([est.predict_proba(X_val_counts)[:,1] 
                         for est in model.estimators_]).T

# Find the best threshold to maximize the micro F1 score
thresholds = np.linspace(0,1,101)
best_thr, best_f1 = 0.0, 0.0
for thr in thresholds:
    y_val_pred = (y_val_score >= thr).astype(int)
    f1 = f1_score(np.vstack(y_val.values), y_val_pred, average='micro')
    if f1 > best_f1:
        best_f1, best_thr = f1, thr
print(f"Best threshold on VAL : {best_thr:.2f} => micro-F1={best_f1:.4f}")

# Apply the optimal threshold to obtain the final binary predictions
y_test_score = np.vstack([est.predict_proba(X_test_counts)[:,1] 
                          for est in model.estimators_]).T
y_pred_opt = (y_test_score >= best_thr).astype(int)

Best threshold on VAL : 0.15 => micro-F1=0.7200


In [12]:
# Compute global metrics on the optimal predictions
y_test_array = np.vstack(y_test.values)

# Compute global metrics on the optimal predictions
micro_precision = precision_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
micro_recall    = recall_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
f1_micro        = f1_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
f1_macro        = f1_score(y_test_array, y_pred_opt, average='macro', zero_division=0)
hamming         = hamming_loss(y_test_array, y_pred_opt)

print(f"F1 micro        : {f1_micro:.4f}")
print(f"F1 macro        : {f1_macro:.4f}")
print(f"Precision micro : {micro_precision:.4f}")
print(f"Recall micro    : {micro_recall:.4f}")
print(f"Hamming loss    : {hamming:.4f}")

# Exact match (subset accuracy): proportion of samples where all labels match exactly
exact_matches = np.all(y_pred_opt == y_test_array, axis=1)
n_exact = exact_matches.sum()
prop_exact = exact_matches.mean()
print(f"Exact match    : {n_exact} samples ({prop_exact:.4f} of the dataset)")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_array, y_pred_opt, zero_division=0))

F1 micro        : 0.2256
F1 macro        : 0.1600
Precision micro : 0.1276
Recall micro    : 0.9726
Hamming loss    : 0.6247
Exact match    : 0 samples (0.0000 of the dataset)

Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.98      0.35        63
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.32      0.98      0.48        95
           4       0.08      1.00      0.15        25
           5       0.13      1.00      0.22        37
           6       0.04      1.00      0.07        11
           7       0.13      1.00      0.24        40
           8       0.02      1.00      0.05         7
           9       0.17      0.98      0.30        52
          10       0.05      1.00      0.10        16
          11       0.15      1.00      0.27        46
          12       0.00      0.00      0.00         0
          13       0.18      0.84      0.30

### OneVsRestClassifier avec TF-IDF ngram = 2

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer_2gram = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
# fit on train data
vectorizer_2gram.fit(X_train)

# Apply it on train and dev data
X_train_2g_counts = vectorizer_2gram.transform(X_train)
X_val_2g_counts = vectorizer_2gram.transform(X_val)
X_test_2g_counts = vectorizer_2gram.transform(X_test)

In [16]:
# train on the 2-gram features
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_train_2g_counts, np.vstack(y_train.values))

# get prediction scores on the validation set (using the same 2-gram features)
y_val_score_2g = np.vstack([
    est.predict_proba(X_val_2g_counts)[:, 1]
    for est in model.estimators_
]).T

# search for the threshold that maximizes micro-F1 on validation
thresholds = np.linspace(0, 1, 101)
best_thr, best_f1 = 0.0, 0.0

for thr in thresholds:
    y_val_pred = (y_val_score_2g >= thr).astype(int)
    f1 = f1_score(np.vstack(y_val.values), y_val_pred, average='micro')
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"Best threshold on VAL : {best_thr:.2f} -> micro-F1 = {best_f1:.4f}")

# apply that threshold to your test-set 2-gram scores
y_test_score_2g = np.vstack([
    est.predict_proba(X_test_2g_counts)[:, 1]
    for est in model.estimators_
]).T

y_test_pred_opt = (y_test_score_2g >= best_thr).astype(int)



Best threshold on VAL : 0.24 -> micro-F1 = 0.6061


In [17]:
from sklearn.metrics import f1_score, precision_score, recall_score,  hamming_loss, classification_report

# Compute global metrics on the optimal predictions
y_test_array = np.vstack(y_test.values)

# 2. Calcul des métriques globales
micro_precision = precision_score(y_test_array, y_test_pred_opt, average='micro', zero_division=0)
micro_recall    = recall_score(y_test_array, y_test_pred_opt, average='micro', zero_division=0)
f1_micro        = f1_score(y_test_array, y_test_pred_opt, average='micro', zero_division=0)
f1_macro        = f1_score(y_test_array, y_test_pred_opt, average='macro', zero_division=0)
hamming         = hamming_loss(y_test_array, y_test_pred_opt)

# 3. Affichage des métriques globales
print(f"F1 micro        : {f1_micro:.4f}")
print(f"F1 macro        : {f1_macro:.4f}")
print(f"Precision micro : {micro_precision:.4f}")
print(f"Recall micro    : {micro_recall:.4f}")
print(f"Hamming loss    : {hamming:.4f}")

# Exact match (subset accuracy): proportion of samples where all labels match exactly
exact_matches = np.all(y_test_pred_opt == y_test_array, axis=1)
n_exact = exact_matches.sum()
prop_exact = exact_matches.mean()
print(f"Exact match    : {n_exact} samples ({prop_exact:.4f} of the dataset)")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_array, y_pred_opt, zero_division=0))

F1 micro        : 0.3035
F1 macro        : 0.0785
Precision micro : 0.3072
Recall micro    : 0.3000
Hamming loss    : 0.1288
Exact match    : 4 samples (0.0133 of the dataset)

Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.98      0.35        63
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.32      0.98      0.48        95
           4       0.08      1.00      0.15        25
           5       0.13      1.00      0.22        37
           6       0.04      1.00      0.07        11
           7       0.13      1.00      0.24        40
           8       0.02      1.00      0.05         7
           9       0.17      0.98      0.30        52
          10       0.05      1.00      0.10        16
          11       0.15      1.00      0.27        46
          12       0.00      0.00      0.00         0
          13       0.18      0.84      0.30

In [None]:
# 9. Extract top-k tokens per class, skipping constant predictors
feature_names = vectorizer.get_feature_names_out()
top_k = 2

for idx, clf in enumerate(model.estimators_):
    # skip any estimator with no coef_ attribute (i.e., all-negative or all-positive during training)
    if not hasattr(clf, "coef_"):
        print(f"Category '{idx}' has no learned coefficients (constant predictor), skipping.")
        continue

    coefs = clf.coef_.ravel()  # shape (n_features,)
    top_indices = np.argsort(coefs)[-top_k:][::-1]  # indices of the top k positive weights
    top_tokens = [(feature_names[i], round(coefs[i],3)) for i in top_indices]
    print(f"Category '{idx}' top tokens: {top_tokens}")