# Naive Classifier on French clinical cases

1. **Réaliser un TF-IDF** sur l'ensemble des `cases_texte` du `df_final`.
2. **Tenter de prédire** la colonne `major_mesh_terms` à partir du TF-IDF.

## Import data

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # Barre de progression pour Jupyter
pd.set_option('display.max_colwidth', None)
import s3fs


In [2]:
if os.path.exists("../data/df_train_fr.pkl"):
    train_set = pd.read_pickle('../data/df_train_fr.pkl')
    print(data_train.shape)

if os.path.exists("../data/test_Vtranslated_fr.pkl"):
    test_set = pd.read_pickle('../data/test_Vtranslated_fr.pkl')
    print(data_test.shape)

NameError: name 'data_train' is not defined

In [None]:
data_train = train_set.sample(167, random_state=42)
data_test = test_set

## Naive Bayes Classifier

### Bag-of-word representation

In [None]:
from sklearn.model_selection import train_test_split

print ('train size',data_train.shape)
print ('test size', data_test.shape)

# create features X and target y
X_train = data_train.case_text
X_test = data_test.case_text

y_train = data_train.target
y_test = data_test.target

train size (167, 2)
test size (300, 14)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = CountVectorizer(max_features=1000)
#vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)
# fit on train data
vectorizer.fit(X_train)

# Apply it on train and dev data
X_train_counts = vectorizer.transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [None]:
vectorizer

### OneVsRestClassifier avec CountVectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import numpy as np

# Initialize and train a One-vs-Rest logistic regression model for multi-label classification
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_train_counts, np.vstack(y_train.values))

# Get raw prediction scores (probabilities) for each label
#    predict_proba returns a list of arrays (one per class), so we stack them
y_score = np.vstack([estimator.predict_proba(X_test_counts)[:, 1] 
                     for estimator in model.estimators_]).T



In [None]:
# Find the best threshold to maximize the micro F1 score
thresholds = np.linspace(0, 1, 101)
best_f1 = 0.0
best_threshold = 0.0

for thr in thresholds:
    # Binarize predictions at the current threshold
    y_pred_thr = (y_score >= thr).astype(int)
    # Compute micro F1
    f1 = f1_score(np.vstack(y_test.values), y_pred_thr, average='micro')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thr

print(f"Best threshold : {best_threshold:.2f} -> Micro F1 = {best_f1:.4f}")

# Apply the optimal threshold to obtain the final binary predictions
y_pred_opt = (y_score >= best_threshold).astype(int)

Best threshold : 0.55 -> Micro F1 = 0.2741


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score,  hamming_loss, classification_report

# Compute global metrics on the optimal predictions
y_test_array = np.vstack(y_test.values)

# Compute global metrics on the optimal predictions
micro_precision = precision_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
micro_recall    = recall_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
f1_micro        = f1_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
f1_macro        = f1_score(y_test_array, y_pred_opt, average='macro', zero_division=0)
hamming         = hamming_loss(y_test_array, y_pred_opt)

print(f"F1 micro        : {f1_micro:.4f}")
print(f"F1 macro        : {f1_macro:.4f}")
print(f"Precision micro : {micro_precision:.4f}")
print(f"Recall micro    : {micro_recall:.4f}")
print(f"Hamming loss    : {hamming:.4f}")

# Exact match (subset accuracy): proportion of samples where all labels match exactly
exact_matches = np.all(y_pred_opt == y_test_array, axis=1)
n_exact = exact_matches.sum()
prop_exact = exact_matches.mean()
print(f"Exact match    : {n_exact} samples ({prop_exact:.4f} of the dataset)")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_array, y_pred_opt, zero_division=0))

F1 micro        : 0.2741
F1 macro        : 0.1770
Precision micro : 0.1991
Recall micro    : 0.4397
Hamming loss    : 0.2179
Exact match    : 1 samples (0.0033 of the dataset)

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.51      0.43        63
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.51      0.42      0.46        95
           4       0.06      0.04      0.05        25
           5       0.24      0.22      0.23        37
           6       0.14      0.09      0.11        11
           7       0.17      0.78      0.27        40
           8       0.25      0.14      0.18         7
           9       0.22      0.37      0.27        52
          10       0.40      0.38      0.39        16
          11       0.15      0.63      0.24        46
          12       0.00      0.00      0.00         0
          13       0.09      0.02      0.03

### OneVsRestClassifier avec TF-IDF ngram = 2

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer_2gram = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
# fit on train data
vectorizer_2gram.fit(X_train)

# Apply it on train and dev data
X_train_2g_counts = vectorizer_2gram.transform(X_train)
X_test_2g_counts = vectorizer_2gram.transform(X_test)

In [None]:
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_train_2g_counts, np.vstack(y_train.values))

y_score_2 = np.vstack([estimator.predict_proba(X_test_2g_counts)[:, 1] 
                     for estimator in model.estimators_]).T
thresholds = np.linspace(0, 1, 101)
best_f1 = 0.0
best_threshold = 0.0

for thr in thresholds:
    # Binarize predictions at the current threshold
    y_pred_thr = (y_score_2 >= thr).astype(int)
    # Compute micro F1
    f1 = f1_score(np.vstack(y_test.values), y_pred_thr, average='micro')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thr

print(f"Best threshold : {best_threshold:.2f} -> Micro F1 = {best_f1:.4f}")

# Apply the optimal threshold to obtain the final binary predictions
y_pred_opt = (y_score_2 >= best_threshold).astype(int)



Best threshold : 0.22 -> Micro F1 = 0.3169


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score,  hamming_loss, classification_report

# Compute global metrics on the optimal predictions
y_test_array = np.vstack(y_test.values)

# 2. Calcul des métriques globales
micro_precision = precision_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
micro_recall    = recall_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
f1_micro        = f1_score(y_test_array, y_pred_opt, average='micro', zero_division=0)
f1_macro        = f1_score(y_test_array, y_pred_opt, average='macro', zero_division=0)
hamming         = hamming_loss(y_test_array, y_pred_opt)

# 3. Affichage des métriques globales
print(f"F1 micro        : {f1_micro:.4f}")
print(f"F1 macro        : {f1_macro:.4f}")
print(f"Precision micro : {micro_precision:.4f}")
print(f"Recall micro    : {micro_recall:.4f}")
print(f"Hamming loss    : {hamming:.4f}")

# Exact match (subset accuracy): proportion of samples where all labels match exactly
exact_matches = np.all(y_pred_opt == y_test_array, axis=1)
n_exact = exact_matches.sum()
prop_exact = exact_matches.mean()
print(f"Exact match    : {n_exact} samples ({prop_exact:.4f} of the dataset)")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_array, y_pred_opt, zero_division=0))

F1 micro        : 0.3169
F1 macro        : 0.1009
Precision micro : 0.2409
Recall micro    : 0.4630
Hamming loss    : 0.1868
Exact match    : 0 samples (0.0000 of the dataset)

Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.98      0.35        63
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.32      1.00      0.48        95
           4       0.00      0.00      0.00        25
           5       0.00      0.00      0.00        37
           6       0.00      0.00      0.00        11
           7       1.00      0.03      0.05        40
           8       0.00      0.00      0.00         7
           9       0.17      0.88      0.29        52
          10       0.00      0.00      0.00        16
          11       0.00      0.00      0.00        46
          12       0.00      0.00      0.00         0
          13       0.22      0.55      0.31

In [None]:
# 9. Extract top-k tokens per class, skipping constant predictors
feature_names = vectorizer.get_feature_names_out()
top_k = 2

for idx, clf in enumerate(model.estimators_):
    # skip any estimator with no coef_ attribute (i.e., all-negative or all-positive during training)
    if not hasattr(clf, "coef_"):
        print(f"Category '{idx}' has no learned coefficients (constant predictor), skipping.")
        continue

    coefs = clf.coef_.ravel()  # shape (n_features,)
    top_indices = np.argsort(coefs)[-top_k:][::-1]  # indices of the top k positive weights
    top_tokens = [(feature_names[i], round(coefs[i],3)) for i in top_indices]
    print(f"Category '{idx}' top tokens: {top_tokens}")