# Naive Classifier (One-VS-Rest) on English clinical cases

1. **Réaliser un TF-IDF** sur l'ensemble des `cases_texte` du `df_final`.
2. **Tenter de prédire** la colonne `major_mesh_terms` à partir du TF-IDF.

## Import data

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # Progress bar for Jupyter
# Do not truncate long cell contents (e.g. full case texts) when displaying DataFrames.
pd.set_option('display.max_colwidth', None)
import s3fs


In [2]:
# Load the pre-split train / test DataFrames from their pickle files.
#
# Fail fast with an explicit error when a split is missing: silently skipping the
# load would leave `data_train` / `data_test` undefined (or stale from an earlier
# kernel session) and only surface further down as a confusing NameError.
#
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
train_path = "../data/df_train.pkl"
test_path = "../data/df_test.pkl"

for split_path in (train_path, test_path):
    if not os.path.exists(split_path):
        raise FileNotFoundError(f"Expected dataset file not found: {split_path}")

data_train = pd.read_pickle(train_path)
print(data_train.shape)

data_test = pd.read_pickle(test_path)
print(data_test.shape)

(9646, 12)
(1072, 12)


## Naive baseline classifier (One-vs-Rest logistic regression)

### Bag-of-words representation

In [3]:
from sklearn.model_selection import train_test_split

# Sanity check: report the dimensions of both pre-made splits.
for split_name, split_df in (('train size', data_train), ('test size', data_test)):
    print(split_name, split_df.shape)

# Features are the raw case texts; targets come from the `target` column
# (presumably one multi-label indicator vector per row — TODO confirm upstream).
X_train, y_train = data_train["case_text"], data_train["target"]
X_test, y_test = data_test["case_text"], data_test["target"]

train size (9646, 12)
test size (1072, 12)


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words encoder capped at 1000 vocabulary entries. The vocabulary is learned
# on the training texts only, so nothing from the test split leaks into the features.
# (A TF-IDF alternative with n-grams is evaluated in a later section.)
vectorizer = CountVectorizer(max_features=1000).fit(X_train)

# Encode both splits with the vocabulary learned above.
X_train_counts = vectorizer.transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [5]:
# Display the fitted vectorizer's repr to check its configuration.
vectorizer

### OneVsRestClassifier with CountVectorizer

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, hamming_loss, classification_report

# Multi-label setup: One-vs-Rest fits one independent binary logistic regression
# per label column. The per-row targets are stacked into a single 2-D matrix
# (presumably a 0/1 indicator matrix of shape (n_samples, n_labels) — TODO confirm).
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_train_counts, np.vstack(y_train.values))

# Positive-class probability of every label for every test sample.
# `OneVsRestClassifier.predict_proba` already gathers each underlying estimator's
# positive-class column into one (n_samples, n_labels) array (left unnormalized in
# the multi-label case), so there is no need to loop over `model.estimators_` and
# stack the columns by hand.
y_score = model.predict_proba(X_test_counts)



In [7]:
# Select the single decision threshold that maximizes the micro-averaged F1 score.
#
# NOTE(review): the threshold is tuned against `y_test`, and the same test split is
# then used to report the final metrics, so those metrics are optimistically biased.
# Consider tuning on a validation split held out from the training data instead.
thresholds = np.linspace(0, 1, 101)  # candidate cut-offs 0.00, 0.01, ..., 1.00
best_f1 = 0.0
best_threshold = 0.0

# The ground-truth matrix does not depend on the threshold: build it once rather
# than re-stacking it on each of the 101 iterations.
y_test_array = np.vstack(y_test.values)

for thr in thresholds:
    # Binarize the scores at the current threshold
    y_pred_thr = (y_score >= thr).astype(int)
    # zero_division=0 keeps this call consistent with the evaluation metrics and
    # avoids undefined-metric warnings when a threshold yields no positive prediction.
    f1 = f1_score(y_test_array, y_pred_thr, average='micro', zero_division=0)
    # Strict comparison: on ties, the lowest threshold is kept.
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thr

print(f"Best threshold : {best_threshold:.2f} -> Micro F1 = {best_f1:.4f}")

# Apply the selected threshold to obtain the final binary predictions
y_pred_opt = (y_score >= best_threshold).astype(int)

Best threshold : 0.35 -> Micro F1 = 0.5710


In [8]:
from sklearn.metrics import (
    classification_report,
    f1_score,
    hamming_loss,
    precision_score,
    recall_score,
)

# Ground truth as one 2-D array, aligned row by row with `y_pred_opt`.
y_test_array = np.vstack(y_test.values)

# Keyword arguments shared by the micro-averaged scores; undefined ratios count as 0.
micro_kwargs = {"average": "micro", "zero_division": 0}

# Global metrics for the thresholded predictions.
hamming         = hamming_loss(y_test_array, y_pred_opt)
f1_macro        = f1_score(y_test_array, y_pred_opt, average='macro', zero_division=0)
f1_micro        = f1_score(y_test_array, y_pred_opt, **micro_kwargs)
micro_recall    = recall_score(y_test_array, y_pred_opt, **micro_kwargs)
micro_precision = precision_score(y_test_array, y_pred_opt, **micro_kwargs)

print("\n".join([
    f"F1 micro        : {f1_micro:.4f}",
    f"F1 macro        : {f1_macro:.4f}",
    f"Precision micro : {micro_precision:.4f}",
    f"Recall micro    : {micro_recall:.4f}",
    f"Hamming loss    : {hamming:.4f}",
]))

# Subset accuracy: a sample counts only when every one of its labels is predicted correctly.
exact_matches = (y_pred_opt == y_test_array).all(axis=1)
n_exact = np.sum(exact_matches)
prop_exact = np.mean(exact_matches)
print(f"Exact match    : {n_exact} samples ({prop_exact:.4f} of the dataset)")

# Per-label precision / recall / F1 breakdown.
report_text = classification_report(y_test_array, y_pred_opt, zero_division=0)
print("\nClassification Report:")
print(report_text)

F1 micro        : 0.5710
F1 macro        : 0.3854
Precision micro : 0.5574
Recall micro    : 0.5854
Hamming loss    : 0.0885
Exact match    : 115 samples (0.1073 of the dataset)

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.68      0.67       233
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.76      0.80      0.78       333
           4       0.35      0.43      0.39        89
           5       0.58      0.64      0.61       159
           6       0.30      0.36      0.33        47
           7       0.61      0.62      0.61       155
           8       0.25      0.34      0.29        32
           9       0.65      0.63      0.64       213
          10       0.69      0.51      0.58        79
          11       0.51      0.61      0.56       129
          12       0.00      0.00      0.00         0
          13       0.66      0.65      0.

### OneVsRestClassifier with TF-IDF (unigrams + bigrams)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF encoder over unigrams and bigrams, capped at 5000 vocabulary entries.
# Vocabulary and IDF weights are learned on the training texts only.
vectorizer_2gram = TfidfVectorizer(ngram_range=(1, 2), max_features=5000).fit(X_train)

# Encode both splits with the fitted encoder.
# Note: despite the `_counts` suffix, these matrices hold TF-IDF weights, not raw counts.
X_train_2g_counts = vectorizer_2gram.transform(X_train)
X_test_2g_counts = vectorizer_2gram.transform(X_test)

In [10]:
# Same One-vs-Rest logistic regression as before, trained on the TF-IDF features.
# NOTE: this rebinds `model`, `thresholds`, `best_f1`, `best_threshold` and
# `y_pred_opt`, overwriting the values produced for the bag-of-words model.
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_train_2g_counts, np.vstack(y_train.values))

# Positive-class probability of every label for every test sample.
# `OneVsRestClassifier.predict_proba` already returns one column per label
# (unnormalized in the multi-label case), so the per-estimator columns do not
# need to be collected and stacked by hand.
y_score_2 = model.predict_proba(X_test_2g_counts)

# Select the single decision threshold that maximizes the micro-averaged F1 score.
# NOTE(review): the threshold is tuned against `y_test`, and the same test split is
# then used to report the final metrics, so those metrics are optimistically biased.
# Consider tuning on a validation split held out from the training data instead.
thresholds = np.linspace(0, 1, 101)  # candidate cut-offs 0.00, 0.01, ..., 1.00
best_f1 = 0.0
best_threshold = 0.0

# The ground-truth matrix does not depend on the threshold: build it once rather
# than re-stacking it on each of the 101 iterations.
y_test_array = np.vstack(y_test.values)

for thr in thresholds:
    # Binarize the scores at the current threshold
    y_pred_thr = (y_score_2 >= thr).astype(int)
    # zero_division=0 keeps this call consistent with the evaluation metrics and
    # avoids undefined-metric warnings when a threshold yields no positive prediction.
    f1 = f1_score(y_test_array, y_pred_thr, average='micro', zero_division=0)
    # Strict comparison: on ties, the lowest threshold is kept.
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thr

print(f"Best threshold : {best_threshold:.2f} -> Micro F1 = {best_f1:.4f}")

# Apply the selected threshold to obtain the final binary predictions
y_pred_opt = (y_score_2 >= best_threshold).astype(int)



Best threshold : 0.25 -> Micro F1 = 0.6745


In [11]:
from sklearn.metrics import (
    classification_report,
    f1_score,
    hamming_loss,
    precision_score,
    recall_score,
)

# Ground truth as one 2-D array, aligned row by row with `y_pred_opt`
# (which, when cells are run in order, now holds the TF-IDF model's predictions).
y_test_array = np.vstack(y_test.values)

# Keyword arguments shared by the micro-averaged scores; undefined ratios count as 0.
micro_kwargs = {"average": "micro", "zero_division": 0}

# Global metrics for the thresholded predictions.
hamming         = hamming_loss(y_test_array, y_pred_opt)
f1_macro        = f1_score(y_test_array, y_pred_opt, average='macro', zero_division=0)
f1_micro        = f1_score(y_test_array, y_pred_opt, **micro_kwargs)
micro_recall    = recall_score(y_test_array, y_pred_opt, **micro_kwargs)
micro_precision = precision_score(y_test_array, y_pred_opt, **micro_kwargs)

print("\n".join([
    f"F1 micro        : {f1_micro:.4f}",
    f"F1 macro        : {f1_macro:.4f}",
    f"Precision micro : {micro_precision:.4f}",
    f"Recall micro    : {micro_recall:.4f}",
    f"Hamming loss    : {hamming:.4f}",
]))

# Subset accuracy: a sample counts only when every one of its labels is predicted correctly.
exact_matches = (y_pred_opt == y_test_array).all(axis=1)
n_exact = np.sum(exact_matches)
prop_exact = np.mean(exact_matches)
print(f"Exact match    : {n_exact} samples ({prop_exact:.4f} of the dataset)")

# Per-label precision / recall / F1 breakdown.
report_text = classification_report(y_test_array, y_pred_opt, zero_division=0)
print("\nClassification Report:")
print(report_text)

F1 micro        : 0.6745
F1 macro        : 0.4535
Precision micro : 0.6723
Recall micro    : 0.6766
Hamming loss    : 0.0657
Exact match    : 217 samples (0.2024 of the dataset)

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.86      0.80       233
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.79      0.89      0.84       333
           4       0.57      0.39      0.47        89
           5       0.75      0.79      0.77       159
           6       0.73      0.23      0.35        47
           7       0.76      0.70      0.73       155
           8       0.78      0.22      0.34        32
           9       0.71      0.76      0.73       213
          10       0.87      0.58      0.70        79
          11       0.66      0.70      0.68       129
          12       0.00      0.00      0.00         0
          13       0.76      0.72      0.