Train classifier of primary endpoint type in EUCT-NS dataset using CNB with TF-IDF Features

In [1]:
import pandas as pd

import numpy as np 
from numpy import mean, std

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import ComplementNB 
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, classification_report
import joblib

import nltk

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import tqdm as notebook_tqdm

import joblib

Train the classifier

In [2]:
euct_ns = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\euct_ns.csv', encoding='unicode_escape')

In [3]:
euct_ns['concat_corpus'] = euct_ns['Title']+ " " + euct_ns['Objective'] + " " + euct_ns['pr_endpoint'] + " " + euct_ns['endpoint_description']

In [4]:
euct_ns['concat_corpus'] = euct_ns['concat_corpus'].fillna('')

In [5]:
vectorizer = TfidfVectorizer(analyzer='word', min_df=10, ngram_range=(1,3))
tfidf_matrix = vectorizer.fit_transform(euct_ns['concat_corpus'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [6]:
X = tfidf_df
y = euct_ns['manual_label'].values

In [7]:
joblib.dump(vectorizer, "tf_idf embeddings.pkl")

['tf_idf embeddings.pkl']

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [9]:
cnb = ComplementNB()
pipeline = make_pipeline(TfidfVectorizer(analyzer='word', min_df=10, ngram_range=(1,3)), cnb)

joblib.dump(pipeline, "CNB_pipeline.pkl")

['CNB_pipeline.pkl']

In [10]:
cnb = ComplementNB(alpha = 1.0, fit_prior = True)

In [11]:
# Train the model on the training data
cnb.fit(X_train, y_train)

# Make predictions on the test data
y_pred = cnb.predict(X_test)

In [12]:
joblib.dump(cnb, "model.pkl")

['model.pkl']

In [None]:
print(y_test)

In [None]:
print(y_pred) # There is no cases of intermediate outcomes in the pred set. Do I re-run it?

In [None]:
accuracy_weighted = accuracy_score(y_test, y_pred)
precision_weighted = precision_score(y_test, y_pred, average='weighted')
recall_weighted = recall_score(y_test, y_pred, average='weighted')
f1_weighted = f1_score(y_test, y_pred, average='weighted')

# Get the predicted probabilities
y_pred_proba = cnb.predict_proba(X_test)

# Calculate AUROC score using predicted probabilities
auroc_weighted = roc_auc_score(y_test, y_pred_proba, average='weighted', multi_class='ovr')

accuracy_unweighted = accuracy_score(y_test, y_pred)
classification_metrics = classification_report(y_test, y_pred, output_dict=True)

#Print metrics
print(f'Accuracy (Weighted): {accuracy_weighted:.2f}')
print(f'Precision (Weighted): {precision_weighted:.2f}, Mean={np.mean(precision_weighted):.2f} , Std={np.std(precision_weighted):.2f}')
print(f'Recall (Weighted): {recall_weighted:.2f}, Mean={np.mean(recall_weighted):.2f}, Std={np.std(recall_weighted):.2f} ')
print(f'F1 Score (Weighted): {f1_weighted:.2f}, Mean={np.mean(f1_weighted):.2f} ,Std={np.std(f1_weighted):.2f}')
print(f'AUROC (Weighted): {auroc_weighted:.2f}, Mean={np.mean(auroc_weighted):.2f}, Std={np.std(auroc_weighted):.2f}')
print()

print(f'Accuracy (Unweighted): {accuracy_unweighted:.2f}')
print("Precision, Recall, and F1 Score by Class:")
for cls, metrics in classification_metrics.items():
    if cls.isdigit():  # Filter class-specific metrics
        print(f"  Class {cls}: Precision={metrics['precision']:.2f}, Recall={metrics['recall']:.2f}, F1 Score={metrics['f1-score']:.2f}")

In [None]:
from sklearn.metrics import roc_curve

# Calculate the AUC - ROC score
roc_auc = roc_auc_score(y_test, y_pred_proba, average = 'weighted', multi_class='ovr')

# Compute ROC curve for each class
for i in range(cnb.classes_.shape[0]):
	fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, i], pos_label=i)
	plt.plot(fpr, tpr, lw=2, label=f'Class {i} (AUC = {roc_auc:.2f})')

# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Customize the plot
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison Between Classes')
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Compute precision-recall curve for each class
for i in range(cnb.classes_.shape[0]):
	y_prob_train = cnb.predict_proba(X_train)[:, i]
	y_prob_test = cnb.predict_proba(X_test)[:, i]

	precision, recall, thresholds = precision_recall_curve(y_train == i, y_prob_train)
	plt.plot(recall, precision, lw=2, label=f'Class {i}')

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Train Precision-Recall curve")
plt.legend(loc="best")
plt.show()

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=cnb.classes_)
disp.plot(cmap=plt.cm.Greens)
plt.title('Confusion Matrix of CNB Performance on EUCT Dataset')
plt.show()

Apply CNB model to NS-HRA dataset

In [13]:
ns_hra = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra.csv', encoding='unicode_escape')

In [None]:
display(ns_hra)
# Did not clean long-term follow up because it is not included in the feature set.

In [14]:
ns_hra['concat_corpus'] = ns_hra['Title']+ " " + ns_hra['Objective'] + " " + ns_hra['1ry_endpoint'] 

In [15]:
ns_hra['concat_corpus'] = ns_hra['concat_corpus'].fillna('')

In [None]:
print(ns_hra['concat_corpus'].head())

In [16]:
vectorizer = joblib.load("tf_idf embeddings.pkl")

In [None]:
print(type(vectorizer))

In [17]:
X2 = vectorizer.transform(ns_hra['concat_corpus'])

In [18]:
model = joblib.load('model.pkl')

In [19]:
model.predict(X2)



array([1, 0, 2, 2, 2, 2, 1, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0,
       2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 1, 2, 1, 0, 2, 2, 1, 0, 0, 2, 1, 2,
       0, 0, 2, 0, 2, 0, 1, 2, 1, 0, 0, 2, 1, 1, 2, 0, 1, 0, 1, 1, 2, 2,
       1, 2, 0, 1, 0, 1, 0, 1, 2, 1, 0, 0, 0, 2, 1, 0, 1, 1, 2, 2, 1, 2,
       1, 0, 1, 0, 2, 1, 2, 2, 1, 0, 1, 2, 0, 1, 0, 2, 0, 1, 0, 2, 1, 1,
       1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 1, 0, 0, 0, 2,
       0, 2, 2, 0, 1, 0, 2, 2, 0, 2, 0, 1, 1, 0, 2, 0, 2, 2, 2, 0, 0, 1,
       2, 1, 1, 2, 0, 2, 1, 1, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 1, 1, 2, 1,
       0, 1, 1, 0, 2, 1, 2, 0, 2, 2, 0, 0, 1, 2, 0, 0, 2, 2, 0, 0, 0, 0,
       2, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 2, 0, 2, 0, 1, 2, 0, 1, 1, 2,
       0, 1, 1, 2, 1, 0, 2, 2, 1, 2, 2, 1, 2, 1, 1, 0, 2, 0, 0, 1, 2, 0,
       0, 2, 2, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 2, 2, 0, 2,
       1, 0, 2, 1, 2, 0, 1, 0, 0, 2, 0, 1, 1, 2, 0, 2, 1, 0, 0, 1, 0, 2,
       2, 2, 2, 2, 1, 2, 0, 2, 0, 2, 0, 1, 0, 0, 2,

In [20]:
y_pred = model.predict(X2)

In [21]:
feature_names = vectorizer.get_feature_names_out()
X2_df = pd.DataFrame(X2.toarray(), columns=feature_names)

In [22]:
y_pred = model.predict(X2_df)
# To fix the warning above

In [None]:
unique, counts = np.unique(y_pred, return_counts=True)
print(dict(zip(unique, counts)))

In [None]:
confidences = model.predict_proba(X2_df).max(axis=1)
print(f"Average confidence: {np.mean(confidences):.2f}")

In [None]:
surrogate_confidence_class_2 = model.predict_proba(X2_df.iloc[[2]])[0][2]
print(f"Confidence for predicting class 2: {surrogate_confidence_class_2:.2f}")

In [None]:
low_confidence_indices = np.where(confidences < 0.6)[0]
print(f"Low confidence predictions: {len(low_confidence_indices)}")

Active learning

1. Least confidence sampling

In [23]:
# Define the unlabeled dataset
X2_unlabeled = X2_df.copy()

# Obtain probabilities and calculate uncertainty scores
probs = model.predict_proba(X2_unlabeled)
uncertainty_scores = 1 - probs.max(axis=1)  # Least confidence sampling

# Select the top N most uncertain samples
n_samples = 7
most_uncertain_indices = uncertainty_scores.argsort()[-n_samples:]

# Extract the corresponding Unique_IDs and preprocessed concatenated text
X_initial_raw = ns_hra.iloc[most_uncertain_indices]

def manually_label_samples(selected_rows):
    labels = []
    for idx, row in selected_rows.iterrows():
        print(f"Unique_ID: {row['Unique_ID']}")
        print(f"Preprocessed Text: {row['concat_corpus']}\n")
        label = input("Enter label for this sample (e.g., 0, 1, 2): ")
        labels.append(int(label))
    return labels

# Manually label the selected samples
y_initial = manually_label_samples(X_initial_raw)
X_initial = X_initial_raw['concat_corpus'].tolist()

Unique_ID: IRAS_projectID_257107
Preprocessed Text: randomised clinical trial determine aesthetic outcome failure rate new compared current design resin retained bridge determine change current resin retained bridge design improve aesthetic outcome aesthetic score

Unique_ID: IRAS_projectID_234380
Preprocessed Text: multicenter randomized double blind parallel group placebo controlled study open label period evaluate efficacy safety fremanezumab prophylactic treatment migraine patient inadequate response prior preventive treatment primary objective study demonstrate efficacy fremanezumab administered monthly quarterly subcutaneous sc injection adult patient migraine inadequate response 2 4 class prior preventive treatment compared placebo efficacy endpoint mean change baseline 28 day run period monthly average number migraine day 12 week period 1st dose fremanezumab

Unique_ID: IRAS_projectID_129832
Preprocessed Text: randomised double blind placebo controlled phase iia study ass pharm

In [24]:
# Ensure X_train is always a list before extending
if 'X_train' not in locals() or not isinstance(X_train, list):
    X_train = list(X_initial)  # Convert to list explicitly
    y_train = np.array(y_initial)
else:
    X_train.extend(X_initial)
    y_train = np.hstack([y_train, y_initial])

# Remove labeled samples from unlabeled pool
X2_unlabeled_reset = X2_unlabeled.drop(index=most_uncertain_indices).reset_index(drop=True)

In [25]:
print(len(X_train), len(y_train), X2_unlabeled_reset.shape)

7 7 (687, 353)


Determine best classifiers using grid search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Define parameter grids for different classifiers
param_grids = {
    'ComplementNB': {
        'model': [ComplementNB()],
        'model__alpha': [0.1, 0.5, 1],
        'model__fit_prior': [True, False],
    },
    'SVM': {
        'model': [SVC(probability=True)],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf'],
    },
    'RandomForest': {
        'model': [RandomForestClassifier()],
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20],
    },
    'MLP': {
        'model': [MLPClassifier(max_iter=500)],
        'model__hidden_layer_sizes': [(5,), (10,), (5, 5)],
        'model__activation': ['relu', 'tanh'],
    }
}

# Define a preprocessing pipeline (e.g., scaling if needed)
pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),  # For sparse matrices, set `with_mean=False`
    ('model', ComplementNB())  # Placeholder for the classifier
])

# Reset the indices of X2_unlabeled and most_uncertain_indices
X2_unlabeled_reset = X2_unlabeled.reset_index(drop=True)
most_uncertain_indices_reset = most_uncertain_indices % len(X2_unlabeled_reset)

# Extract initial labeled data
X_initial = X2_unlabeled_reset.iloc[most_uncertain_indices_reset]
y_initial = y_initial

# Iterate over classifiers and parameter grids
best_models = {}
for name, param_grid in param_grids.items():
    print(f"Training {name}...")
    grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='recall_weighted', n_jobs=-1)
    grid.fit(X_initial, y_initial)
    best_models[name] = grid.best_estimator_
    print(f"Best {name} model: {grid.best_params_}")

# Choose the best classifier based on cross-validation scores
best_classifier_name = max(best_models, key=lambda x: best_models[x].score(X_initial, y_initial))
best_model = best_models[best_classifier_name]

print(f"Best overall classifier: {best_classifier_name}")

In [None]:
Train the classifier

In [26]:
pipeline = make_pipeline(
    TfidfVectorizer(analyzer='word', min_df=3, ngram_range=(1, 3)),
    ComplementNB(alpha=0.1, fit_prior=True)
)

In [27]:
pipeline.fit(X_train, y_train)

In [29]:
y_pred = pipeline.predict(X2_unlabeled_reset)

In [30]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score

recall = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='recall_weighted').mean()
print(f"Recall = {recall:.4f}")

Recall = 0.7778




In [31]:
y_pred_cv = cross_val_predict(pipeline, X_train, y_train, cv=3)
recall_per_class = recall_score(y_train, y_pred_cv, average=None)
print(f"Per-Class Recall: {recall_per_class}")

Per-Class Recall: [1.         0.         0.66666667]




In [32]:
confidences = pipeline.predict_proba(X2_unlabeled_reset).max(axis=1)
print(f"Average confidence: {np.mean(confidences):.2f}")

Average confidence: 0.36


Iteration 2

In [33]:
# Define the unlabeled dataset
X2_unlabeled = X2_unlabeled_reset.copy()

# Obtain probabilities and calculate uncertainty scores
probs = model.predict_proba(X2_unlabeled)
uncertainty_scores = 1 - probs.max(axis=1)  # Least confidence sampling

# Select the top N most uncertain samples
n_samples = 7
most_uncertain_indices = uncertainty_scores.argsort()[-n_samples:]

# Extract the corresponding Unique_IDs and preprocessed concatenated text
X_initial_raw = ns_hra.iloc[most_uncertain_indices]

def manually_label_samples(selected_rows):
    labels = []
    for idx, row in selected_rows.iterrows():
        print(f"Unique_ID: {row['Unique_ID']}")
        print(f"Preprocessed Text: {row['concat_corpus']}\n")
        label = input("Enter label for this sample (e.g., 0, 1, 2): ")
        labels.append(int(label))
    return labels

# Manually label the selected samples
y2_initial = manually_label_samples(X_initial_raw)
X2_initial = X_initial_raw['concat_corpus'].tolist()

Unique_ID: IRAS_projectID_263481
Preprocessed Text: randomised double blind parallel group placebo controlled phase 3 trial exenatide weekly 2 year potential disease modifying treatment parkinson disease current trial objective confirm whether previous positive result seen exenatide parkinson disease reproduced multi centre trial design including larger number patient evaluated twice long period achieved comparing effectiveness exenatide weekly versus placebo md updrs part 3 motor sub score AA practically defined medication stateAA AA patient mild moderate severity pd change md updrs part 3 score reflects accumulation motor deficit therefore measure pd motor progression hypothesis exenatide associated reduced md updrs part 3 score 96 week time point comparison md updrs part 3 motor sub score practically defined medication state 96 week participant according treatment allocation

Unique_ID: IRAS_projectID_150070
Preprocessed Text: prospective randomised pilot study videoconferencing adu

In [34]:
X2_initial = list(X2_initial)
y_train = np.array(y2_initial)

In [35]:
X_train = X_train + X2_initial
y_train = np.hstack([y_initial, y2_initial])
len(X_train), len(y_train)

(14, 14)

In [36]:
# Remove labelled samples from unlabelled pool
X2_unlabeled_reset = X2_unlabeled.drop(index=most_uncertain_indices).reset_index(drop=True)

In [37]:
pipeline.fit(X_train, y_train)

In [38]:
y_pred = pipeline.predict(X2_unlabeled_reset)

In [39]:
confidences = pipeline.predict_proba(X2_unlabeled_reset).max(axis=1)
print(f"Average confidence: {np.mean(confidences):.2f}")

Average confidence: 0.38


In [40]:
recall = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='recall_weighted').mean()
print(f"Recall = {recall:.4f}")

Recall = 0.5667


In [41]:
y_pred_cv = cross_val_predict(pipeline, X_train, y_train, cv=3)
recall_per_class = recall_score(y_train, y_pred_cv, average=None)
print(f"Per-Class Recall: {recall_per_class}")

Per-Class Recall: [0.4  0.75 0.6 ]


Iteration 3

In [42]:
# Define the unlabeled dataset
X2_unlabeled = X2_unlabeled_reset.copy()

# Obtain probabilities and calculate uncertainty scores
probs = model.predict_proba(X2_unlabeled)
uncertainty_scores = 1 - probs.max(axis=1)  # Least confidence sampling

# Select the top N most uncertain samples
n_samples = 7
most_uncertain_indices = uncertainty_scores.argsort()[-n_samples:]

# Extract the corresponding Unique_IDs and preprocessed concatenated text
X_initial_raw = ns_hra.iloc[most_uncertain_indices]

def manually_label_samples(selected_rows):
    labels = []
    for idx, row in selected_rows.iterrows():
        print(f"Unique_ID: {row['Unique_ID']}")
        print(f"Preprocessed Text: {row['concat_corpus']}\n")
        label = input("Enter label for this sample (e.g., 0, 1, 2): ")
        labels.append(int(label))
    return labels

# Manually label the selected samples
y3_initial = manually_label_samples(X_initial_raw)
X3_initial = X_initial_raw['concat_corpus'].tolist()

Unique_ID: IRAS_projectID_149334
Preprocessed Text: randomized double blind 104 week treatment study evaluate efficacy safety tolerability pharmacokinetics telbivudine oral solution tablet child adolescent compensated hbeag positive negative chronic hepatitis b virus infection primary objective study demonstrate antiviral efficacy study drug telbivudine compared placebo paediatric patient determining percentage patient achieving serum hepatitis b virus hbv dna level 300 copy ml 51 iu ml week 24 endpoint study demonstrate antiviral efficacy telbivudine compared placebo pediatric patient 2 18 year determining percentage patient achieving serum hbv dna level 300 copy ml week 24

Unique_ID: IRAS_projectID_161837
Preprocessed Text: phase 2b randomized double blind placebo controlled multi center study evaluating antiviral effect pharmacokinetics safety tolerability g 5806 hematopoietic cell transplant hct recipient respiratory syncytial virus rsv infection lower respiratory tract primary ob

In [43]:
X3_initial = list(X3_initial)
y_train = np.array(y3_initial)

In [44]:
X_train = X_train + X3_initial
y_train = np.hstack([y_initial, y2_initial, y3_initial])
len(X_train), len(y_train)

(21, 21)

In [45]:
X2_unlabeled_reset = X2_unlabeled.drop(index=most_uncertain_indices).reset_index(drop=True)

In [46]:
pipeline.fit(X_train, y_train)

In [47]:
confidences = pipeline.predict_proba(X2_unlabeled_reset).max(axis=1)
print(f"Average confidence: {np.mean(confidences):.2f}")

Average confidence: 0.40


In [48]:
recall = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='recall_weighted').mean()
print(f"Recall = {recall:.4f}")

Recall = 0.4762


In [49]:
y_pred_cv = cross_val_predict(pipeline, X_train, y_train, cv=3)
recall_per_class = recall_score(y_train, y_pred_cv, average=None)
print(f"Per-Class Recall: {recall_per_class}")

Per-Class Recall: [0.42857143 0.5        0.5       ]


Iteration 4

In [50]:
# Define the unlabeled dataset
X2_unlabeled = X2_unlabeled_reset.copy()

# Obtain probabilities and calculate uncertainty scores
probs = model.predict_proba(X2_unlabeled)
uncertainty_scores = 1 - probs.max(axis=1)  # Least confidence sampling

# Select the top N most uncertain samples
n_samples = 7
most_uncertain_indices = uncertainty_scores.argsort()[-n_samples:]

# Extract the corresponding Unique_IDs and preprocessed concatenated text
X_initial_raw = ns_hra.iloc[most_uncertain_indices]

def manually_label_samples(selected_rows):
    labels = []
    for idx, row in selected_rows.iterrows():
        print(f"Unique_ID: {row['Unique_ID']}")
        print(f"Preprocessed Text: {row['concat_corpus']}\n")
        label = input("Enter label for this sample (e.g., 0, 1, 2): ")
        labels.append(int(label))
    return labels

# Manually label the selected samples
y4_initial = manually_label_samples(X_initial_raw)
X4_initial = X_initial_raw['concat_corpus'].tolist()

Unique_ID: IRAS_projectID_108629
Preprocessed Text: phase 3 randomized placebo controlled double blind study combined lysis thrombus ultrasound systemic tissue plasminogen activator tpa emergent revascularization clotbust er acute ischemic stroke primary objective study ass efficacy combined treatment ultrasound using sonolysis headframe standard medical treatment clot busting drug tissue plasminogen activator tpa compared tpa alone subject acute ischaemic stroke efficacy assessment objective study AA ass efficacy combined treatment transcranial u using sonolysis headframe systemic tpa target group compared systemic tpa alone control group subject acute ischemic stroke efficacy end point study AA proportion subject treatment v control group mr 0 1 score 90 day safety endpoint study ass difference group respect incidence symptomatic ich well overall analysis safety

Unique_ID: IRAS_projectID_218563
Preprocessed Text: nasal cavity cooling symptomatic relief migraine headache a randomized

In [51]:
X4_initial = list(X4_initial)
y_train = np.array(y4_initial)

In [52]:
X_train = X_train + X4_initial
y_train = np.hstack([y_initial, y2_initial, y3_initial, y4_initial])
len(X_train), len(y_train)

(28, 28)

In [53]:
X2_unlabeled_reset = X2_unlabeled.drop(index=most_uncertain_indices).reset_index(drop=True)

In [54]:
pipeline.fit(X_train, y_train)

In [55]:
confidences = pipeline.predict_proba(X2_unlabeled_reset).max(axis=1)
print(f"Average confidence: {np.mean(confidences):.2f}")

Average confidence: 0.41


In [56]:
recall = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='recall_weighted').mean()
print(f"Recall = {recall:.4f}")

Recall = 0.4556


In [57]:
y_pred_cv = cross_val_predict(pipeline, X_train, y_train, cv=3)
recall_per_class = recall_score(y_train, y_pred_cv, average=None)
print(f"Per-Class Recall: {recall_per_class}")

Per-Class Recall: [0.5        0.42857143 0.45454545]
