# Bethesda
Pipeline for extracting the Bethesda score in PALGA through predictive modeling using the data obtained from the fuzzy string matching pipeline.

In [None]:
import matplotlib.pyplot as plt

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Custom code
from matcher import Prediction

## Load data
Load the data generated in the previous notebook.

In [None]:
# Open the results workbook produced by the previous notebook and unpack the
# eight values returned by split_data(). Judging by the names these are the
# full feature/label set plus train, test and eval partitions — confirm in
# matcher.Prediction.
predictor = Prediction("results.xlsx")
(
    X,
    y,
    X_train,
    y_train,
    X_test,
    y_test,
    X_eval,
    y_eval,
) = predictor.split_data()

## Modeling
Run any number of predictive models and, for each one, extract its 5-fold cross-validation scores (micro- and macro-averaged F1, to account for class imbalance), a classification report, and a confusion matrix for further exploration.

In [None]:
# Candidate classifiers. Every model is seeded explicitly so that repeated
# runs of this cell stay comparable across all three estimators.
clfs = [
    # dual=True is only implemented for the liblinear solver with the
    # (default) L2 penalty, so these two arguments must stay together.
    LogisticRegression(random_state=0, max_iter=500, dual=True, solver="liblinear"),
    RandomForestClassifier(random_state=0),
    xgb.XGBClassifier(tree_method="hist", random_state=0),
]

# Wrap each classifier in the project's pipeline, preserving the order of
# `clfs` (index 0 = logistic regression, 1 = random forest, 2 = XGBoost);
# later cells pick models by index.
pipelines = [predictor.create_pipeline(clf) for clf in clfs]

# evaluate() receives the train and test partitions and returns three results:
# cross-validation scores, classification reports and confusion-matrix
# displays — presumably one entry per pipeline in the same order; confirm in
# matcher.Prediction.
cv_scores, reports, displays = predictor.evaluate(
    pipelines, X_train, y_train, X_test, y_test
)

**Display Confusion Matrix**

In [None]:
# Draw the first display (index 0, i.e. the logistic-regression pipeline)
# and write the active pyplot figure to disk at 300 dpi.
lr_display = displays[0]
lr_display.plot()
plt.savefig("lr.png", dpi=300)

## Create Predictions
Add the predictions to the previous results by creating a new column with the predictions and their probabilities, then save the combined results to `results_plus_predictions.xlsx`.

In [None]:
# Generate predictions with the first pipeline (logistic regression) and
# export the returned table to an Excel workbook.
# NOTE(review): the full X / y are passed here; add_predictions presumably
# restricts prediction to the unlabeled rows — confirm in matcher.Prediction.
lr_pipeline = pipelines[0]
results = predictor.add_predictions(lr_pipeline, X, y)
results.to_excel("results_plus_predictions.xlsx")