# Bethesda
Pipeline for extracting the Bethesda score in PALGA through fuzzy string matching

In [None]:
import pandas as pd

# Fuzzy string matching
from rapidfuzz import fuzz
from polyfuzz.models import RapidFuzz

# Custom code
from matcher import MatchEntity, EntityCollection
from matcher import preprocess

#### **Prepare data**
Here, we prepare the data such that the main dataframe has a `Conclusie` column from 
which we can try to extract the Bethesda score. 

In [None]:
# Read the source spreadsheet, then hand the frame to the project's preprocessing step.
input_path = "MY_DATA.xlsx"
df = pd.read_excel(input_path)
docs = preprocess.preprocess_docs(df)

#### **Match**
Below, we define several matching procedures for finding close matches to the Bethesda score. 
In principle, the algorithm tries to find the words closest to a `search_term`. The closest words 
are expected to be the words that contain some variation of the Bethesda score. 

In [None]:
# Define matching procedure: one fuzzy matcher instance is shared by every search term.
main_matching_alg = RapidFuzz(n_jobs=1, scorer=fuzz.QRatio)

# (search_term, ngram) pairs, listed in the order they are passed to EntityCollection.
search_specs = [
    ("bethesda klasse 1", 3),
    ("klasse", 2),
    ("bethesda II", 2),
    ("bethesda classificatie II", 3),
    ("bethesda categorie II", 3),
    # New, used for 'landelijk'
    ("bethesda schildklier categorie 1", 4),
    ("bethesdacategorie 1", 2),
    ("bethesdaklasse II", 2),
    ("bethesda classificatie cat 2", 4),
    # NOTE(review): this term has three words but ngram=4, whereas every other
    # multi-word term uses an ngram equal to its word count -- confirm this is intended.
    ("bethesda cat 2", 4),
    ("bethesda classificatie categorie II", 4),
]

# Build one MatchEntity per pair, all backed by the same matcher.
to_match = [
    MatchEntity(search_term=term, ngram=size, matcher=main_matching_alg)
    for term, size in search_specs
]

entity_collection = EntityCollection(to_match)
matches = entity_collection.match(docs)

#### **Extract Bethesda**

After extracting the closest words to a `search_term`, we extract the Bethesda score from these words. 

In [None]:
# Extract the Bethesda scores from the matches, attach them as a new column,
# and write the resulting frame to an Excel file.
bethesda_scores = preprocess.extract_bethesda_score(matches)
df["Bethesda"] = bethesda_scores
df.to_excel("results.xlsx")