In [276]:
# Install the required packages
!pip install scikit-learn matplotlib pandas openpyxl



# Set labels for each document

In [277]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [278]:
def get_similarity(query: str, document: list[str]) -> float:
    """Return the best TF-IDF cosine similarity between a query and a set of texts.

    One TF-IDF vectorizer is fitted on ``document`` and reused to project both
    the texts and the query, so both live in the same vocabulary space.

    Args:
        query: Free-text query to compare against the texts.
        document: Collection of text fragments; each fragment is treated as
            one TF-IDF document.

    Returns:
        The largest cosine similarity between the query and any single
        fragment of ``document``, as a plain Python float.
    """
    # Fit once: a second vectorizer fitted on the same texts would learn the
    # exact same vocabulary and IDF weights, so it is pure duplicated work.
    vectorizer = TfidfVectorizer()
    document_tf_idf = vectorizer.fit_transform(document)
    query_tf_idf = vectorizer.transform([query])

    # Result has shape (1, n_fragments); keep only the best match.
    cosine_similarities = cosine_similarity(query_tf_idf, document_tf_idf)
    return float(cosine_similarities.max())

In [279]:
def encode_sentence(sentence: str, encoder_model) -> float:
    """Collapse a sentence into a single number: the mean of its encoded vector.

    Args:
        sentence: Text to encode.
        encoder_model: Fitted encoder exposing ``transform(list_of_texts)``
            whose result supports ``.toarray()`` (e.g. a fitted
            ``TfidfVectorizer``).

    Returns:
        The arithmetic mean over all entries of the encoded vector, as a
        plain Python float.
    """
    tfidf_array = encoder_model.transform([sentence]).toarray()
    # Vectorised mean instead of the builtin sum()/len() over a NumPy array.
    return float(tfidf_array.mean())

In [280]:
# Read every worksheet of the workbook. The worksheet names are used as the
# queries and as the keys under which each loaded table is stored.
sheets, queries = {}, []
with pd.ExcelFile('data/loinc_dataset-v2_extended.xlsx') as xls:
    queries += xls.sheet_names
    sheets.update(
        (sheet_name, pd.read_excel(xls, sheet_name, skiprows=2, header=0, index_col=0))
        for sheet_name in queries
    )

# Preview the first table.
sheets[queries[0]].head(10)

Unnamed: 0_level_0,long_common_name,component,system,property
loinc_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein,Ser/Plas,MCnc
1959-6,Bicarbonate [Moles/volume] in Blood,Bicarbonate,Bld,SCnc
10331-7,Rh [Type] in Blood,Rh,Bld,Type
18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility],Trimethoprim+Sulfamethoxazole,Isolate,Susc
1975-2,Bilirubin.total [Mass/volume] in Serum or Plasma,Bilirubin,Ser/Plas,MCnc
890-4,Blood group antibody screen [Presence] in Seru...,Blood group antibody screen,Ser/Plas,ACnc
20565-8,"Carbon dioxide, total [Moles/volume] in Blood",Carbon dioxide,Bld,SCnc
18906-8,Ciprofloxacin [Susceptibility],Ciprofloxacin,Isolate,Susc
2143-6,Cortisol [Mass/volume] in Serum or Plasma,Cortisol,Ser/Plas,MCnc
2075-0,Chloride [Moles/volume] in Serum or Plasma,Chloride,Ser/Plas,SCnc


In [281]:
# Compute the similarity for each query: every row is scored by the best
# TF-IDF cosine match between the query and any of the row's text fields.
for query in queries:
    sheet = sheets[query]
    # Leave out a score column produced by an earlier run of this cell, so the
    # cell can be re-executed without feeding floats into the text vectorizer.
    text_fields = sheet.drop(columns=['similarity'], errors='ignore')
    # Scores are built positionally (one per row, in row order), which also
    # stays correct when index labels are not unique.
    sheet['similarity'] = [
        get_similarity(query, row_values) for row_values in text_fields.values
    ]

In [282]:
# Write each scored table to its own worksheet of a new workbook, leaving two
# blank rows above the header of every sheet.
output_path = 'data/loinc_dataset-v2_similarity.xlsx'
with pd.ExcelWriter(output_path) as writer:
    for sheet_name in queries:
        scored = sheets[sheet_name]
        scored.to_excel(
            writer,
            sheet_name=sheet_name,
            startrow=2,
            startcol=0,
            index=True,
        )

# Train a model to predict the similarity

In [283]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import root_mean_squared_error

In [284]:
# Fixed random seed, passed as `random_state` to the train/test split so the
# split is the same on every run.
seed = 42

In [285]:
def _encode_features(frame, encoders):
    """Encode every text column of ``frame`` as one numeric feature per row.

    Each cell is passed through ``encode_sentence`` together with the encoder
    stored under its column name, so the train and test splits are encoded by
    exactly the same code path.

    Args:
        frame: DataFrame whose columns all hold text.
        encoders: Mapping of column name to the fitted encoder for that column.

    Returns:
        A DataFrame with the same index and columns as ``frame`` holding the
        encoded values.
    """
    return pd.DataFrame({
        column: frame[column].apply(encode_sentence, args=(encoders[column],))
        for column in frame.columns
    })


# One regression model per query, keyed by the query text.
models = {}
# The fitted per-column encoders for each query, keyed like `models`. A model
# can only score new rows when they are encoded with these same encoders.
encoders_by_query = {}
for query in queries:
    X = sheets[query].drop(columns=['similarity'])
    y = sheets[query]['similarity']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

    # Fit one TF-IDF encoder per text column, on the training rows only
    encoders = {
        column: TfidfVectorizer().fit(X_train[column])
        for column in X_train.columns
    }
    X_encoded = _encode_features(X_train, encoders)

    # Train the model
    model = BayesianRidge()
    model.fit(X_encoded, y_train)

    # Encode the held-out rows with the encoders fitted on the training rows
    X_test_encoded = _encode_features(X_test, encoders)

    # Predict the similarity
    y_pred = model.predict(X_test_encoded)

    # Evaluate the model
    error = root_mean_squared_error(y_test, y_pred)

    print(pd.DataFrame({'y_test': y_test, 'y_pred': y_pred}).head(10))

    models[query] = model
    encoders_by_query[query] = encoders
    print(f'Model for query = \"{query}\" obtains an error: {error:.2f}')

    print('-' * 75)
    

             y_test    y_pred
loinc_num                    
49926-9    0.470059  0.548572
49925-1    0.451421  0.548185
15076-3    0.619130  0.745093
14423-8    0.388614  0.538383
74774-1    0.582846  0.391266
23658-8    0.000000  0.527262
18928-2    0.000000  0.527262
59813-6    0.543812  0.543263
94231-8    0.468602  0.476200
27353-2    0.505900  0.545562
Model for query = "glucose in blood" obtains an error: 0.23
---------------------------------------------------------------------------
             y_test    y_pred
loinc_num                    
1003-3     0.478651  0.529908
14578-9    0.318388  0.478727
6768-6     0.492569  0.525832
1970-3     0.586489  0.505121
934-0      0.000000 -0.166990
883-9      0.508672  0.305203
26478-8    0.508672  0.281229
1988-5     0.525473  0.503351
8310-5     0.000000 -0.258991
1751-7     0.549584  0.508245
Model for query = "bilirubin in plasma" obtains an error: 0.15
---------------------------------------------------------------------------
     