In [77]:
# Install the required packages
!pip install scikit-learn matplotlib pandas openpyxl



# Set labels for each document

In [ ]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [79]:
def get_similarity(query: str, document: list[str]) -> float:
    """Return the highest TF-IDF cosine similarity between a query and a document.

    One TF-IDF vectorizer is fitted on the strings in ``document``; the query
    is then transformed with that same fitted vectorizer so both sides share
    one vocabulary and one set of IDF weights.

    Args:
        query: Text to compare against the document.
        document: Strings that make up the document; each one is vectorized
            as its own entry.

    Returns:
        The largest cosine similarity between ``query`` and any entry of
        ``document``, as a plain Python float.
    """
    # Fit once and reuse: fitting a second vectorizer on the same corpus only
    # repeats the work and yields the same vocabulary.
    vectorizer = TfidfVectorizer()
    document_tf_idf = vectorizer.fit_transform(document)
    query_tf_idf = vectorizer.transform([query])

    cosine_similarities = cosine_similarity(query_tf_idf, document_tf_idf).flatten()
    # Use the array's own reduction and convert to match the annotated return type.
    return float(cosine_similarities.max())

In [80]:
# Read every worksheet of the workbook; the sheet names are kept as the queries
# and also serve as the keys of `sheets`.
with pd.ExcelFile('data/loinc_dataset-v2.xlsx') as xls:
    queries = list(xls.sheet_names)
    sheets = {}
    for query in queries:
        # Skip the two leading rows, take the next row as the header and the
        # first column as the index.
        sheets[query] = pd.read_excel(
            xls, sheet_name=query, skiprows=2, header=0, index_col=0
        )

# Preview the first sheet
sheets[queries[0]].head(10)

Unnamed: 0_level_0,long_common_name,component,system,property
loinc_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein,Ser/Plas,MCnc
1959-6,Bicarbonate [Moles/volume] in Blood,Bicarbonate,Bld,SCnc
10331-7,Rh [Type] in Blood,Rh,Bld,Type
18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility],Trimethoprim+Sulfamethoxazole,Isolate,Susc
1975-2,Bilirubin.total [Mass/volume] in Serum or Plasma,Bilirubin,Ser/Plas,MCnc
890-4,Blood group antibody screen [Presence] in Seru...,Blood group antibody screen,Ser/Plas,ACnc
20565-8,"Carbon dioxide, total [Moles/volume] in Blood",Carbon dioxide,Bld,SCnc
18906-8,Ciprofloxacin [Susceptibility],Ciprofloxacin,Isolate,Susc
2143-6,Cortisol [Mass/volume] in Serum or Plasma,Cortisol,Ser/Plas,MCnc
2075-0,Chloride [Moles/volume] in Serum or Plasma,Chloride,Ser/Plas,SCnc


In [81]:
# Compute the similarity for each query
for query in queries:
    sheet = sheets[query]
    # Exclude a 'similarity' column left by a previous run of this cell, so that
    # re-running it does not pass the old float scores to the vectorizer as text.
    text_columns = sheet.columns.drop('similarity', errors='ignore')
    # Score each row's fields against the query and build the column in one
    # pass instead of enlarging a Series one label at a time.
    scores = [get_similarity(query, row) for row in sheet[text_columns].to_numpy()]
    sheet['similarity'] = pd.Series(scores, index=sheet.index, dtype=float)

In [82]:
# Write each sheet to a new workbook, one worksheet per query
with pd.ExcelWriter('data/loinc_dataset-v2_similarity.xlsx') as writer:
    for query in queries:
        sheets[query].to_excel(
            writer,
            sheet_name=query,
            startrow=2,  # data starts on the third row, leaving two rows above it
            startcol=0,
            index=True,
        )

# Train a model to predict the similarity

In [85]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [86]:
# NOTE(review): presumably the random seed for the modelling steps; where it is
# consumed is not visible in this part of the notebook — confirm.
seed = 42

In [ ]:
# Encode the document to convert it to a numerical representation
def encode_document(document: list[str]) -> list[int]:
    """Map each string in ``document`` to an integer label.

    A new ``LabelEncoder`` is fitted on every call, so the integer assigned
    to a given string depends only on the strings passed in that call.

    NOTE(review): the value returned is whatever ``LabelEncoder.fit_transform``
    produces (an array of codes per the scikit-learn docs) rather than a
    built-in ``list`` — confirm whether callers rely on that.

    Args:
        document: Strings to encode.

    Returns:
        One integer code per entry of ``document``, in the same order.
    """
    return LabelEncoder().fit_transform(document)