In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read data from Excel file into a DataFrame, skipping the empty row and starting from the third row
file_path = 'data/loinc_dataset-v2.xlsx'
df = pd.read_excel(file_path, skiprows=2)
df = df.dropna()

# Extract features for the query row
query_row = df.iloc[0]  # Assuming the first row is the query
query_features = query_row[['loinc_num', 'long_common_name', 'component', 'system']].astype(str)


# Combine the text columns into a single string for vectorization
query_text = ' '.join(query_features.values)

# Vectorize the text using TF-IDF for the query
vectorizer = TfidfVectorizer()
query_vectorized = vectorizer.fit_transform([query_text])

# Initialize a list to store similarity scores
similarity_scores = []

# Iterate through each data row until the next empty row and calculate similarity
for index, row in df.iterrows():
    # Check for an empty row
    if row.isnull().all():
        break
    
    data_features = row[['loinc_num', 'long_common_name', 'component', 'system']].astype(str)
    data_text = ' '.join(data_features.values)
    
    # Vectorize the text using TF-IDF for the data row
    data_vectorized = vectorizer.transform([data_text])
    
    # Calculate cosine similarity
    similarity_score = cosine_similarity(query_vectorized, data_vectorized)[0][0]
    
    # Append similarity score to the list
    similarity_scores.append(similarity_score)

# Add similarity scores to the DataFrame
df['Similarity'] = similarity_scores

# Sort DataFrame by similarity in descending order
ranked_df = df.sort_values(by='Similarity', ascending=False)

# Display the ranked DataFrame
print(ranked_df)

   loinc_num                                   long_common_name  \
0     1988-5  C reactive protein [Mass/volume] in Serum or P...   
43   30522-7  C reactive protein [Mass/volume] in Serum or P...   
10    4671-4                  Protein C [Mass/volume] in Plasma   
8     2143-6          Cortisol [Mass/volume] in Serum or Plasma   
21    1968-7  Bilirubin.direct [Mass/volume] in Serum or Plasma   
..       ...                                                ...   
11   18864-9                        Ampicillin [Susceptibility]   
24    8310-5                                   Body temperature   
17     925-8                   Blood product disposition [Type]   
48   18955-5                    Nitrofurantoin [Susceptibility]   
66   23658-8                  Other Antibiotic [Susceptibility]   

                                           component    system property  \
0                                 C reactive protein  Ser/Plas     MCnc   
43                                C reactive 

In [20]:
print(df['Similarity'])

0     1.000000
1     0.342997
2     0.242536
3     0.000000
4     0.685994
        ...   
62    0.242536
63    0.000000
64    0.000000
65    0.641689
66    0.000000
Name: Similarity, Length: 67, dtype: float64


# Esto no esta bien, es aplicando lo de la regresion pero vaya qye no esta.

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Read data from Excel file into a DataFrame, skipping the empty row and starting from the third row
file_path = 'loinc_dataset-v2.xlsx'
df = pd.read_excel(file_path, skiprows=2)
df = df.dropna()

# Extract features for the query row
query_row = df.iloc[0]  # Assuming the first row is the query
query_features = query_row[['loinc_num', 'long_common_name', 'component', 'system']].astype(str)

# Combine the text columns into a single string for vectorization
query_text = ' '.join(query_features.values)

# Vectorize the text using TF-IDF for the query
vectorizer = TfidfVectorizer()
query_vectorized = vectorizer.fit_transform([query_text])

# Initialize a list to store logistic regression scores
logistic_scores = []

# Initialize logistic regression model
logistic_model = LogisticRegression()

# Iterate through each data row until the next empty row and apply logistic regression
for index, row in df.iterrows():
    # Check for an empty row
    if row.isnull().all():
        break
    
    data_features = row[['loinc_num', 'long_common_name', 'component', 'system']].astype(str)
    data_text = ' '.join(data_features.values)
    
    # Vectorize the text using TF-IDF for the data row
    data_vectorized = vectorizer.transform([data_text])
    
    # Use logistic regression to predict rank (binary classification)
    logistic_score = logistic_model.fit(query_vectorized, [1]*len(data_vectorized)).predict_proba(data_vectorized)[:, 1]
    
    # Append logistic score to the list
    logistic_scores.append(logistic_score[0])

# Add logistic scores to the DataFrame
df['Logistic_Score'] = logistic_scores

# Sort DataFrame by logistic score in descending order
r


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1