In [70]:
import numpy as np
import pandas as pd

In [71]:
# Load the samples table from a CSV located relative to the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which usually
# means the CSV was written together with its index -- confirm whether that
# column is needed (index_col=0 would absorb it).
df = pd.read_csv("synthetic_names_samples.csv")

In [72]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,index,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"NAME : Charlie Harlow SUBJECTIVE:, This 23-y...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"NAME : Helen Bradford PAST MEDICAL HISTORY:, ...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,NAME : Walter Welter HISTORY OF PRESENT ILLNE...,"bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"NAME : Doris Emmert 2-D M-MODE: , ,1. Left a...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,NAME : Rochelle Hurt 1. The left ventricular...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Keyword arguments for the TF-IDF vectorizer.
tfidf_configs = dict(
    lowercase=True,          # fold case before tokenizing
    analyzer='word',         # features are word n-grams, not characters
    stop_words='english',    # drop the built-in English stop-word list
    binary=False,            # keep term counts instead of 0/1 presence flags
    max_df=0.9,              # ignore terms found in more than 90% of documents
    max_features=10_000,     # cap the vocabulary at the most frequent terms
    ngram_range=(1, 2),      # unigrams and bigrams
)

# Keyword arguments for the nearest-neighbour retriever: how many documents
# to return per query and which distance to rank them by.
retriever_configs = dict(n_neighbors=10, metric='cosine')

# Instantiate the two retrieval stages from their config dicts. Neither is
# fitted here: `embedding` is the TF-IDF vectorizer built from tfidf_configs,
# and `retriever` is the nearest-neighbour index built from retriever_configs.
embedding = TfidfVectorizer(**tfidf_configs)
retriever = NearestNeighbors(**retriever_configs)

In [74]:
# Keep only the two columns the retrieval steps use: the text to vectorize
# and the 'index' column.
documents = df.loc[:, ['transcription', 'index']]

In [75]:
# Learn the TF-IDF vocabulary from the transcriptions and vectorize them;
# row i of tfidf_matrix corresponds to row i of `documents`.
tfidf_matrix = embedding.fit_transform(documents['transcription'])

# NearestNeighbors is unsupervised: fit() ignores any `y` argument, so the
# 'index' column is deliberately not passed. The neighbours returned by
# kneighbors() are therefore row positions in tfidf_matrix, NOT values of
# documents['index']; map them back with documents.iloc[...] if the two can
# ever differ.
retriever.fit(tfidf_matrix)

In [76]:
def transform_text(vectorizer, text):
    '''
    Show a text next to the terms the vectorizer recognizes in it.

    vectorizer: fitted object exposing transform() and inverse_transform()
                (e.g. a fitted sklearn vectorizer)
    text: str, the raw text to inspect

    Nothing is returned; two lines ("Text: ..." and "Vect: ...") are
    written to stdout.
    '''
    print('Text:', text)
    # Vectorize the text as a one-document batch, then map the result back
    # to terms so that the printed value is human-readable.
    recovered_terms = vectorizer.inverse_transform(vectorizer.transform([text]))
    print('Vect:', recovered_terms)

In [77]:
# Sanity check: print the sample question together with the terms the fitted
# vectorizer recognizes in it.
question = "What happened to the Michael Dunstan who takes hydrochlorothiazide for hypertension?"
transform_text(vectorizer=embedding, text=question)

Text: What happened to the Michael Dunstan who takes hydrochlorothiazide for hypertension?
Vect: [array(['takes', 'michael', 'hypertension', 'hydrochlorothiazide',
       'happened'], dtype='<U33')]


In [78]:
# Vectorize the question with the already-fitted TF-IDF model (as a
# one-document batch).
nn_ques = embedding.transform([question])

# Look up the nearest transcriptions. Distances are not requested, so only
# the array of neighbour row positions is returned (one row per query).
index = retriever.kneighbors(X=nn_ques, return_distance=False)

In [79]:
# Display the neighbour positions retrieved for the question.
index

array([[  13, 1947, 3392, 4444, 4381, 1366, 3285, 2437, 2375, 3396]],
      dtype=int64)