In [7]:
import time

In [2]:
# install sklearn
! pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl (23.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.5.2 scipy-1.14.1 threadpoolctl-3.5.0


In [9]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def setup_tfidf_matcher(keyword_list):
    vectorizer = TfidfVectorizer()
    keyword_vectors = vectorizer.fit_transform(keyword_list)
    return vectorizer, keyword_vectors

def map_sentence_to_keywords_tfidf(vectorizer, keyword_vectors, keyword_list, sentence, top_n=3):
    sentence_vector = vectorizer.transform([sentence])
    similarities = cosine_similarity(sentence_vector, keyword_vectors).flatten()
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    return [keyword_list[i] for i in top_indices]


# Cosine similarity
keyword_list = ['weakened immune system', 'vision problems', 'neurological problems', 'muscle weakness',
                'weakness and muscle damage', 'anemia', 'impaired immune response',
                'oxidative degradation', 'nerve damage', 'poor vision', 'dry skin', 'increased susceptibility to infections',
                'xerophthalmia', 'clinical ophthalmic signs of vitamin A deficiency', 'severe VAD', 
                'depression', 'nervosness', 'irritability', 'sideroblastic microcytic anemia', 'seizures',
                'peripheral neuropathy', 'dermatologic lesions', 'megaloblastic anemia', 'neropathy', 'memory loss',
                'nausea and vomiting', 'migraine', 'insulin resistance', 'hypocalcemia', 'thrombosis',
                'cardiovascular complications', 'impaired T cell activation and increased risk of hematologic malignancies',
                'cognitive decline', 'abnormal health rhythms', 'mild anxiety', 'sleep disorders', 'fatigue and weakness',
                'hypernatremia', 'confusion', 'numbness or tingling in extremities', 'sporadic hair loss', 'loss of muscle mass']

sentence = """I've been feeling a bit off lately. I'm experiencing a lot of muscle weakness, especially in my legs. 
Sometimes, I even have trouble walking without stumbling. My reflexes seem slower than usual, 
and I've noticed some numbness and tingling in my hands and feet. 
I've also been feeling a bit lightheaded, and my vision seems a little blurry. I'm worried about what might be causing these issues."""

start_time = time.time()
for i in range(10000):
    vectorizer, keyword_vectors = setup_tfidf_matcher(keyword_list)  
    mapped_keywords = map_sentence_to_keywords_tfidf(vectorizer, keyword_vectors, keyword_list, sentence)
end_time = time.time()
print("Time taken:", end_time - start_time)
print("TF-IDF Mapped Keywords:", mapped_keywords)



Time taken: 5.026917934417725
TF-IDF Mapped Keywords: ['weakness and muscle damage', 'fatigue and weakness', 'numbness or tingling in extremities']


In [5]:
# install fuzzywuzzy
! pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [10]:
# Fuzzy matching
from fuzzywuzzy import process

def map_sentence_to_keywords_fuzzy(keyword_list, sentence, top_n=3):
    matches = process.extract(sentence, keyword_list, limit=top_n)
    return [match[0] for match in matches]


keyword_list = ['weakened immune system', 'vision problems', 'neurological problems', 'muscle weakness',
                'weakness and muscle damage', 'anemia', 'impaired immune response',
                'oxidative degradation', 'nerve damage', 'poor vision', 'dry skin', 'increased susceptibility to infections',
                'xerophthalmia', 'clinical ophthalmic signs of vitamin A deficiency', 'severe VAD', 
                'depression', 'nervosness', 'irritability', 'sideroblastic microcytic anemia', 'seizures',
                'peripheral neuropathy', 'dermatologic lesions', 'megaloblastic anemia', 'neropathy', 'memory loss',
                'nausea and vomiting', 'migraine', 'insulin resistance', 'hypocalcemia', 'thrombosis',
                'cardiovascular complications', 'impaired T cell activation and increased risk of hematologic malignancies',
                'cognitive decline', 'abnormal health rhythms', 'mild anxiety', 'sleep disorders', 'fatigue and weakness',
                'hypernatremia', 'confusion', 'numbness or tingling in extremities', 'sporadic hair loss', 'loss of muscle mass']

#keyword_list = ['Bruising', 'Hemolytic anemia', 'Iron-deficiency Anemia', 'Spina bifida', 'Ariboflavinosis', 'Tremors', 'Skin irritation', 'Cardiovascular Disease', 'Heart Disease', 'Hypomagnesemia', 'Difficulty concentrating', 'Liver damage', 'Bleeding gums', 'Inflammatory disorders', 'Parkinson s Disease', 'Preeclampsia', 'Swollen gums', 'Tissue fragility', 'Abdominal Pain', 'Oxidative stress', 'Fatigue', 'Depressed Mood', 'Chest pain', 'Magnesium Deficiency', 'Joint pain', 'Increased oxidative stress', 'Cognitive Impairment', 'Thrombus formation', 'Coronary heart disease', 'Reduced bone mass', 'Pale skin', 'Deficiency can lead to neurological disorders', 'Zinc Deficiency', 'Clostridium difficile infection', 'Hair Loss', 'Abdominal Cramps', 'Antibiotic-associated diarrhea', 'Migraine', 'Alzheimer s disease', 'Cell proliferation', 'Loss of height', 'Osteoporosis', 'Airway inflammation', 'Poor bioavailability of tocotrienol', 'Dysmenorrhea', 'Menstrual Pain', 'Sensitivity to light', 'Neurological disorders', 'Asthma', 'Blindness', 'Neurodevelopmental Disorders', 'Anemia', 'Cognitive Decline', 'Delayed wound healing', 'Headache', 'Difficulty seeing in low light', 'High blood pressure', 'Beriberi', 'Bleeding Gums', 'Premenstrual Syndrome (PMS)', 'Weight gain', 'Tiredness', 'Muscle cramps', 'Bradykinesia', 'Protein in urine', 'Proinflammatory cytokines production', 'Behavioral Issues', 'Dry eyes', 'Irritability', 'Insulin resistance', 'Traumatic brain injury', 'Not available', 'Major Depression', 'Cardiac arrhythmias', 'Increased risk of atherosclerosis', 'Muscle Weakness', 'Depression', 'Vitamin A Deficiency', 'Dry Skin', 'Neural tube defects', 'Scurvy', 'Cognitive decline', 'Vision problems', 'Insulin Resistance', 'N/A', 'Increased fragility', 'Mood swings', 'Weakened immune system', 'High blood sugar levels', 'Anencephaly', 'Muscle weakness', 'Diarrhea', 'Capillary fragility', 'Xerophthalmia', 'Anxiety and Depression', 'Shortness of breath', 'Atheromas formation', 'Heart-related symptoms', 'Bone pain', 'Decreased bone density', 'Numbness', 'Vision Problems', 'Inflammatory Diseases', 'Hypomagnesaemia', 'Bloating', 'Atherosclerosis', 'Metabolic Syndrome', 'Back pain', 'Alzheimer s Disease', 'Night blindness', 'Parkinson s disease', 'Essential Fatty Acid Deficiency', 'Hair loss', 'Asymptomatic', 'Anxiety', 'Swelling', 'Swollen mucous membranes', 'Disease caused by deficiencies of the nutrient', 'Osteoarthritis', 'Memory loss', 'Vitamin E Deficiency', 'Poor healing', 'Clinical features', 'Weakness', 'Nausea', 'Not specified', 'Sore throat', 'Bone fractures', 'Irritable bowel syndrome', 'Allergic disorders', 'Nerve damage']


sentence = """I've been feeling a bit off lately. I'm experiencing a lot of muscle weakness, especially in my legs. 
Sometimes, I even have trouble walking without stumbling. My reflexes seem slower than usual, 
and I've noticed some numbness and tingling in my hands and feet. 
I've also been feeling a bit lightheaded, and my vision seems a little blurry. I'm worried about what might be causing these issues."""

start_time = time.time()
for i in range(10000):
    mapped_keywords = map_sentence_to_keywords_fuzzy(keyword_list, sentence)
end_time = time.time()
print("Time taken:", end_time - start_time)
print("Fuzzy Mapped Keywords:", mapped_keywords)

Time taken: 134.38339829444885
Fuzzy Mapped Keywords: ['impaired T cell activation and increased risk of hematologic malignancies', 'muscle weakness', 'vision problems']


In [43]:
# install sentence_transformers
! pip uninstall peft -y

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: peft 0.14.0
Uninstalling peft-0.14.0:
  Successfully uninstalled peft-0.14.0


In [44]:
! pip install peft==0.3.0
! pip install sentence-transformers==3.1.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting peft==0.3.0
  Using cached peft-0.3.0-py3-none-any.whl.metadata (21 kB)
Using cached peft-0.3.0-py3-none-any.whl (56 kB)
Installing collected packages: peft
Successfully installed peft-0.3.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [36]:
# install annoy
! pip uninstall annoy -y
! pip install annoy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: annoy 1.17.3
Uninstalling annoy-1.17.3:
  Successfully uninstalled annoy-1.17.3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting annoy
  Using cached annoy-1.17.3-cp311-cp311-macosx_11_0_arm64.whl
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [39]:
! pip install --upgrade peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting peft
  Using cached peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Using cached peft-0.14.0-py3-none-any.whl (374 kB)
Installing collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.3.0
    Uninstalling peft-0.3.0:
      Successfully uninstalled peft-0.3.0
Successfully installed peft-0.14.0


In [45]:
! pip show peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: peft
Version: 0.3.0
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface/peft
Author: The HuggingFace team
Author-email: sourab@huggingface.co
License: Apache
Location: /Users/obinnairrechukwu/anaconda3/envs/myLLMenv/lib/python3.11/site-packages
Requires: accelerate, numpy, packaging, psutil, pyyaml, torch, transformers
Required-by: 


In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np
from annoy import AnnoyIndex

def setup_sentence_transformer_matcher(keyword_list):
    # Load the model
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    
    keyword_sentences = [f"symptoms of {keyword.lower()}" for keyword in keyword_list]
    keyword_vectors = model.encode(keyword_sentences)
    
    # Setup Annoy index
    vector_dimension = keyword_vectors.shape[1]  # Get dimensionality from the model output
    index = AnnoyIndex(vector_dimension, 'angular')
    
    for i, vec in enumerate(keyword_vectors):
        index.add_item(i, vec)
    index.build(10)
    
    return model, index, keyword_vectors

def map_sentence_to_keywords_transformer(model, index, keyword_list, sentence, top_n=3):
    # Convert input to proper sentence format
    input_sentence = f"patient describes symptoms: {sentence.lower()}"
    
    # Get embedding for the sentence
    sentence_vector = model.encode(input_sentence)
    
    # Find nearest neighbors
    nearest = index.get_nns_by_vector(sentence_vector, top_n)
    return [keyword_list[i] for i in nearest]


#keyword_list = ['Bruising', 'Hemolytic anemia', 'Iron-deficiency Anemia', 'Spina bifida', 'Ariboflavinosis', 'Tremors', 'Skin irritation', 'Cardiovascular Disease', 'Heart Disease', 'Hypomagnesemia', 'Difficulty concentrating', 'Liver damage', 'Bleeding gums', 'Inflammatory disorders', 'Parkinson s Disease', 'Preeclampsia', 'Swollen gums', 'Tissue fragility', 'Abdominal Pain', 'Oxidative stress', 'Fatigue', 'Depressed Mood', 'Chest pain', 'Magnesium Deficiency', 'Joint pain', 'Increased oxidative stress', 'Cognitive Impairment', 'Thrombus formation', 'Coronary heart disease', 'Reduced bone mass', 'Pale skin', 'Deficiency can lead to neurological disorders', 'Zinc Deficiency', 'Clostridium difficile infection', 'Hair Loss', 'Abdominal Cramps', 'Antibiotic-associated diarrhea', 'Migraine', 'Alzheimer s disease', 'Cell proliferation', 'Loss of height', 'Osteoporosis', 'Airway inflammation', 'Poor bioavailability of tocotrienol', 'Dysmenorrhea', 'Menstrual Pain', 'Sensitivity to light', 'Neurological disorders', 'Asthma', 'Blindness', 'Neurodevelopmental Disorders', 'Anemia', 'Cognitive Decline', 'Delayed wound healing', 'Headache', 'Difficulty seeing in low light', 'High blood pressure', 'Beriberi', 'Bleeding Gums', 'Premenstrual Syndrome (PMS)', 'Weight gain', 'Tiredness', 'Muscle cramps', 'Bradykinesia', 'Protein in urine', 'Proinflammatory cytokines production', 'Behavioral Issues', 'Dry eyes', 'Irritability', 'Insulin resistance', 'Traumatic brain injury', 'Not available', 'Major Depression', 'Cardiac arrhythmias', 'Increased risk of atherosclerosis', 'Muscle Weakness', 'Depression', 'Vitamin A Deficiency', 'Dry Skin', 'Neural tube defects', 'Scurvy', 'Cognitive decline', 'Vision problems', 'Insulin Resistance', 'N/A', 'Increased fragility', 'Mood swings', 'Weakened immune system', 'High blood sugar levels', 'Anencephaly', 'Muscle weakness', 'Diarrhea', 'Capillary fragility', 'Xerophthalmia', 'Anxiety and Depression', 'Shortness of breath', 'Atheromas formation', 'Heart-related symptoms', 'Bone pain', 'Decreased bone density', 'Numbness', 'Vision Problems', 'Inflammatory Diseases', 'Hypomagnesaemia', 'Bloating', 'Atherosclerosis', 'Metabolic Syndrome', 'Back pain', 'Alzheimer s Disease', 'Night blindness', 'Parkinson s disease', 'Essential Fatty Acid Deficiency', 'Hair loss', 'Asymptomatic', 'Anxiety', 'Swelling', 'Swollen mucous membranes', 'Disease caused by deficiencies of the nutrient', 'Osteoarthritis', 'Memory loss', 'Vitamin E Deficiency', 'Poor healing', 'Clinical features', 'Weakness', 'Nausea', 'Not specified', 'Sore throat', 'Bone fractures', 'Irritable bowel syndrome', 'Allergic disorders', 'Nerve damage']

keyword_list = ['weakened immune system', 'vision problems', 'neurological problems', 'muscle weakness',
                'weakness and muscle damage', 'anemia', 'impaired immune response',
                'oxidative degradation', 'nerve damage', 'poor vision', 'dry skin', 'increased susceptibility to infections',
                'xerophthalmia', 'clinical ophthalmic signs of vitamin A deficiency', 'severe VAD', 
                'depression', 'nervosness', 'irritability', 'sideroblastic microcytic anemia', 'seizures',
                'peripheral neuropathy', 'dermatologic lesions', 'megaloblastic anemia', 'neropathy', 'memory loss',
                'nausea and vomiting', 'migraine', 'insulin resistance', 'hypocalcemia', 'thrombosis',
                'cardiovascular complications', 'impaired T cell activation and increased risk of hematologic malignancies',
                'cognitive decline', 'abnormal health rhythms', 'mild anxiety', 'sleep disorders', 'fatigue and weakness',
                'hypernatremia', 'confusion', 'numbness or tingling in extremities', 'sporadic hair loss', 'loss of muscle mass']
model, index, keyword_vectors = setup_sentence_transformer_matcher(keyword_list)



In [48]:
from peft import PeftModel, LoraConfig

In [50]:
from peft import PeftModelForFeatureExtraction

ImportError: cannot import name 'PeftModelForFeatureExtraction' from 'peft' (/Users/obinnairrechukwu/anaconda3/envs/myLLMenv/lib/python3.11/site-packages/peft/__init__.py)

In [12]:

sentence = """I've been feeling a bit off lately. I'm experiencing a lot of muscle weakness, 
              especially in my legs. Sometimes, I even have trouble walking without stumbling. 
              My reflexes seem slower than usual, and I've noticed some numbness and tingling 
              in my hands and feet. I've also been feeling a bit lightheaded, and my vision 
              seems a little blurry."""

start_time = time.time()
for i in range(10000):
    mapped_keywords = map_sentence_to_keywords_transformer(model, index, keyword_list, sentence)
end_time = time.time()
print("Time taken:", end_time - start_time) 
print(mapped_keywords)

Time taken: 78.0156569480896
['weakness and muscle damage', 'muscle weakness', 'fatigue and weakness']
