In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [103]:
!pip install -q chromadb spacy sentence_transformers textblob spellchecker
# !pip install -q spacy
# !pip install -q sentence_transformers

In [124]:
from chromadb.utils import embedding_functions
import chromadb
import numpy as np
import pandas as pd
import json
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
EMBED_MODEL = "all-MiniLM-L6-v2"

# client = chromadb.Client()
client = chromadb.PersistentClient(path="/content/drive/MyDrive/Skin Disease/vectordata")

embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
                                                                            model_name=EMBED_MODEL
                                                                        )
collection = client.get_or_create_collection(
                                                name='combined_data',
                                                embedding_function=embedding_func,
                                                metadata={"hnsw:space": "cosine"},
                                            )

sym_collection = client.get_or_create_collection(
                                                name='symptoms_data',
                                                embedding_function=embedding_func,
                                                metadata={"hnsw:space": "cosine"},
                                            )


In [10]:
sentences = pd.read_csv('/content/drive/MyDrive/Skin Disease/finaldata.csv')

In [11]:
sentences.head(2)

Unnamed: 0.1,Unnamed: 0,text1,text2,text3,combined
0,0,disease:Acne,"symptoms:pimples,blackheads,whiteheads,painful...",etiology:Caused by clogged hair follicles due ...,"disease:Acne|symptoms:pimples,blackheads,white..."
1,1,disease:Eczema (Atopic Dermatitis),"symptoms:itchy skin,red patches,thickened skin...","etiology:Exact cause unknown, but linked to a ...",disease:Eczema (Atopic Dermatitis)|symptoms:it...


In [13]:
with open('/content/drive/MyDrive/Skin Disease/all_symptoms.json', 'r') as f:
    sentences1= json.load(f)
# sentences1

## Text Pre-processing

In [16]:
nlp = spacy.load('en_core_web_sm')

In [107]:
# define the preprocess function
def preprocess_symptom(symptom_text):
    sym = nlp(symptom_text)
    cleaned_sym = []
    for words in sym:
        if words.text.lower() not in STOP_WORDS and words.text.isalpha():
            cleaned_sym.append(words.lemma_.lower())
    return ' '.join(cleaned_sym)

In [108]:
preprocess_symptom('really worried')


'worried'

In [155]:
sentences2 = [preprocess_symptom(symptom) for symptom in sentences1]
# for original, preprocessed in zip(sentences1, sentences2):
#     print(f"Original: {original} -> Preprocessed: {preprocessed}")
# sentences2

In [110]:
newsentences = sentences['combined'].tolist()
newids = sentences.index.astype(str).tolist()

In [156]:
newsentences1 = sentences2
ids1=[str(x) for x in range(len(newsentences1))]

In [157]:
collection.upsert(
                documents=newsentences,
                ids=newids
                        )

In [158]:
sym_collection.upsert(
                documents=newsentences1,
                ids=ids1
                        )

In [159]:
query = "I've noticed that my skin is very dry and flaky, and I also have some red patches. Sometimes it gets itchy too. What skin condition could this be?"

In [160]:
preprocess_symptom('sore mouth')

'sore mouth'

In [161]:
query = query.lower()
query = preprocess_symptom(query)
query

'notice skin dry flaky red patch get itchy skin condition'

In [168]:
# smart basket query function
def chroma_query(query):
    query = query.lower()
    query = preprocess_symptom(query)
    result = collection.query(
                                query_texts=query,
                                n_results=3,
                                include=["documents", "distances"]
                            )
    # result = result['documents'][0]

    result1=sym_collection.query(
        query_texts=query,
         n_results=3,
         include=["documents","distances"]
    )

    return result, result1


In [169]:
result=chroma_query(query)
result

({'ids': [['1', '8', '39']],
  'distances': [[0.3765604387982826, 0.3894060796599865, 0.40028203375285576]],
  'metadatas': None,
  'embeddings': None,
  'documents': [['disease:Eczema (Atopic Dermatitis)|symptoms:itchy skin,red patches,thickened skin,dryness,cracking,bleeding|etiology:Exact cause unknown, but linked to a combination of genetic and environmental factors.',
    'disease:Seborrheic Dermatitis|symptoms:red skin,scaly patches,dandruff,greasy or oily skin,itching behind ears, eyebrows, or on chest|etiology:Linked to a combination of genetic, environmental, and fungal factors.',
    'disease:Actinic Keratosis|symptoms:scaly patch,pink, red, or brown,itching or burning,rough, sandpaper-like texture|etiology:Precancerous skin lesion caused by prolonged UV exposure.']],
  'uris': None,
  'data': None,
  'included': ['documents', 'distances']},
 {'ids': [['477', '162', '164']],
  'distances': [[0.23711282014846802,
    0.24002474546432495,
    0.26602423191070557]],
  'metadatas

In [170]:
def process_chroma_query_result(result):
    output = []

    # identified symptoms
    identified_symptoms = result[1]['documents'][0]
    output.append({'identified symptoms': identified_symptoms})

    # diseases information and confidence
    disease_info = result[0]['documents'][0]
    distances = result[0]['distances'][0]

    for i, info in enumerate(disease_info):
        disease_data = {}
        info_parts = info.split('|')

        # formatting the disease information
        for part in info_parts:
            key, value = part.split(':', 1)
            if key == 'disease':
                disease_data[key] = value
            elif key == 'symptoms':
                disease_data['symptom'] = value
            elif key == 'etiology':
                disease_data[key] = value

        # Adding confidence
        disease_data['confidence'] = 1-distances[i]

        output.append(disease_data)

    return output




In [171]:
# Process the result
output = process_chroma_query_result(result)
output

[{'identified symptoms': ['red itchy patch',
   'dry flaky skin',
   'itchy red rash']},
 {'disease': 'Eczema (Atopic Dermatitis)',
  'symptom': 'itchy skin,red patches,thickened skin,dryness,cracking,bleeding',
  'etiology': 'Exact cause unknown, but linked to a combination of genetic and environmental factors.',
  'confidence': 0.6234395612017174},
 {'disease': 'Seborrheic Dermatitis',
  'symptom': 'red skin,scaly patches,dandruff,greasy or oily skin,itching behind ears, eyebrows, or on chest',
  'etiology': 'Linked to a combination of genetic, environmental, and fungal factors.',
  'confidence': 0.6105939203400135},
 {'disease': 'Actinic Keratosis',
  'symptom': 'scaly patch,pink, red, or brown,itching or burning,rough, sandpaper-like texture',
  'etiology': 'Precancerous skin lesion caused by prolonged UV exposure.',
  'confidence': 0.5997179662471442}]