In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow_hub as hub
from sklearn import metrics
import gzip
import json
import scipy
import numpy as np
import warnings
from time import time
from numpy.linalg import norm
from numba import njit, prange, jit
import pandas as pd
import pickle
from transformers import pipeline
import math

In [None]:
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings from the libraries above.
warnings.filterwarnings("ignore")

In [None]:
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint;
# it is used later to score the shortlisted diseases against the combined symptom text.
classifier = pipeline(
  "zero-shot-classification",
  model="facebook/bart-large-mnli",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Sentence encoder used to turn each symptom description into an embedding vector.
embed = hub.load(
  "https://tfhub.dev/google/universal-sentence-encoder/4"
)

In [None]:
@njit
def cosine_similarity(np_array_a, np_array_b):
  """Return the cosine similarity of two vectors: dot(a, b) / (|a| * |b|).

  There is no guard for zero vectors: if either norm is zero the denominator is zero.
  """
  dot_product = np.dot(np_array_a, np_array_b)
  magnitude_product = norm(np_array_a) * norm(np_array_b)
  return dot_product / magnitude_product

In [None]:
@njit
def mean(l):
  """Return the arithmetic mean of the values in `l`.

  An empty `l` is not handled: the division by its length fails.
  """
  total = sum(l)
  count = len(l)
  return total / count

@njit
def metric_MIFTS(MIFTS):
  """Map a MIFTS value to a penalty term that shrinks as MIFTS grows.

  For MIFTS <= 75 the penalty follows (k/(a-1)) * (a**((100-MIFTS)/100) - 1) with
  a = 7 and k = 2, which equals 2 at MIFTS = 0. Above 75 it falls linearly with slope
  -1/300 starting from 0.2089, the (rounded) value of the first branch at MIFTS = 75.
  """
  base = 7
  scale = 2

  # Exponential branch for the lower range.
  if MIFTS <= 75:
    return (scale/(base-1)) * (base**((100-MIFTS)/100) - 1)

  # Linear tail for values above 75.
  return (-1/300) * (MIFTS-75) + 0.2089


@njit
def std(l, mean):
  """Return the sample standard deviation (n - 1 denominator) of `l` around `mean`.

  `mean` is supplied by the caller rather than recomputed. A single-element input
  returns 0, since the n - 1 denominator would otherwise be zero.
  """
  count = len(l)
  if count == 1:
    return 0

  squared_deviations = [(value-mean)**2 for value in l]
  return math.sqrt(sum(squared_deviations) / (count - 1))

@njit
def similarity(list_symptoms1, list_symptoms2):
  """Best-match cosine similarities between two collections of embedding vectors.

  The shorter input is walked item by item; for each of its vectors the largest cosine
  similarity against any vector of the longer input is recorded. When both inputs have
  the same length, `list_symptoms2` plays the role of the shorter one.

  Returns a list with one value per vector of the shorter input. The running maximum
  starts at 0, so a vector whose similarities are all negative (or that has nothing to
  compare against) is reported as 0.
  """
  if len(list_symptoms1) < len(list_symptoms2):
    shorter = list_symptoms1
    longer = list_symptoms2
  else:
    shorter = list_symptoms2
    longer = list_symptoms1

  best_matches = []

  # Plain `range` on purpose: this function is compiled without parallel=True, where
  # numba's prange is only an alias of range. The list append and the running maximum
  # below would also not be safe inside a genuinely parallel loop.
  for short_i in range(len(shorter)):
    short_vec = shorter[short_i]

    best = 0

    for long_i in range(len(longer)):
      current = cosine_similarity(short_vec, longer[long_i])
      if current > best:
        best = current

    best_matches.append(best)

  return best_matches

In [None]:
# Body-part selector. It is interpolated into every data file name this notebook reads:
# best_<part>_numpy.npz, <part>_MIFTS.json and best_<part>.json.gz.
which_seno_part = "nose"

In [None]:
# Archive of embedding arrays. get_disease treats each key as a disease name and the
# array stored under it as that disease's symptom embeddings.
npzfile = np.load(f"best_{which_seno_part}_numpy.npz")

In [None]:
# MIFTS value per disease, keyed by disease name (looked up in get_disease).
# The encoding is given explicitly so the read does not depend on the platform default,
# matching how the gzipped JSON is opened elsewhere in this notebook.
with open(f"{which_seno_part}_MIFTS.json", "r", encoding="utf-8") as f:
  MIFTS_disease = json.load(f)

In [None]:
def get_disease(symptoms_emb_nparr):
  """Rank every disease in `npzfile` against the user's symptom embeddings.

  For each disease a penalty score is built from four terms (lower is better):
    * 1 - mean of the best-match cosine similarities,
    * the absolute difference between the number of disease symptoms and user symptoms, / 6,
    * the sample standard deviation of the best-match similarities, / 4,
    * metric_MIFTS applied to the disease's entry in `MIFTS_disease`.

  Parameters:
    symptoms_emb_nparr: the user's symptom embeddings, one vector per symptom.

  Returns:
    A dict mapping disease name to score, ordered from best (lowest) to worst.

  Raises:
    KeyError: if a disease in `npzfile` has no entry in `MIFTS_disease`.
  """
  disease_sims = {}

  # Materialise the key list once instead of rebuilding it on every iteration.
  disease_names = list(npzfile.keys())

  for disease in disease_names:
    MIFTS = MIFTS_disease[disease]

    disease_emb_nparr = npzfile[disease]

    cos_sims = similarity(symptoms_emb_nparr, disease_emb_nparr)

    mean_cos_sim = mean(cos_sims)
    std_cos_sim = std(cos_sims, mean_cos_sim)

    # Number of symptoms on the longer side that cannot be paired one-to-one
    num_nomatchsymps = abs(len(disease_emb_nparr)-len(symptoms_emb_nparr))

    # More unmatched symptoms raise the metric, i.e. make the disease a worse diagnosis
    metric = (1 - mean_cos_sim) + (num_nomatchsymps/6) + (std_cos_sim/4) + metric_MIFTS(MIFTS)

    disease_sims[disease] = metric

  return dict(sorted(disease_sims.items(), key=lambda item: item[1]))

In [None]:
# Example user input: plain-language symptom descriptions, one sentence per symptom.
symptoms = [
  "A fold of skin that starts above the inside corner of the upper eyelid and curves down to cover and go in front of the inner corner of the eye.",
  "Webbing or joining of fingers or toes, including only soft tissue or also involving the bones.",
  "Being shorter than what's considered normal for your age and gender.",
  "A problem with the little fold of skin under your tongue that attaches it to the bottom of your mouth, or having extra folds of skin in your mouth.",
  "A toe that looks too small compared to the rest of your foot.",
  "A problem with the bone in the middle of your chest.",
  "A smaller than normal lower jawbone that affects development.",
  "A brain abnormality in which there are holes or cavities in the brain tissue.",
  "Any unusual problem with your tongue.",
  "Shrinking or wasting of the outer layer of your brain called the cerebrum.",
  "Irregular surface of the tongue with multiple bumps and/or grooves.",
  "A gap or split in the roof of your mouth or lip.",
  "A smaller than normal shin bone.",
  "A finger or toe that bends in a way that makes it look like it's in the palm or sole of your hand or foot.",
  "Eyes that are further apart than what's considered normal.",
  "A raised palate (roof of the mouth).",
  "Ears that are located lower than usual on the head.",
  "An extra finger or toe that is next to the thumb or big toe.",
]

In [None]:
# Embed the example symptoms and convert the result to a NumPy array
# (downstream code indexes it as one embedding vector per symptom).

symptoms_emb_nparr = np.array(embed(symptoms))

In [None]:
# Shortlist the five lowest-scoring (best) diseases from the embedding-based ranking.
top5 = list(get_disease(symptoms_emb_nparr).items())[:5]

# Build one passage from all symptoms. A full stop is appended only when a symptom does
# not already end with sentence punctuation, so sentences are not joined by "..".
combined_symptoms = " ".join(
  text if text.endswith((".", "!", "?")) else f"{text}."
  for text in (symptom.strip() for symptom in symptoms)
)
top5_diseases = [disease_name for disease_name, _ in top5]

# multi_label=True scores each candidate disease independently of the others.
top_5_zero_shot = classifier(combined_symptoms, top5_diseases, multi_label=True)
top_5_zero_shot

{'sequence': "A fold of skin that starts above the inside corner of the upper eyelid and curves down to cover and go in front of the inner corner of the eye.. Webbing or joining of fingers or toes, including only soft tissue or also involving the bones.. Being shorter than what's considered normal for your age and gender.. A problem with the little fold of skin under your tongue that attaches it to the bottom of your mouth, or having extra folds of skin in your mouth.. A toe that looks too small compared to the rest of your foot.. A problem with the bone in the middle of your chest.. A smaller than normal lower jawbone that affects development.. A brain abnormality in which there are holes or cavities in the brain tissue.. Any unusual problem with your tongue.. Shrinking or wasting of the outer layer of your brain called the cerebrum.. Irregular surface of the tongue with multiple bumps and/or grooves.. A gap or split in the roof of your mouth or lip.. A smaller than normal shin bone..

In [None]:
# Pair each returned label with its score. zip() follows the actual number of results,
# so a shortlist with fewer than five diseases no longer raises IndexError.
top_5_dict = dict(zip(top_5_zero_shot["labels"], top_5_zero_shot["scores"]))

# Read a gzip-compressed JSON file
def load(file_name):
  """Return the parsed contents of `best_<file_name>.json.gz` (UTF-8 text, relative path)."""
  path = f"best_{file_name}.json.gz"
  with gzip.open(path, mode='rt', encoding='utf-8') as handle:
    return json.load(handle)

# Given a list of dicts where each dict has the disease as a key and the list of symptoms as its value
# Search/find the disease symptoms pair and return it
def find(data, disease):
  """Return `[disease, symptoms]` for the first entry whose first key equals `disease`.

  Parameters:
    data: iterable of dicts, each expected to map one disease name to its symptoms.
    disease: the disease name to look for.

  Returns:
    A two-item list `[disease, symptoms]`, or None when no entry matches.
  """
  for disease_dict in data:
    # Only the first key is compared, without copying all keys into a list.
    # Empty entries are skipped rather than raising IndexError.
    if disease_dict and next(iter(disease_dict)) == disease:
      return [disease, disease_dict[disease]]

  return None

# -----------------------------

# Disease -> symptoms records for the selected body part.
skin_json = load(which_seno_part)

for disease, confidence in top_5_dict.items():
  match = find(skin_json, disease)
  if match is None:
    # Fail with a message that names the missing disease instead of an opaque unpack error.
    raise LookupError(f"{disease!r} not found in best_{which_seno_part}.json.gz")

  # Unpack into `disease_symptoms`, not `symptoms`, so the user's symptom list defined
  # earlier in the notebook is not overwritten when this cell runs.
  disease, disease_symptoms = match

  print(disease + "; ", end="")

Orofaciodigital Syndrome Iv; Rapadilino Syndrome; Laurin-Sandrow Syndrome; Bell's Palsy; Congenital Heart Defects, Hamartomas of Tongue, and Polysyndactyly; 