In [1]:
import pandas as pd
import numpy as np

### NaturDoc - TL BL WT 22-23

# Data clustering:

## Dealing With the Second Embedding Column:

### Loading Embeddings Data:


In [2]:
symptoms_embeddings = pd.read_csv("../data/embeddings/word_embeddings_dataframe.csv")

As a reminder: the dataframe contains three columns: the symptom name (from the Duke dataset and the google symptom data), and one column for the two embedding models each: the first being `all-MiniLM-L6-v2` and the second being `average_word_embeddings_glove.840B.300d`.

In [3]:
print(symptoms_embeddings.shape)
symptoms_embeddings.head(2)

(2404, 3)


Unnamed: 0,Symptom,Embedding1,Embedding2
0,Abcess,[-9.81967244e-03 1.01662287e-02 3.75229940e-...,[ 2.1690e-02 -1.8056e-01 -8.5585e-02 -5.6702e-...
1,Abdomen,[ 5.98840415e-02 1.64022837e-02 -4.90665212e-...,[-0.73936 -0.18636 0.59149 0.47356 ...


Extracting a dictionary matching index to symptom name:

In [4]:
dict_symptom = symptoms_embeddings["Symptom"].to_dict()

#### Transforming:

Reading from the csv, it is no longer a proper list but instead a string containing extra characters:

In [5]:
symptoms_embeddings.loc[0, "Embedding1"][:100]

'[-9.81967244e-03  1.01662287e-02  3.75229940e-02  1.75703913e-02\n -1.11436069e-01  3.83325890e-02  1'

In [6]:
type(symptoms_embeddings.loc[1, "Embedding1"])

str

In [7]:
test_list_1 = symptoms_embeddings.loc[0, "Embedding1"].replace("\n", "").replace("[", "").replace("]", "").split(" ")
test_list_2 = symptoms_embeddings.loc[0, "Embedding2"].replace("\n", "").replace("[", "").replace("]", "").split(" ")

In [8]:
test_list_1[:5]

['-9.81967244e-03', '', '1.01662287e-02', '', '3.75229940e-02']

Removing all empty strings:

In [9]:
test_list_1 = [x for x in test_list_1 if x]
test_list_2 = [x for x in test_list_2 if x]
test_list_1[:5]

['-9.81967244e-03',
 '1.01662287e-02',
 '3.75229940e-02',
 '1.75703913e-02',
 '-1.11436069e-01']

### Creating useable dataframes:

In [10]:
def listify_df_values(df_series: pd.Series):
    df_series = df_series.str.replace("\n", "", regex=True)
    df_series = df_series.str.replace("[", "", regex=True).replace("]", "", regex=True)
    df_series = df_series.str.split(" ")
    # df_list = df_list.apply(lambda x: x for x in df_list if x)
    return df_series

#### Embedding 2 column:

Creating the basic dataframe:

In [11]:
embeddings2_series = listify_df_values(symptoms_embeddings.loc[:, "Embedding2"])
embeddings2_series = embeddings2_series.apply(lambda row: [val for val in row if val])

embeddings2_df = pd.DataFrame(embeddings2_series)
embeddings2_df.head()

Unnamed: 0,Embedding2
0,"[2.1690e-02, -1.8056e-01, -8.5585e-02, -5.6702..."
1,"[-0.73936, -0.18636, 0.59149, 0.47356, 0.59297..."
2,"[0.58928, 0.24762, 0.5015, -0.31308, -0.029607..."
3,"[8.2946e-02, 1.6964e-01, -2.1112e-01, 2.1073e-..."
4,"[-3.7954e-01, 4.4132e-01, 3.6332e-02, 2.2410e-..."


Exploding the lists of values into their own columns so that every cell only contains a single value:

In [12]:
embeddings2_df = pd.concat(
    [embeddings2_df[c].apply(pd.Series).add_prefix(c + "_") for c in embeddings2_df], axis=1
)

embeddings2_df.head()

Unnamed: 0,Embedding2_0,Embedding2_1,Embedding2_2,Embedding2_3,Embedding2_4,Embedding2_5,Embedding2_6,Embedding2_7,Embedding2_8,Embedding2_9,...,Embedding2_290,Embedding2_291,Embedding2_292,Embedding2_293,Embedding2_294,Embedding2_295,Embedding2_296,Embedding2_297,Embedding2_298,Embedding2_299
0,0.02169,-0.18056,-0.085585,-0.56702,-0.37991,0.74952,0.27161,-0.20359,0.28772,-1.4985,...,0.7687,-0.57498,-0.10212,-0.0557,-0.45765,-0.26548,0.19396,0.38276,-0.015735,-0.036918
1,-0.73936,-0.18636,0.59149,0.47356,0.59297,-0.22319,0.066332,0.35977,0.063273,-1.5661,...,0.78603,0.54811,0.23896,-0.42036,-0.085291,0.64376,0.54307,0.42253,0.61038,-0.75482
2,0.58928,0.24762,0.5015,-0.31308,-0.029607,0.39451,-0.22913,0.57697,-0.76873,-1.3676,...,-0.42955,-0.14359,0.16626,0.3584,-0.10825,-0.1961,-0.15036,0.13764,-0.41586,-0.72983
3,0.082946,0.16964,-0.21112,0.21073,-0.0094237,0.34631,-0.25166,0.18472,-0.33269,-1.6627,...,0.33283,-0.15003,0.54558,-0.023841,-0.48079,0.51326,-0.2866,0.041394,-0.066671,-0.3077
4,-0.37954,0.44132,0.036332,0.2241,0.087512,-0.41484,-0.0060271,0.098966,-0.11458,-1.7897,...,0.10686,0.27241,-0.31783,0.13302,-0.17751,0.74856,0.36981,0.35658,0.13955,-0.54288


Converting the cell values to floats:

In [13]:
embeddings2_df = embeddings2_df.apply(pd.to_numeric, errors='coerce')
type(embeddings2_df.loc[0, "Embedding2_0"])

numpy.float64

### Creating the Distance Matrix:

In [14]:
# importing the library
from scipy.spatial import distance_matrix

In [15]:
import math

def generate_distance_matrix(df : pd.DataFrame,
                distance_metric : str = "euclidean") -> pd.DataFrame: # 2.5k x 2.5k
    if distance_metric == "manhattan":
        p = 1
    elif distance_metric == "euclidean":
        p = 2
    elif distance_metric == "chebychev":
        p = math.inf
    else:
        p = 2
    dis_matrix = distance_matrix(df.values, df.values, p)
    dis_df = pd.DataFrame(dis_matrix)
    return dis_df


### Creating the Dictionaries:

In [16]:
def generate_dict(df_dist : pd.DataFrame,
                threshold : float) -> dict:
    filt = (df_dist[:] > threshold)
    df_filt = df_dist.copy()
    df_filt[filt] = np.nan
    dict_dist = df_filt.to_dict('dict')
    for i, dic in dict_dist.items():
        to_pop = list()
        for key, value in dic.items():
            if np.isnan(value):
                to_pop.append(key)
            # elif value == 0.0:
            #     to_pop.append(key)
        for target_key in to_pop:
            dic.pop(target_key)
        dict_dist[i] = dic
    return dict_dist

In [17]:
def generate_dict_match(dict_dist: dict) -> dict:   
    dict_match = dict()

    for key, value in dict_dist.items():
        for sub_key in value.keys():
            if dict_symptom[key] not in dict_match:
                dict_match[dict_symptom[key]] = [dict_symptom[sub_key]]
            else:
                dict_match[dict_symptom[key]] = [*dict_match.get(dict_symptom[key]), dict_symptom[sub_key]]
                # [*response.get("match_partial"), match_partial[i_partial]]

    return dict_match

In [18]:
activities_symptoms_df = pd.read_csv("../output/activities_symptoms_bool.csv")
activities_symptoms_df.drop(columns="Unnamed: 0", inplace=True)
activities_symptoms_df.head()

Unnamed: 0,symptomName,is_symptom,is_activity
0,Abcess,0,1
1,Abdomen,0,1
2,Abortifacient,0,1
3,Abortive,0,1
4,Abrasion,0,1


In [19]:
filt_sym = (activities_symptoms_df["is_symptom"] == 1)
filt_sym_df = activities_symptoms_df[filt_sym]
filt_sym_list = filt_sym_df["symptomName"].values.tolist()

In [20]:
filt_not_act = (activities_symptoms_df["is_activity"] == 0)
filt_not_act_df = activities_symptoms_df[filt_not_act]
filt_not_act_list = filt_not_act_df["symptomName"].values.tolist()

In [21]:
def create_dict_sym(dict_dist):  
    dict_sym = dict()

    for sym, list_sym in dict_dist.items():
        if sym not in filt_sym_list:
            continue
        for sub_sym in list_sym:
            if sub_sym in filt_not_act_list:
                continue
            if sym not in dict_sym:
                dict_sym[sym] = [sub_sym]
            else:
                dict_sym[sym] = [*dict_sym.get(sym), sub_sym]
    
    return dict_sym

## Creating the Dictionary for the Second Embedding Model:

#### Distance Matrix:

In [22]:
df_dist_2 = generate_distance_matrix(embeddings2_df)
df_dist_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2394,2395,2396,2397,2398,2399,2400,2401,2402,2403
0,0.000000,9.942843,8.784920,8.232345,9.910264,9.184355,7.202011,9.933517,8.554601,9.013563,...,10.233912,9.314567,9.376453,9.180041,9.993439,10.348576,10.379647,8.481104,11.215959,9.881715
1,9.942843,0.000000,10.594719,9.092435,9.080341,9.760703,7.240622,9.523623,9.701527,7.085359,...,9.217572,8.534188,8.879009,8.650456,8.355311,11.036702,10.223542,8.380999,11.489306,10.246362
2,8.784920,10.594719,0.000000,8.911869,9.984212,9.694277,7.788598,10.527021,9.885447,9.483890,...,10.491074,9.849962,9.382018,9.452173,10.308570,10.837974,11.214281,9.132222,11.857727,9.906005
3,8.232345,9.092435,8.911869,0.000000,9.329214,8.618238,6.308059,9.776170,8.539223,8.399780,...,9.360827,8.679456,8.652150,8.572486,8.727339,9.806711,10.267550,8.039326,11.093683,9.261648
4,9.910264,9.080341,9.984212,9.329214,0.000000,9.532016,7.203297,8.390142,9.782737,8.664715,...,9.731884,9.167016,8.643785,8.095538,8.852319,11.230354,10.294182,8.781414,10.703237,9.869799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399,10.348576,11.036702,10.837974,9.806711,11.230354,10.804562,8.675304,11.606166,10.381668,10.743392,...,10.649502,10.544861,10.239899,10.334715,10.653345,0.000000,12.256771,10.208283,12.240513,10.836945
2400,10.379647,10.223542,11.214281,10.267550,10.294182,10.384463,8.632461,11.402102,10.012039,9.716490,...,10.498840,10.266239,9.820044,10.444798,10.856580,12.256771,0.000000,10.042034,12.678147,10.458506
2401,8.481104,8.380999,9.132222,8.039326,8.781414,8.460653,5.594245,8.909344,8.437277,7.433279,...,8.786142,7.539064,7.838269,7.730331,8.045926,10.208283,10.042034,0.000000,10.921098,8.689270
2402,11.215959,11.489306,11.857727,11.093683,10.703237,11.734419,9.532773,11.079050,11.446188,11.062574,...,12.057926,11.506828,10.885291,10.925064,10.904707,12.240513,12.678147,10.921098,0.000000,11.735542


Several entries seem to have matched to an absurd amount of activities:

In [23]:
print("Value counts:", df_dist_2.loc[:, 1].value_counts())

Value counts: 7.240622     830
8.358644       3
9.079539       2
10.997180      2
10.270435      2
            ... 
9.093840       1
10.213916      1
11.291925      1
8.894402       1
10.246362      1
Name: 1, Length: 1554, dtype: int64


In [24]:
print("Value counts:", df_dist_2.loc[:, 6].value_counts())

Value counts: 0.000000    830
7.467473      3
7.259116      2
9.168405      2
7.789581      2
           ... 
8.068840      1
7.136250      1
8.165595      1
8.418919      1
7.147895      1
Name: 6, Length: 1554, dtype: int64


Trying other distance metrics results in similar results:

In [25]:
df_dist_2_manhattan = generate_distance_matrix(embeddings2_df, 1)

In [26]:
df_dist_2_chebychev = generate_distance_matrix(embeddings2_df, 3)

In [27]:
print("Value counts:", df_dist_2_manhattan.loc[:, 1].value_counts())

Value counts: 7.240622     830
8.358644       3
9.079539       2
10.997180      2
10.270435      2
            ... 
9.093840       1
10.213916      1
11.291925      1
8.894402       1
10.246362      1
Name: 1, Length: 1554, dtype: int64


In [28]:
print("Value counts:", df_dist_2_chebychev.loc[:, 1].value_counts())

Value counts: 7.240622     830
8.358644       3
9.079539       2
10.997180      2
10.270435      2
            ... 
9.093840       1
10.213916      1
11.291925      1
8.894402       1
10.246362      1
Name: 1, Length: 1554, dtype: int64


### Final Dictionaries:

#### Trying Different Threshold:

##### Threshold 2.0:

In [29]:
dict_dist_20 = generate_dict(df_dist_2, 2)
dict_20 = generate_dict_match(dict_dist_20)
dict_20_sym = create_dict_sym(dict_20)
print(len(dict_20_sym))
list(dict_20_sym.items())[:5]

132


[('Acne', ['Acne']),
 ('Alcoholism', ['Alcoholism']),
 ('Allergy', ['Allergy']),
 ('Amblyopia', ['Amblyopia']),
 ('Amenorrhea', ['Amenorrhea'])]

In [30]:
print("Abdominal pain" in dict_20_sym.keys())
print("Eye pain" in dict_20_sym.keys())
print("Common cold" in dict_20_sym.keys())

False
False
False


##### Threshold 5.0:

In [31]:
dict_dist_50 = generate_dict(df_dist_2, 5)
dict_50 = generate_dict_match(dict_dist_50)
dict_50_sym = create_dict_sym(dict_50)
print(len(dict_50_sym))
list(dict_50_sym.items())[:5]

226


[('Acne', ['Acne']),
 ('Alcoholism', ['Alcoholism']),
 ('Allergy', ['Allergy']),
 ('Amblyopia',
  ['Amblyopia', 'Anorectic', 'Antacid', 'Carcinogenic', 'Dullness']),
 ('Amenorrhea', ['Amenorrhea'])]

In [44]:
print("Abdominal pain" in dict_50_sym.keys())
print("Eye pain" in dict_50_sym.keys())
print("Common cold" in dict_50_sym.keys())

False
True
True


Due to the erroneous values in the distance matrix, some symptoms match an absurd amount of activities:

In [36]:
print(dict_50_sym["Eye pain"])
print(len(dict_50_sym["Common cold"]))
print(dict_50_sym["Common cold"][::20])

['Ear drop', 'Evil eye', 'Eye', 'Eye drop', 'Pain', 'Cold sore']
827
['Abscess(Breast)', 'Anal-Eversion', 'Antidote(Pithecellobium)', 'Antidote(Cyanide)', 'Antidote(Mushroom)', 'Antidote(Vermillion)', 'Antisudorific', 'Atheromasia', 'Bite(Tiger)', 'Cancer(Brain)', 'Cancer(Mouth)', 'Cardiodynia', 'Circulation-Tonic', 'Decoagulant', 'Enterorrhagia', 'Food-Dye', 'Guineaworms', 'Hepatosis', 'Intestinal-Ailments', 'Leukopenic', 'Morphinism', 'Odontectomy', 'Parasympatholytic', 'Polyp(Bladder)', 'Preventitive(Elephantiasis)', 'Prickly-Heat', 'Rectitis', 'Rodent-Ulcer', 'Sclerosis(Stomach)', 'Sterilizant', 'Syphilis(3)', 'Tumor(Armpit)', 'Tumor(Leg)', 'Tumor(Thyroid)', 'Vermifuge(Cattle)', 'Polypus(Nose)', 'Preventitive(Measless)', 'Antidote(Tarantula)', 'Dyslactea', 'Oxyuris', 'Thrombosis(Venous)', 'Ochititis']


In [38]:
print(dict_50_sym["Avoidant personality disorder"][::20])

['Abscess(Breast)', 'Anal-Eversion', 'Antidote(Pithecellobium)', 'Antidote(Cyanide)', 'Antidote(Mushroom)', 'Antidote(Vermillion)', 'Antisudorific', 'Atheromasia', 'Bite(Tiger)', 'Cancer(Brain)', 'Cancer(Mouth)', 'Cardiodynia', 'Circulation-Tonic', 'Decoagulant', 'Enterorrhagia', 'Food-Dye', 'Guineaworms', 'Hepatosis', 'Intestinal-Ailments', 'Leukopenic', 'Morphinism', 'Odontectomy', 'Parasympatholytic', 'Polyp(Bladder)', 'Preventitive(Elephantiasis)', 'Prickly-Heat', 'Rectitis', 'Rodent-Ulcer', 'Sclerosis(Stomach)', 'Sterilizant', 'Syphilis(3)', 'Tumor(Armpit)', 'Tumor(Leg)', 'Tumor(Thyroid)', 'Vermifuge(Cattle)', 'Polypus(Nose)', 'Preventitive(Measless)', 'Antidote(Tarantula)', 'Dyslactea', 'Oxyuris', 'Thrombosis(Venous)', 'Lymphitis']


## A Bandaid Solution (?):

As a bandaid, we tried to modify the initial dictionary generating code. Unproblematic nested dictionaries are added to a new dictionary, and then the problematic ones will be further processed. This did not really work out too well and is a bad approach anyway:

In [39]:
def generate_dict_patch(df_dist : pd.DataFrame,
                threshold : float) -> dict:
    filt = (df_dist[:] > threshold)
    df_filt = df_dist.copy()
    df_filt[filt] = np.nan
    dict_dist = df_filt.to_dict('dict')
    for i, dic in dict_dist.items():
        to_pop = list()
        for key, value in dic.items():
            if np.isnan(value):
                to_pop.append(key)
        for target_key in to_pop:
            dic.pop(target_key)
        dict_dist[i] = dic
    dict_clear = dict()
    for key, dic in dict_dist.items():
        
# first, add unproblematic dictionaries to dict_clear:
        if len(dic) <= 100:
            dict_clear[key] = dic
            continue

# a new dict lists the indices for each value:
        count_dict = dict()
        for sub_key, val_dis in dic.items():
            if val_dis not in count_dict:
                count_dict[val_dis] = [sub_key]
            else:
                count_dict[val_dis] = [*count_dict.get(val_dis), sub_key]

# checking the count_dict: 
# if a certain key has too many indices as its value, it will be skipped:
        for val, val_i in count_dict.items():
            if len(val_i) > 100:
                continue

# in reverse:
# remaining values will be assigned to correct index keys in dict_clear:
            for i in val_i:
                if i not in dict_clear:
                    dict_clear[i] = {i: val}
                else:
                    dict_clear[i][i] = val

# in case self referencing 0.0 value was removed, add it again:
    target_i = list()
    for key, value in dict_clear.items():
        if key not in value:
            target_i.append(key)
    for i in target_i:
        dict_clear[i] = 0.0

    return dict_clear

In [45]:
print(list(dict_dist_50[6].items())[::20])

[(6, 0.0), (53, 0.0), (99, 0.0), (122, 0.0), (142, 0.0), (163, 0.0), (196, 0.0), (246, 0.0), (293, 0.0), (357, 0.0), (377, 0.0), (407, 0.0), (468, 0.0), (534, 0.0), (629, 0.0), (711, 0.0), (785, 0.0), (850, 0.0), (914, 0.0), (988, 0.0), (1055, 0.0), (1140, 0.0), (1198, 4.695847538646427), (1253, 0.0), (1311, 0.0), (1343, 0.0), (1363, 0.0), (1428, 0.0), (1467, 0.0), (1506, 0.0), (1578, 0.0), (1635, 0.0), (1709, 0.0), (1729, 0.0), (1749, 0.0), (1806, 0.0), (1867, 0.0), (1894, 0.0), (1934, 0.0), (1955, 0.0), (1991, 4.848344020092718), (2037, 0.0), (2074, 0.0), (2140, 4.303535052716567), (2291, 4.831098821251361)]


In [46]:
dict_dist_patch_50 = generate_dict_patch(df_dist_2, 5)

In [48]:
print(list(dict_dist_patch_50.items())[::20])

[(0, {0: 0.0}), (1198, {1198: 4.695847538646427}), (2140, {2140: 4.303535052716567}), (2291, {2291: 4.831098821251361}), (23, {23: 0.0}), (48, {48: 0.0}), (80, {80: 0.0}), (187, {187: 0.0}), (223, {223: 0.0}), (256, {256: 3.935557761883537, 2132: 3.935557761883537}), (277, {277: 0.0}), (322, {322: 0.0}), (349, {349: 0.0}), (414, {414: 0.0}), (440, {440: 0.0}), (472, {472: 0.0}), (494, {494: 0.0}), (520, {520: 0.0}), (550, {550: 0.0}), (574, {574: 0.0, 970: 4.226506207860805}), (596, {596: 0.0}), (620, {620: 0.0}), (646, {646: 0.0}), (674, {674: 3.9358604723234465, 675: 3.9358604723234465, 1173: 4.953613275781259}), (699, {699: 0.0}), (730, {730: 0.0}), (755, {755: 0.0}), (779, {779: 0.0}), (809, {809: 0.0}), (841, {841: 4.320528044933689, 842: 4.320528044933689, 2223: 4.527226442929275, 2224: 4.564068628190037}), (866, {866: 0.0}), (896, {896: 0.0}), (925, {925: 0.0}), (947, {947: 0.0}), (978, {978: 0.0}), (1009, {1009: 0.0}), (1035, {1035: 0.0}), (1061, {1061: 0.0}), (1085, {1085: 0.0

In [54]:
dict_patch_50 = generate_dict_match(dict_dist_patch_50)
dict_dist_patch_50_sym = create_dict_sym(dict_patch_50)
print(len(dict_dist_patch_50_sym))
print(list(dict_dist_patch_50_sym.items())[::20])

183
[('Acne', ['Acne']), ('Cataract', ['Cataract']), ('Diabetes', ['Diabetes', 'Diabetes Mellitis']), ('Halitosis', ['Halitosis']), ('Inflammation', ['Inflammation']), ('Perspiration', ['Perspiration']), ('Telangiectasia', ['Telangiectasia']), ('Breast pain', ['Breast', 'Pain']), ('Muscle atrophy', ['Muscle']), ('Urinary urgency', ['Urinary'])]


In [50]:
print("Abdominal pain" in dict_dist_patch_50_sym.keys())
print("Eye pain" in dict_dist_patch_50_sym.keys())
print("Common cold" in dict_dist_patch_50_sym.keys())

False
True
False


Changing thresholds somehow results in keys being removed (maybe they slipped into a territory of having too many matches):

In [51]:
dict_dist_patch_80 = generate_dict_patch(df_dist_2, 8)
dict_patch_80 = generate_dict_match(dict_dist_patch_80)
dict_dist_patch_80_sym = create_dict_sym(dict_patch_80)
print(len(dict_dist_patch_80_sym))
print("Abdominal pain" in dict_dist_patch_80_sym.keys())
print("Eye pain" in dict_dist_patch_80_sym.keys())
print("Common cold" in dict_dist_patch_80_sym.keys())

128
False
False
False


In [55]:
print(list(dict_dist_patch_80_sym.items())[::20])

[('Inflammation', ['Inflammation']), ('Burn', ['Burn']), ('Encephalitis', ['Encephalitis']), ('Infection', ['Infection']), ('Tinnitus', ['Tinnitus']), ('Chancre', ['Chancre']), ('Pyelonephritis', ['Pyelonephritis'])]


After more cautious threshold changes, 5.1 seems the best so far:

In [53]:
dict_dist_patch_51 = generate_dict_patch(df_dist_2, 5.1)
dict_patch_51 = generate_dict_match(dict_dist_patch_51)
dict_dist_patch_51_sym = create_dict_sym(dict_patch_51)
print(len(dict_dist_patch_51_sym))
print("Abdominal pain" in dict_dist_patch_51_sym.keys())
print("Eye pain" in dict_dist_patch_51_sym.keys())
print("Common cold" in dict_dist_patch_51_sym.keys())

186
True
True
False


In [56]:
print(list(dict_dist_patch_51_sym.items())[::20])

[('Acne', ['Acne']), ('Burn', ['Burn']), ('Depression', ['Depression']), ('Gout', ['Gout']), ('Infection', ['Infection']), ('Pericarditis', ['Pericarditis']), ('Tachycardia', ['Tachycardia']), ('Neck pain', ['Neck', 'Pain', 'Cold sore']), ('Pelvic inflammatory disease', ['Digestive disease', 'Skin diseases']), ('Thyroid nodule', ['Thyroid'])]


In [57]:
print(dict_dist_patch_51_sym["Abdominal pain"])
print(dict_dist_patch_51_sym["Eye pain"])

['Pain']
['Cosmetic (Grey hair)', 'Ear drop', 'Evil eye', 'Eye', 'Eye drop', 'Pain', 'Skin diseases', 'Cold sore']


### Closing thoughts:

In any case, as we can see above, the results are not particularly great so there are probably issues with our initial approach to creating or processing the second set of embeddings.

We should proceed with _Embeddings1_ values for now and see if we can pinpoint the issues in the second set of vectors.