In [48]:
import pandas as pd
import numpy as np

### NaturDoc - TL BL WT 22-23

# Data clustering:

## Groundwork: Approaches

What approaches are possible? And while waiting for the input data to be generated, what data can we use to already test these approaches?

### Loading Embeddings Data:


In [49]:
symptoms_embeddings = pd.read_csv("../data/embeddings/word_embeddings_dataframe.csv")

The dataframe contains three columns: the symptom name (from the Duke dataset and the Google symptom data) and one column for each of the two embedding models, the first being `all-MiniLM-L6-v2` and the second being `average_word_embeddings_glove.840B.300d`.

In [50]:
print(symptoms_embeddings.shape)
symptoms_embeddings.head(2)

(2404, 3)


Unnamed: 0,Symptom,Embedding1,Embedding2
0,Abcess,[-9.81967244e-03 1.01662287e-02 3.75229940e-...,[ 2.1690e-02 -1.8056e-01 -8.5585e-02 -5.6702e-...
1,Abdomen,[ 5.98840415e-02 1.64022837e-02 -4.90665212e-...,[-0.73936 -0.18636 0.59149 0.47356 ...


Extracting a dictionary matching index to symptom name:

In [51]:
dict_symptom = symptoms_embeddings["Symptom"].to_dict()
dict_symptom

{0: 'Abcess',
 1: 'Abdomen',
 2: 'Abortifacient',
 3: 'Abortive',
 4: 'Abrasion',
 5: 'Abscess',
 6: 'Abscess(Breast)',
 7: 'Absorbent',
 8: 'Acaricide',
 9: 'Ache',
 10: 'Ache(Arm)',
 11: 'Ache(Back)',
 12: 'Ache(Body)',
 13: 'Ache(Ear)',
 14: 'Ache(Foot)',
 15: 'Ache(Head)',
 16: 'Ache(Leg)',
 17: 'Ache(Limb)',
 18: 'Ache(Loin)',
 19: 'Ache(Rib)',
 20: 'Ache(Side)',
 21: 'Ache(Stomach)',
 22: 'Achlorhydria',
 23: 'Acne',
 24: 'Acrid',
 25: 'Actinomycosis',
 26: "Addison's-Disease",
 27: 'Adenopathy',
 28: 'Adrenocortical',
 29: 'Adulterant',
 30: 'Aerophagia',
 31: 'Afterbirth',
 32: 'Aftosa',
 33: 'Agility',
 34: 'Ague',
 35: 'Airwick',
 36: 'Alactia',
 37: 'Albuminuria',
 38: 'Alcoholism',
 39: 'Alexipharmic',
 40: 'Alexiteric',
 41: 'Algicide',
 42: 'Allergenic',
 43: 'Allergy',
 44: 'Alopecia',
 45: 'Alterative',
 46: 'Amaurosis',
 47: 'Amblyopia',
 48: 'Amebiasis',
 49: 'Amenorrhea',
 50: 'Amnesia',
 51: 'Amygdalitis',
 52: 'Amygdalosis',
 53: 'Anal-Eversion',
 54: 'Analeptic',


#### Transforming:

Reading from the csv, it is no longer a proper list but instead a string containing extra characters:

In [52]:
symptoms_embeddings.loc[0, "Embedding1"][:100]

'[-9.81967244e-03  1.01662287e-02  3.75229940e-02  1.75703913e-02\n -1.11436069e-01  3.83325890e-02  1'

In [53]:
type(symptoms_embeddings.loc[1, "Embedding1"])

str

In [54]:
test_list_1 = symptoms_embeddings.loc[0, "Embedding1"].replace("\n", "").replace("[", "").replace("]", "").split(" ")
test_list_2 = symptoms_embeddings.loc[0, "Embedding2"].replace("\n", "").replace("[", "").replace("]", "").split(" ")

In [55]:
test_list_1[:5]

['-9.81967244e-03', '', '1.01662287e-02', '', '3.75229940e-02']

Removing all empty strings:

In [56]:
test_list_1 = [x for x in test_list_1 if x]
test_list_2 = [x for x in test_list_2 if x]
test_list_1[:5]

['-9.81967244e-03',
 '1.01662287e-02',
 '3.75229940e-02',
 '1.75703913e-02',
 '-1.11436069e-01']

In [57]:
len(test_list_1)

384

In [58]:
len(test_list_2)

300

### Creating useable dataframes:

#### Embedding 1 column:

First, transform content of rows from strings to lists:

In [59]:
# Brackets are removed as literal text (regex=False): a lone "[" is not a valid
# regular expression and raises in pandas versions that no longer special-case
# single-character patterns.
symptoms_embeddings.loc[:, "Embedding1"].str.replace("[", "", regex=False).str.replace("]", "", regex=False).str.split(" ")[1]

['',
 '5.98840415e-02',
 '',
 '1.64022837e-02',
 '-4.90665212e-02',
 '',
 '4.81191762e-02\n',
 '-9.69780684e-02',
 '-1.16978601e-01',
 '',
 '1.07039817e-01',
 '',
 '2.18950473e-02\n',
 '',
 '4.59282361e-02',
 '-6.05028607e-02',
 '',
 '2.40573995e-02',
 '-1.45412553e-02\n',
 '',
 '1.28115609e-03',
 '',
 '1.13097858e-02',
 '-9.77337454e-03',
 '-5.58230355e-02\n',
 '',
 '5.63496165e-02',
 '-8.33319798e-02',
 '-2.63765566e-02',
 '',
 '2.35134549e-02\n',
 '-7.78171644e-02',
 '',
 '4.52542566e-02',
 '-8.62413377e-04',
 '',
 '1.06803086e-02\n',
 '-4.85648867e-03',
 '-4.36314084e-02',
 '',
 '2.85004098e-02',
 '-8.02691560e-03\n',
 '',
 '1.19372457e-02',
 '-8.53983238e-02',
 '',
 '6.69634296e-03',
 '-3.36318836e-02\n',
 '',
 '4.77311388e-02',
 '',
 '1.43900368e-04',
 '',
 '2.62411013e-02',
 '-6.15936443e-02\n',
 '-2.96628335e-03',
 '',
 '5.19950949e-02',
 '',
 '2.96347905e-02',
 '-1.06338166e-01\n',
 '-3.13502252e-02',
 '-6.62523285e-02',
 '-8.64137523e-03',
 '',
 '5.17196581e-03\n',
 '',
 '4.5

In [60]:
def listify_df_values(df_series: pd.Series) -> pd.Series:
    """Turn stringified arrays such as ``"[ 1.0e-02 -3.5e-01\\n  2.0e+00]"`` into token lists.

    Parameters
    ----------
    df_series : pd.Series
        Series of strings, each holding whitespace-separated numbers wrapped in
        square brackets (the form the embeddings take after a CSV round trip).

    Returns
    -------
    pd.Series
        Series of lists of strings, one string per number. Tokens are still
        strings; converting them to floats is left to the caller. Missing
        values in the input stay missing.
    """
    # regex=False: "[" on its own is an invalid regular expression, so the
    # brackets must be removed as literal characters.
    without_brackets = (
        df_series.str.replace("[", "", regex=False).str.replace("]", "", regex=False)
    )
    # Splitting without a pattern splits on any run of whitespace (spaces and
    # line breaks alike), so no empty tokens are produced and numbers that are
    # only separated by a line break are not fused together.
    return without_brackets.str.split()

In [61]:
embeddings1_series = listify_df_values(symptoms_embeddings.loc[:, "Embedding1"])
# symptoms_embeddings.loc[:, "Embedding1"] = symptoms_embeddings.loc[:, "Embedding1"]

In [62]:
embeddings1_series = embeddings1_series.apply(lambda row: [val for val in row if val])

https://stackoverflow.com/questions/67442107/pandas-expand-explode-dataframe-horizontally

In [63]:
embeddings1_df = pd.DataFrame(embeddings1_series)
embeddings1_df

Unnamed: 0,Embedding1
0,"[-9.81967244e-03, 1.01662287e-02, 3.75229940e-..."
1,"[5.98840415e-02, 1.64022837e-02, -4.90665212e-..."
2,"[6.30832557e-03, 6.94514960e-02, 9.17118881e-0..."
3,"[-1.41132241e-02, 7.76526034e-02, -8.35783686e..."
4,"[-7.86128864e-02, -2.58876905e-02, 3.46109122e..."
...,...
2399,"[-1.92209315e-02, 3.93610820e-02, -8.05331487e..."
2400,"[2.64672562e-02, -4.77555906e-03, -3.02140005e..."
2401,"[2.54991353e-02, 1.67390481e-02, 5.45178875e-0..."
2402,"[-4.97031994e-02, 1.04058813e-02, 1.57771539e-..."


In [64]:
# Expand every list into its own set of columns (Embedding1_0, Embedding1_1, ...).
# The frame is built with a single constructor call per column instead of
# `.apply(pd.Series)`, which creates one Series object per row and is far slower.
embeddings1_df = pd.concat(
    [
        pd.DataFrame(embeddings1_df[c].tolist(), index=embeddings1_df.index).add_prefix(c + "_")
        for c in embeddings1_df
    ],
    axis=1,
)

embeddings1_df


Unnamed: 0,Embedding1_0,Embedding1_1,Embedding1_2,Embedding1_3,Embedding1_4,Embedding1_5,Embedding1_6,Embedding1_7,Embedding1_8,Embedding1_9,...,Embedding1_374,Embedding1_375,Embedding1_376,Embedding1_377,Embedding1_378,Embedding1_379,Embedding1_380,Embedding1_381,Embedding1_382,Embedding1_383
0,-9.81967244e-03,1.01662287e-02,3.75229940e-02,1.75703913e-02,-1.11436069e-01,3.83325890e-02,1.48906738e-01,4.44466770e-02,5.77533916e-02,-1.21526700e-02,...,6.11344092e-02,1.98782869e-02,1.33477971e-02,3.86779606e-02,-4.79677059e-02,3.42200510e-02,4.26308662e-02,3.78118306e-02,6.95859119e-02,-4.20008637e-02
1,5.98840415e-02,1.64022837e-02,-4.90665212e-02,4.81191762e-02,-9.69780684e-02,-1.16978601e-01,1.07039817e-01,2.18950473e-02,4.59282361e-02,-6.05028607e-02,...,4.86743562e-02,1.04232021e-02,1.38152717e-02,-5.32790925e-03,-1.77161284e-02,1.04324900e-01,9.65044126e-02,7.19451010e-02,1.72711313e-02,6.24693604e-03
2,6.30832557e-03,6.94514960e-02,9.17118881e-03,-4.25593607e-04,3.68529968e-02,2.88750455e-02,9.93606523e-02,1.99077209e-03,3.11414283e-02,3.83325480e-02,...,2.25946605e-02,-3.91616262e-02,1.23729361e-02,-2.83677857e-02,-8.51575360e-02,7.25132674e-02,6.53430074e-02,2.26758630e-03,6.07209243e-02,-2.46002264e-02
3,-1.41132241e-02,7.76526034e-02,-8.35783686e-03,2.37053819e-02,5.61783165e-02,3.36992592e-02,1.19458653e-01,-2.01092865e-02,3.62723432e-02,4.82863858e-02,...,6.69673532e-02,1.13059739e-02,-1.16295973e-02,-2.42045093e-02,-5.78260906e-02,3.89332138e-02,1.18804961e-01,-2.96259206e-02,3.69524844e-02,-9.53654386e-03
4,-7.86128864e-02,-2.58876905e-02,3.46109122e-02,5.58277592e-02,-3.87978852e-02,-5.56877032e-02,1.44394651e-01,2.46080924e-02,-7.19921589e-02,-4.99793142e-02,...,2.83698123e-02,-3.49769071e-02,-2.14673597e-02,1.45020243e-02,5.76726533e-02,3.32759731e-02,1.09838024e-01,-7.57560134e-02,2.23050658e-02,-4.70947437e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399,-1.92209315e-02,3.93610820e-02,-8.05331487e-03,-1.88839864e-02,1.07821608e-02,-7.60535970e-02,1.06595382e-01,5.58597855e-02,-2.49612960e-04,-1.84704605e-02,...,2.31999531e-02,5.51964119e-02,-1.61917768e-02,6.19558468e-02,-2.18066014e-02,1.71804782e-02,4.90004942e-02,-4.42138314e-02,5.27935810e-02,1.88747607e-02
2400,2.64672562e-02,-4.77555906e-03,-3.02140005e-02,-2.61425432e-02,-3.36900353e-02,-1.13033682e-01,8.00898746e-02,5.02843335e-02,-1.70364957e-02,1.48580121e-02,...,-1.41694313e-02,6.69747815e-02,6.27062470e-02,1.41177222e-01,-7.76808560e-02,6.27785251e-02,8.16981718e-02,-3.36580649e-02,4.98927645e-02,9.57844872e-03
2401,2.54991353e-02,1.67390481e-02,5.45178875e-02,-1.12884091e-02,2.85636671e-02,-5.56535311e-02,5.83849624e-02,-4.70371544e-03,9.25000161e-02,-4.64275433e-03,...,2.22304687e-02,3.83732654e-02,-2.88097896e-02,-6.03646785e-03,-7.98023343e-02,-1.62580740e-02,1.73452757e-02,-5.45231067e-02,1.05497010e-01,-2.00341363e-02
2402,-4.97031994e-02,1.04058813e-02,1.57771539e-02,7.78019577e-02,4.29792143e-03,1.63108837e-02,7.65634999e-02,1.36114378e-02,-5.57998866e-02,2.25542802e-02,...,-9.64850485e-02,9.33009025e-04,5.58457756e-03,3.17276493e-02,1.84024610e-02,-2.05374248e-02,4.81370240e-02,-7.93446414e-03,-1.34081161e-02,1.22985832e-01


Wrong data type:

In [65]:
embeddings1_df.loc[0, "Embedding1_0"]

'-9.81967244e-03'

In [66]:
type(embeddings1_df.loc[0, "Embedding1_0"])

str

In [67]:
embeddings1_df = embeddings1_df.apply(pd.to_numeric, errors='coerce')

In [68]:
type(embeddings1_df.loc[0, "Embedding1_0"])

numpy.float64

#### Embedding 2 column:

Repeat the above steps for the Embedding_2 column:

Creating the basic dataframe:

In [169]:
embeddings2_series = listify_df_values(symptoms_embeddings.loc[:, "Embedding2"])
embeddings2_series = embeddings2_series.apply(lambda row: [val for val in row if val])

embeddings2_df = pd.DataFrame(embeddings2_series)
embeddings2_df.head()

Unnamed: 0,Embedding2
0,"[2.1690e-02, -1.8056e-01, -8.5585e-02, -5.6702..."
1,"[-0.73936, -0.18636, 0.59149, 0.47356, 0.59297..."
2,"[0.58928, 0.24762, 0.5015, -0.31308, -0.029607..."
3,"[8.2946e-02, 1.6964e-01, -2.1112e-01, 2.1073e-..."
4,"[-3.7954e-01, 4.4132e-01, 3.6332e-02, 2.2410e-..."


Exploding the lists of values into their own columns so that every cell only contains a single value:

In [170]:
# Expand every list into its own set of columns (Embedding2_0, Embedding2_1, ...).
# The frame is built with a single constructor call per column instead of
# `.apply(pd.Series)`, which creates one Series object per row and is far slower.
embeddings2_df = pd.concat(
    [
        pd.DataFrame(embeddings2_df[c].tolist(), index=embeddings2_df.index).add_prefix(c + "_")
        for c in embeddings2_df
    ],
    axis=1,
)

embeddings2_df.head()

Unnamed: 0,Embedding2_0,Embedding2_1,Embedding2_2,Embedding2_3,Embedding2_4,Embedding2_5,Embedding2_6,Embedding2_7,Embedding2_8,Embedding2_9,...,Embedding2_290,Embedding2_291,Embedding2_292,Embedding2_293,Embedding2_294,Embedding2_295,Embedding2_296,Embedding2_297,Embedding2_298,Embedding2_299
0,0.02169,-0.18056,-0.085585,-0.56702,-0.37991,0.74952,0.27161,-0.20359,0.28772,-1.4985,...,0.7687,-0.57498,-0.10212,-0.0557,-0.45765,-0.26548,0.19396,0.38276,-0.015735,-0.036918
1,-0.73936,-0.18636,0.59149,0.47356,0.59297,-0.22319,0.066332,0.35977,0.063273,-1.5661,...,0.78603,0.54811,0.23896,-0.42036,-0.085291,0.64376,0.54307,0.42253,0.61038,-0.75482
2,0.58928,0.24762,0.5015,-0.31308,-0.029607,0.39451,-0.22913,0.57697,-0.76873,-1.3676,...,-0.42955,-0.14359,0.16626,0.3584,-0.10825,-0.1961,-0.15036,0.13764,-0.41586,-0.72983
3,0.082946,0.16964,-0.21112,0.21073,-0.0094237,0.34631,-0.25166,0.18472,-0.33269,-1.6627,...,0.33283,-0.15003,0.54558,-0.023841,-0.48079,0.51326,-0.2866,0.041394,-0.066671,-0.3077
4,-0.37954,0.44132,0.036332,0.2241,0.087512,-0.41484,-0.0060271,0.098966,-0.11458,-1.7897,...,0.10686,0.27241,-0.31783,0.13302,-0.17751,0.74856,0.36981,0.35658,0.13955,-0.54288


Converting the cell values to floats:

In [171]:
embeddings2_df = embeddings2_df.apply(pd.to_numeric, errors='coerce')
type(embeddings2_df.loc[0, "Embedding2_0"])

numpy.float64

## Approach 1:

### Creating the Distance Matrix:

In [69]:
# importing the library
from scipy.spatial import distance_matrix

In [72]:
import math

def generate_distance_matrix(df : pd.DataFrame,
                distance_metric : str = "euclidean") -> pd.DataFrame: # 2.5k x 2.5k
    """Compute the pairwise Minkowski distances between all rows of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric frame; every row is one point, every column one coordinate.
    distance_metric : str, default "euclidean"
        One of ``"manhattan"`` (p=1), ``"euclidean"`` (p=2) or ``"chebyshev"``
        (p=inf). The spelling ``"chebychev"`` is still accepted. Any other
        value falls back to the Euclidean distance and emits a warning.

    Returns
    -------
    pd.DataFrame
        Square frame with one row and one column per row of ``df``. Rows and
        columns are labelled by position (0..n-1), not by ``df``'s index.
    """
    import warnings

    minkowski_orders = {
        "manhattan": 1,
        "euclidean": 2,
        "chebyshev": float("inf"),
        # Original (misspelled) key, kept so existing calls keep working.
        "chebychev": float("inf"),
    }
    if distance_metric in minkowski_orders:
        p = minkowski_orders[distance_metric]
    else:
        # Keep the previous fallback, but no longer silently: a typo in the
        # metric name would otherwise yield Euclidean distances unnoticed.
        warnings.warn(
            f"Unknown distance_metric {distance_metric!r}; falling back to 'euclidean'.",
            stacklevel=2,
        )
        p = 2
    dis_matrix = distance_matrix(df.values, df.values, p)
    return pd.DataFrame(dis_matrix)


In [176]:
df_dist_1 = generate_distance_matrix(embeddings1_df)
df_dist_1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2394,2395,2396,2397,2398,2399,2400,2401,2402,2403
0,0.000000,1.172835,1.230569,1.202147,1.068863,1.102719,1.213488,1.227862,1.129428,1.118084,...,1.387558,1.259721,1.314804,1.312847,1.368299,1.229228,1.269921,1.238341,1.326879,1.321452
1,1.172835,0.000000,1.308356,1.177216,1.158076,0.999987,1.074094,1.098266,1.247356,1.068514,...,1.311162,1.283786,1.350499,1.214172,1.252881,1.200630,1.159814,1.350562,1.248203,1.022531
2,1.230569,1.308356,0.000000,0.662869,1.238897,1.226645,1.240566,1.260014,1.054402,1.326170,...,1.361361,1.321046,1.303957,1.323953,1.452224,1.300060,1.313569,1.299467,1.382976,1.290404
3,1.202147,1.177216,0.662869,0.000000,1.123468,1.144537,1.210960,1.234160,1.030969,1.265411,...,1.385154,1.344453,1.356146,1.278151,1.396429,1.251390,1.286678,1.263937,1.389592,1.217563
4,1.068863,1.158076,1.238897,1.123468,0.000000,1.006659,1.118275,1.032273,1.103120,1.115403,...,1.402944,1.334073,1.277256,1.159948,1.333937,1.217910,1.272939,1.227224,1.202582,1.270807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399,1.229228,1.200630,1.300060,1.251390,1.217910,1.257878,1.302735,1.295551,1.209389,1.270752,...,1.276127,1.325131,1.325960,1.332267,1.309440,0.000000,0.803892,1.283610,1.181490,1.231146
2400,1.269921,1.159814,1.313569,1.286678,1.272939,1.285842,1.289375,1.324865,1.274088,1.307315,...,1.255466,1.445945,1.235505,1.316208,1.296076,0.803892,0.000000,1.284518,1.180191,1.259903
2401,1.238341,1.350562,1.299467,1.263937,1.227224,1.362247,1.375149,1.326699,1.266961,1.191208,...,1.370138,1.345567,1.320906,1.285295,1.383062,1.283610,1.284518,0.000000,1.357254,1.457786
2402,1.326879,1.248203,1.382976,1.389592,1.202582,1.206832,1.273774,1.289590,1.201103,1.269518,...,1.348097,1.274635,1.316166,1.246534,1.354974,1.181490,1.180191,1.357254,0.000000,1.275877


##### Notes/Questions:

How does it treat the various columns?

As we can see, the initial DF contains columns Embedding1_0 to Embedding1_383:

### Generating the Final Dictionary:

#### A Dictionary of All Symptoms and Activities and their Related Terms:

Pseudocode reminder:

    def generate_dict(df_dist : pd.DataFrame,
                    threshold : float) -> Dict[str: List[str]]:

##### A Look at the Current Values:

To get an idea of what kind of threshold makes sense, let's have a look at what the values for a single column look like:

In [177]:
df_dist_1.loc[:, 0].max()

1.4425400157357833

In [75]:
df_dist_1.loc[:, 0].min()

0.0

In [76]:
df_dist_1.loc[:, 0].mean()

1.2302958968753164

In [77]:
df_dist_1.loc[:, 0].median()

1.2282502265314303

In [78]:
df_dist_1.loc[1:, 0].min()

1.033265852919092

A look at min and max values across the entire dataframe: 

In [185]:
print(df_dist_1.max())
df_dist_1.max().max()

0       1.442540
1       1.432538
2       1.485387
3       1.454778
4       1.413604
          ...   
2399    1.427753
2400    1.445945
2401    1.468315
2402    1.475749
2403    1.503820
Length: 2404, dtype: float64


1.541120350374177

In [184]:
print(df_dist_1.copy().replace(0.0, 10).min())
df_dist_1.copy().replace(0.0, 10).min().min()

0       1.033266
1       0.572948
2       0.608119
3       0.388080
4       0.964157
          ...   
2399    0.803892
2400    0.803892
2401    0.897608
2402    0.883082
2403    0.558493
Length: 2404, dtype: float64


0.1283884804575315

For column 0 of the distance matrix, the maximum value is about 1.44254 and the minimum is 0.0. A distance of 0.0 indicates identical entries; in this distance matrix it occurs where row and column refer to the same initial data entry (so at (0,0), (1,1), (2,2) and so on). The mean lies at around 1.23030, with the median a bit lower at about 1.22825.

Looking at the minimum distance that does not compare the entry with itself (so in column 0, from row 1 onwards), we have about 1.03327 as a minimum value. For this entry to be matched with anything other than itself, the threshold would therefore have to lie above this value.

In [80]:
def generate_dict(df_dist : pd.DataFrame,
                threshold : float) -> dict:
    """Collect, per column, the entries whose distance does not exceed ``threshold``.

    Parameters
    ----------
    df_dist : pd.DataFrame
        Distance matrix; it is not modified.
    threshold : float
        Largest distance that still counts as a match (inclusive).

    Returns
    -------
    dict
        ``{column label: {row label: distance}}`` with only the distances
        ``<= threshold``. Self-distances (0.0 on the diagonal) are kept, and a
        column without any match maps to an empty dict.
    """
    within_threshold = {}
    for column_label, distances in df_dist.items():
        # Comparisons with NaN are False, so missing distances are dropped as well.
        close_enough = distances[distances <= threshold]
        within_threshold[column_label] = close_enough.to_dict()
    return within_threshold

In [81]:
dict_dist = generate_dict(df_dist_1, 0.5)

Taking a look at the first three dictionary entries:

In [82]:
print({k: dict_dist[k] for k in list(dict_dist)[:3]})

{0: {0: 0.0}, 1: {1: 0.0}, 2: {2: 0.0}}


In [83]:
dict_dist[0]

{0: 0.0}

In [84]:
dict_symptom[0]

'Abcess'

In [85]:
def generate_dict_match(dict_dist: dict, symptom_names: "dict | None" = None) -> dict:
    """Translate an index-based match dictionary into one keyed by symptom name.

    Parameters
    ----------
    dict_dist : dict
        ``{index: {matching index: distance}}`` as returned by ``generate_dict``.
    symptom_names : dict, optional
        Lookup from index to symptom name. Defaults to the notebook-level
        ``dict_symptom``.

    Returns
    -------
    dict
        ``{symptom name: [names of all matching entries]}``. Entries without
        any match are left out; entries sharing the same name are merged into
        one list.
    """
    if symptom_names is None:
        # Fall back to the global index -> name lookup defined earlier in the notebook.
        symptom_names = dict_symptom

    dict_match = dict()
    for key, value in dict_dist.items():
        for sub_key in value:
            # Append in place instead of rebuilding the whole list for every match.
            dict_match.setdefault(symptom_names[key], []).append(symptom_names[sub_key])

    return dict_match

Threshold 0.5:

Much too restrictive.

In [86]:
dict_dist = generate_dict(df_dist_1, 0.5)
generate_dict_match(dict_dist)

{'Abcess': ['Abcess'],
 'Abdomen': ['Abdomen'],
 'Abortifacient': ['Abortifacient'],
 'Abortive': ['Abortive', 'Abortive?'],
 'Abrasion': ['Abrasion'],
 'Abscess': ['Abscess'],
 'Abscess(Breast)': ['Abscess(Breast)'],
 'Absorbent': ['Absorbent'],
 'Acaricide': ['Acaricide'],
 'Ache': ['Ache'],
 'Ache(Arm)': ['Ache(Arm)'],
 'Ache(Back)': ['Ache(Back)'],
 'Ache(Body)': ['Ache(Body)'],
 'Ache(Ear)': ['Ache(Ear)'],
 'Ache(Foot)': ['Ache(Foot)'],
 'Ache(Head)': ['Ache(Head)'],
 'Ache(Leg)': ['Ache(Leg)'],
 'Ache(Limb)': ['Ache(Limb)'],
 'Ache(Loin)': ['Ache(Loin)'],
 'Ache(Rib)': ['Ache(Rib)'],
 'Ache(Side)': ['Ache(Side)'],
 'Ache(Stomach)': ['Ache(Stomach)'],
 'Achlorhydria': ['Achlorhydria'],
 'Acne': ['Acne'],
 'Acrid': ['Acrid'],
 'Actinomycosis': ['Actinomycosis'],
 "Addison's-Disease": ["Addison's-Disease"],
 'Adenopathy': ['Adenopathy'],
 'Adrenocortical': ['Adrenocortical'],
 'Adulterant': ['Adulterant'],
 'Aerophagia': ['Aerophagia'],
 'Afterbirth': ['Afterbirth'],
 'Aftosa': ['Af

Threshold 0.8:

This threshold already seems to provide really good results, albeit maybe a bit too narrow.

In [87]:
dict_dist = generate_dict(df_dist_1, 0.8)
generate_dict_match(dict_dist)

{'Abcess': ['Abcess'],
 'Abdomen': ['Abdomen',
  'Cancer(Abdomen)',
  'Stomachic',
  'Stomach',
  'Tumor(Abdomen)',
  'Abdominal pain'],
 'Abortifacient': ['Abortifacient',
  'Abortive',
  'Antiabortifacient',
  'Preventitive(Abortifacient)',
  'Abortive?'],
 'Abortive': ['Abortifacient', 'Abortive', 'Abortive?'],
 'Abrasion': ['Abrasion'],
 'Abscess': ['Abscess', 'Abscess(Breast)', 'Preventitive(Abscess)'],
 'Abscess(Breast)': ['Abscess', 'Abscess(Breast)'],
 'Absorbent': ['Absorbent'],
 'Acaricide': ['Acaricide', 'Homocide', 'Apicide', 'Ovocide'],
 'Ache': ['Ache',
  'Ache(Arm)',
  'Ache(Back)',
  'Ache(Body)',
  'Ache(Ear)',
  'Ache(Foot)',
  'Ache(Head)',
  'Ache(Leg)',
  'Ache(Limb)',
  'Ache(Loin)',
  'Ache(Rib)',
  'Ache(Side)',
  'Ache(Stomach)',
  'Ache(Bones)',
  'Ache(Chest)',
  'Pain',
  'Ache(Tooth)',
  'Ache(Neck)',
  'Ache(Eye)'],
 'Ache(Arm)': ['Ache',
  'Ache(Arm)',
  'Ache(Back)',
  'Ache(Body)',
  'Ache(Ear)',
  'Ache(Foot)',
  'Ache(Head)',
  'Ache(Leg)',
  'Ache(Li

Threshold 1.0:

A threshold of 1.0 also provides good results, although in this case the scope might be a little bit too broad. This could be a great threshold for a future functionality proposing remedies for related symptoms.

In [88]:
dict_dist = generate_dict(df_dist_1, 1.0)
generate_dict_match(dict_dist)

{'Abcess': ['Abcess'],
 'Abdomen': ['Abdomen',
  'Abscess',
  'Ache(Stomach)',
  'Appendicitis',
  'Bile',
  'Bowel',
  'Cancer(Abdomen)',
  'Cancer(Stomach)',
  'Digestion',
  'Digestive',
  'Duodenum',
  'Gall Bladder',
  'Gall-Bladder',
  'Gallbladder',
  'Gallstones',
  'Gastritis',
  'Gastrodynia',
  'Gastrointestinal',
  'Groin',
  'Hepatic',
  'Hernia',
  'Intestinal',
  'Intestinal-Troubles',
  'Intestine',
  'Liver',
  'Lung',
  'Navel',
  'Pelvis',
  'Peritonitis',
  'Polyp(Abdomen)',
  'Polyp(Stomach)',
  'Sclerosis(Abdomen)',
  'Sclerosis(Belly)',
  'Sclerosis(Stomach)',
  'Scrotum',
  'Stomachic',
  'Stomach',
  'Thigh',
  'Tumor(Abdomen)',
  'Tumor(Groin)',
  'Tumor(Intestine)',
  'Tumor(Stomach)',
  'Uterine',
  'Uterine Organs',
  'Uterus',
  'Waist',
  'Womb',
  'Gastric',
  'Tumor(Digestive)',
  'Gastric-Ulcer',
  'Tumor(Belly)',
  'Vomiting',
  'Muscle(Uterus)',
  'Abdominal obesity',
  'Abdominal pain',
  'Stomach rumble'],
 'Abortifacient': ['Abortifacient',
  'Abo

Taking a closer look at "Abdomen" in particular, the threshold does in fact appear to be a bit too open.

In [91]:
print(generate_dict_match(dict_dist)["Abdomen"])

['Abdomen', 'Abscess', 'Ache(Stomach)', 'Appendicitis', 'Bile', 'Bowel', 'Cancer(Abdomen)', 'Cancer(Stomach)', 'Digestion', 'Digestive', 'Duodenum', 'Gall Bladder', 'Gall-Bladder', 'Gallbladder', 'Gallstones', 'Gastritis', 'Gastrodynia', 'Gastrointestinal', 'Groin', 'Hepatic', 'Hernia', 'Intestinal', 'Intestinal-Troubles', 'Intestine', 'Liver', 'Lung', 'Navel', 'Pelvis', 'Peritonitis', 'Polyp(Abdomen)', 'Polyp(Stomach)', 'Sclerosis(Abdomen)', 'Sclerosis(Belly)', 'Sclerosis(Stomach)', 'Scrotum', 'Stomachic', 'Stomach', 'Thigh', 'Tumor(Abdomen)', 'Tumor(Groin)', 'Tumor(Intestine)', 'Tumor(Stomach)', 'Uterine', 'Uterine Organs', 'Uterus', 'Waist', 'Womb', 'Gastric', 'Tumor(Digestive)', 'Gastric-Ulcer', 'Tumor(Belly)', 'Vomiting', 'Muscle(Uterus)', 'Abdominal obesity', 'Abdominal pain', 'Stomach rumble']


Threshold 0.9:

A threshold of 0.9 also provides good results: the scope is narrower than with 1.0 but still broader than with 0.8. As noted above, the broader threshold of 1.0 could be a great threshold for a future functionality proposing remedies for related symptoms.

In [96]:
dict_dist = generate_dict(df_dist_1, 0.9)
print(generate_dict_match(dict_dist)["Abdomen"])
print(generate_dict_match(dict_dist)["Cold"])
print(generate_dict_match(dict_dist)["Abdominal pain"])
print(generate_dict_match(dict_dist)["Xerostomia"])

['Abdomen', 'Ache(Stomach)', 'Cancer(Abdomen)', 'Cancer(Stomach)', 'Hernia', 'Intestinal', 'Navel', 'Pelvis', 'Polyp(Abdomen)', 'Sclerosis(Abdomen)', 'Stomachic', 'Stomach', 'Tumor(Abdomen)', 'Tumor(Stomach)', 'Uterus', 'Gastric', 'Tumor(Belly)', 'Abdominal obesity', 'Abdominal pain', 'Stomach rumble']
['Chest-Cold', 'Cold', 'Cold(Head)', 'Heat', 'Cold Feet', 'Cold sore', 'Common cold', 'Shivering']
['Abdomen', 'Ache(Stomach)', 'Cancer(Abdomen)', 'Stomach', 'Tumor(Abdomen)', 'Abdominal obesity', 'Abdominal pain']
['Xeroderma', 'Xerostomia']


__Visualise/analyse the amount of matches? How many symptoms are there (esp. from the 422 symptoms that we will provide to web dev) that only have themselves as a match?__

#### A Dictionary of Symptoms Only and their Related Terms from the Duke Activities:

For the sake of the Naturedoc web application, our goal is to provide the users with a more intuitive range of symptoms. As previously discussed, the Duke Database's _Activity_ column contains not just symptoms but also other forms of use (such as _tea_, _spice_, _abortifacient_ etc.). Additionally, these terms are often formatted in a way that is different from how most users are expected to input their symptoms: e.g., various aches are always written as _Ache(Stomach)_ and so on.

For now, we will use the list of symptoms extracted from the Google Symptoms Database. Ideally, this set of symptoms should be further modified and enriched - with specific diseases, synonyms etc. for an optimised user experience. 

For the sake of Naturedoc's proof of concept, we will focus on the 422 symptoms as they exist in the google Database:

In [107]:
activities_symptoms_df = pd.read_csv("../output/activities_symptoms_bool.csv")
activities_symptoms_df.drop(columns="Unnamed: 0", inplace=True)
activities_symptoms_df.head()

Unnamed: 0,symptomName,is_symptom,is_activity
0,Abcess,0,1
1,Abdomen,0,1
2,Abortifacient,0,1
3,Abortive,0,1
4,Abrasion,0,1


This dataframe contains all symptoms and activities from both the Google Dataset and Duke's Database. The _is_symptom_ and _is_activity_ columns indicate which of these sources they originate from.

In [115]:
filt_sym = (activities_symptoms_df["is_symptom"] == 1)
filt_sym_df = activities_symptoms_df[filt_sym]
filt_sym_list = filt_sym_df["symptomName"].values.tolist()

Some symptoms were matched to just themselves or other symptoms that only exist in the Google Dataset, but not Duke's Database.

These should be removed.

In [120]:
filt = (activities_symptoms_df["symptomName"] == "Xerostomia")
filt_df = activities_symptoms_df[filt]
filt_df

Unnamed: 0,symptomName,is_symptom,is_activity
2400,Xerostomia,1,0


In general, symptoms originating from the Google Dataset should probably be removed from the dictionary values, as they might cause issues when querying the database.

To eventually exclude not-activities from the dictionary, we also create a list of entries not in the activities:

In [118]:
filt_not_act = (activities_symptoms_df["is_activity"] == 0)
filt_not_act_df = activities_symptoms_df[filt_not_act]
filt_not_act_list = filt_not_act_df["symptomName"].values.tolist()

We generate the dictionary using a threshold of 0.9:

In [144]:
dict_dist_09 = generate_dict(df_dist_1, 0.9)
dict_09 = generate_dict_match(dict_dist_09)


Keys that exist in the list of Google Symptoms are added to a new dictionary, alongside their values:

In [145]:
dict_09_sym = dict()

for key in dict_09.keys():
    if key in filt_sym_list:
        dict_09_sym[key] = dict_09[key]

In [127]:
print(len(dict_09_sym))
dict_09_sym

422


{'Acne': ['Acne', 'Complexion', 'Pimple'],
 'Alcoholism': ['Alcoholism', 'Beer', 'Beverage', 'Drunkenness'],
 'Allergy': ['Allergenic', 'Allergy'],
 'Amblyopia': ['Amblyopia', 'Nyctalopia'],
 'Amenorrhea': ['Amenorrhea'],
 'Amnesia': ['Amnesia', 'Forgetfulness', 'Memory'],
 'Anemia': ['Anemia',
  'Hyperemia',
  'Toxemia',
  'Uremia',
  'Anosmia',
  'Iron deficiency'],
 'Anxiety': ['Anxiety',
  'Apprehension',
  'Fear',
  'Nervousness',
  'Generalized anxiety disorder',
  'Panic attack'],
 'Arthralgia': ['Arthralgia', 'Arthrodynia', 'Pharyngalgia', 'Arthrosis'],
 'Arthritis': ['Arthritis',
  'Arthritis?',
  'Heumatism',
  'Rheumatic',
  'Rheumatism',
  'Rheumatitis',
  'Knee pain'],
 'Ascites': ['Ascites'],
 'Asthma': ['Asthma',
  'Breathing',
  'Bronchial-Asthma',
  'Bronchitis',
  'Asthma (Ivy)',
  'Lung',
  'Respiratory',
  'Asthma (Hay)'],
 'Ataxia': ['Ataxia', 'Ataxia(Locomotor)'],
 'Atheroma': ['Atheroma', 'Atheromasia'],
 'Boil': ['Boil', 'Preventitive(Boil)'],
 'Bronchitis': ['A

Generate the dictionary while removing Google Symptoms from the values inside the dictionary:

In [153]:
def create_dict_sym(dict_dist, symptoms=None, non_activities=None):
    """Restrict a name-based match dictionary to symptom keys and activity values.

    Parameters
    ----------
    dict_dist : dict
        ``{name: [matching names]}`` as returned by ``generate_dict_match``.
    symptoms : iterable of str, optional
        Names that are allowed as keys. Defaults to the notebook-level
        ``filt_sym_list``.
    non_activities : iterable of str, optional
        Names that must not appear among the values. Defaults to the
        notebook-level ``filt_not_act_list``.

    Returns
    -------
    dict
        ``{symptom: [remaining matches]}`` in the original order. Symptoms
        whose matches are all filtered out are omitted.
    """
    # Sets make each membership test constant-time instead of scanning a list.
    allowed_keys = set(filt_sym_list if symptoms is None else symptoms)
    excluded_values = set(filt_not_act_list if non_activities is None else non_activities)

    dict_sym = dict()
    for sym, list_sym in dict_dist.items():
        if sym not in allowed_keys:
            continue
        kept = [sub_sym for sub_sym in list_sym if sub_sym not in excluded_values]
        # A key is only created once at least one match survives the filter.
        if kept:
            dict_sym[sym] = kept

    return dict_sym

In [154]:
dict_09_sym = create_dict_sym(dict_09)

In [155]:
print(len(dict_09_sym))
dict_09_sym

341


{'Acne': ['Acne', 'Complexion', 'Pimple'],
 'Alcoholism': ['Alcoholism', 'Beer', 'Beverage', 'Drunkenness'],
 'Allergy': ['Allergenic', 'Allergy'],
 'Amblyopia': ['Amblyopia', 'Nyctalopia'],
 'Amenorrhea': ['Amenorrhea'],
 'Amnesia': ['Amnesia', 'Forgetfulness', 'Memory'],
 'Anemia': ['Anemia', 'Hyperemia', 'Toxemia', 'Uremia'],
 'Anxiety': ['Anxiety', 'Apprehension', 'Fear', 'Nervousness'],
 'Arthralgia': ['Arthralgia', 'Arthrodynia', 'Pharyngalgia', 'Arthrosis'],
 'Arthritis': ['Arthritis',
  'Arthritis?',
  'Heumatism',
  'Rheumatic',
  'Rheumatism',
  'Rheumatitis'],
 'Ascites': ['Ascites'],
 'Asthma': ['Asthma',
  'Breathing',
  'Bronchial-Asthma',
  'Bronchitis',
  'Asthma (Ivy)',
  'Lung',
  'Respiratory',
  'Asthma (Hay)'],
 'Ataxia': ['Ataxia', 'Ataxia(Locomotor)'],
 'Atheroma': ['Atheroma', 'Atheromasia'],
 'Boil': ['Boil', 'Preventitive(Boil)'],
 'Bronchitis': ['Asthma',
  'Bronchial',
  'Bronchial-Asthma',
  'Bronchiectasis',
  'Bronchitis',
  'Bronchodilator',
  'Bronchosi

A few too many hits for cancers and tumors?

In [158]:
print(dict_09_sym["Abdominal pain"])
print(dict_09_sym["Eye pain"])

['Abdomen', 'Ache(Stomach)', 'Cancer(Abdomen)', 'Stomach', 'Tumor(Abdomen)']
['Cancer(Eye)', 'Eye', 'Eye drop', 'Eyeball', 'Eyelid', 'Pain', 'Sclerosis(Eyelid)', 'Sore(Eye)', 'Tumor(Eye)', 'Ache(Eye)']


In [157]:
dict_dist_085 = generate_dict(df_dist_1, 0.85)
dict_085 = generate_dict_match(dict_dist_085)
dict_085_sym = create_dict_sym(dict_085)
print(len(dict_085_sym))
dict_085_sym

306


{'Acne': ['Acne', 'Pimple'],
 'Alcoholism': ['Alcoholism', 'Beer', 'Beverage', 'Drunkenness'],
 'Allergy': ['Allergenic', 'Allergy'],
 'Amblyopia': ['Amblyopia'],
 'Amenorrhea': ['Amenorrhea'],
 'Amnesia': ['Amnesia', 'Forgetfulness'],
 'Anemia': ['Anemia', 'Hyperemia', 'Toxemia', 'Uremia'],
 'Anxiety': ['Anxiety', 'Fear', 'Nervousness'],
 'Arthralgia': ['Arthralgia', 'Arthrodynia'],
 'Arthritis': ['Arthritis',
  'Arthritis?',
  'Heumatism',
  'Rheumatism',
  'Rheumatitis'],
 'Ascites': ['Ascites'],
 'Asthma': ['Asthma', 'Bronchial-Asthma', 'Asthma (Ivy)', 'Asthma (Hay)'],
 'Ataxia': ['Ataxia', 'Ataxia(Locomotor)'],
 'Atheroma': ['Atheroma', 'Atheromasia'],
 'Boil': ['Boil', 'Preventitive(Boil)'],
 'Bronchitis': ['Bronchial',
  'Bronchial-Asthma',
  'Bronchiectasis',
  'Bronchitis',
  'Bronchodilator',
  'Bronchosis',
  'Lung',
  'Bronchorrhea'],
 'Bruise': ['Bruise'],
 'Bunion': ['Bunion'],
 'Burn': ['Burn', 'Burns'],
 'Cataract': ['Cataract'],
 'Chancre': ['Chancre', 'Chancroid'],
 '

This already removes some of the cancer/tumor matches - however, symptoms like "Common cold" also drop out:

In [165]:
print(dict_085_sym["Abdominal pain"])
print(dict_085_sym["Eye pain"])
print("Common cold" in dict_085_sym.keys())

['Abdomen', 'Ache(Stomach)', 'Cancer(Abdomen)']
['Eye', 'Eye drop', 'Sore(Eye)', 'Ache(Eye)']
False


A threshold of .86 looks like an appropriate compromise for now:

In [166]:
dict_dist_086 = generate_dict(df_dist_1, 0.86)
dict_086 = generate_dict_match(dict_dist_086)
dict_086_sym = create_dict_sym(dict_086)
print(len(dict_086_sym))
dict_086_sym

314


{'Acne': ['Acne', 'Pimple'],
 'Alcoholism': ['Alcoholism', 'Beer', 'Beverage', 'Drunkenness'],
 'Allergy': ['Allergenic', 'Allergy'],
 'Amblyopia': ['Amblyopia'],
 'Amenorrhea': ['Amenorrhea'],
 'Amnesia': ['Amnesia', 'Forgetfulness', 'Memory'],
 'Anemia': ['Anemia', 'Hyperemia', 'Toxemia', 'Uremia'],
 'Anxiety': ['Anxiety', 'Fear', 'Nervousness'],
 'Arthralgia': ['Arthralgia', 'Arthrodynia', 'Arthrosis'],
 'Arthritis': ['Arthritis',
  'Arthritis?',
  'Heumatism',
  'Rheumatism',
  'Rheumatitis'],
 'Ascites': ['Ascites'],
 'Asthma': ['Asthma',
  'Breathing',
  'Bronchial-Asthma',
  'Asthma (Ivy)',
  'Asthma (Hay)'],
 'Ataxia': ['Ataxia', 'Ataxia(Locomotor)'],
 'Atheroma': ['Atheroma', 'Atheromasia'],
 'Boil': ['Boil', 'Preventitive(Boil)'],
 'Bronchitis': ['Bronchial',
  'Bronchial-Asthma',
  'Bronchiectasis',
  'Bronchitis',
  'Bronchodilator',
  'Bronchosis',
  'Lung',
  'Bronchorrhea'],
 'Bruise': ['Bruise'],
 'Bunion': ['Bunion'],
 'Burn': ['Burn', 'Burns'],
 'Cataract': ['Cataract

In [167]:
print(dict_086_sym["Abdominal pain"])
print(dict_086_sym["Eye pain"])
print("Common cold" in dict_086_sym.keys())

['Abdomen', 'Ache(Stomach)', 'Cancer(Abdomen)']
['Eye', 'Eye drop', 'Sore(Eye)', 'Ache(Eye)']
True


Export as json:

In [309]:
import json

with open("../output/symptom_matches.json", "w") as outfile:
    json.dump(dict_086_sym, outfile)

### Creating the Dictionary for the Second Embedding Model:

The values of the second embedding model are not usable in their current state. Several of the symptoms end up being matched with a disproportionate number of other symptoms and activities. As can be seen in the example of a specific row's value counts below, the exact same value is present 830 times.

#### Distance Matrix:

In [310]:
df_dist_2 = generate_distance_matrix(embeddings2_df)

In [258]:
print("Value counts:", df_dist_2.loc[:, 1].value_counts())

Value counts: 7.240622     830
8.358644       3
9.079539       2
10.997180      2
10.270435      2
            ... 
9.093840       1
10.213916      1
11.291925      1
8.894402       1
10.246362      1
Name: 1, Length: 1554, dtype: int64


This issue was further explored in the notebook __data_clustering_emb2_bandaid__.