In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### NaturDoc - TL BL WT 22-23

# Data clustering:

## HDBSCAN:

## Including Required Code:

### Loading Embeddings Data:


In [2]:
symptoms_embeddings = pd.read_csv("../data/embeddings/word_embeddings_dataframe.csv")

In [3]:
dict_symptom = symptoms_embeddings["Symptom"].to_dict()

#### Transforming:

In [4]:
test_list_1 = symptoms_embeddings.loc[0, "Embedding1"].replace("\n", "").replace("[", "").replace("]", "").split(" ")
test_list_2 = symptoms_embeddings.loc[0, "Embedding2"].replace("\n", "").replace("[", "").replace("]", "").split(" ")

Removing all empty strings:

In [5]:
test_list_1 = [x for x in test_list_1 if x]
test_list_2 = [x for x in test_list_2 if x]
test_list_1[:5]

['-9.81967244e-03',
 '1.01662287e-02',
 '3.75229940e-02',
 '1.75703913e-02',
 '-1.11436069e-01']

### Creating useable dataframes:

#### Embedding 1 column:

First, transform content of rows from strings to lists:

In [6]:
symptoms_embeddings.loc[:, "Embedding1"].str.replace("[", "", regex=True).replace("]", "", regex=True).str.split(" ")[1][:10]

['',
 '5.98840415e-02',
 '',
 '1.64022837e-02',
 '-4.90665212e-02',
 '',
 '4.81191762e-02\n',
 '-9.69780684e-02',
 '-1.16978601e-01',
 '']

In [7]:
def listify_df_values(df_series: pd.Series):
    df_series = df_series.str.replace("\n", "", regex=True)
    df_series = df_series.str.replace("[", "", regex=True).replace("]", "", regex=True)
    df_series = df_series.str.split(" ")
    return df_series

In [8]:
embeddings1_series = listify_df_values(symptoms_embeddings.loc[:, "Embedding1"])

In [9]:
embeddings1_series = embeddings1_series.apply(lambda row: [val for val in row if val])

https://stackoverflow.com/questions/67442107/pandas-expand-explode-dataframe-horizontally

In [10]:
embeddings1_df = pd.DataFrame(embeddings1_series)
embeddings1_df.head(5)

Unnamed: 0,Embedding1
0,"[-9.81967244e-03, 1.01662287e-02, 3.75229940e-..."
1,"[5.98840415e-02, 1.64022837e-02, -4.90665212e-..."
2,"[6.30832557e-03, 6.94514960e-02, 9.17118881e-0..."
3,"[-1.41132241e-02, 7.76526034e-02, -8.35783686e..."
4,"[-7.86128864e-02, -2.58876905e-02, 3.46109122e..."


In [11]:
embeddings1_df = pd.concat(
    [embeddings1_df[c].apply(pd.Series).add_prefix(c + "_") for c in embeddings1_df], axis=1
)

embeddings1_df.head(5)

Unnamed: 0,Embedding1_0,Embedding1_1,Embedding1_2,Embedding1_3,Embedding1_4,Embedding1_5,Embedding1_6,Embedding1_7,Embedding1_8,Embedding1_9,...,Embedding1_374,Embedding1_375,Embedding1_376,Embedding1_377,Embedding1_378,Embedding1_379,Embedding1_380,Embedding1_381,Embedding1_382,Embedding1_383
0,-0.00981967244,0.0101662287,0.037522994,0.0175703913,-0.111436069,0.038332589,0.148906738,0.044446677,0.0577533916,-0.01215267,...,0.0611344092,0.0198782869,0.0133477971,0.0386779606,-0.0479677059,0.034220051,0.0426308662,0.0378118306,0.0695859119,-0.0420008637
1,0.0598840415,0.0164022837,-0.0490665212,0.0481191762,-0.0969780684,-0.116978601,0.107039817,0.0218950473,0.0459282361,-0.0605028607,...,0.0486743562,0.0104232021,0.0138152717,-0.00532790925,-0.0177161284,0.1043249,0.0965044126,0.071945101,0.0172711313,0.00624693604
2,0.00630832557,0.069451496,0.00917118881,-0.000425593607,0.0368529968,0.0288750455,0.0993606523,0.00199077209,0.0311414283,0.038332548,...,0.0225946605,-0.0391616262,0.0123729361,-0.0283677857,-0.085157536,0.0725132674,0.0653430074,0.0022675863,0.0607209243,-0.0246002264
3,-0.0141132241,0.0776526034,-0.00835783686,0.0237053819,0.0561783165,0.0336992592,0.119458653,-0.0201092865,0.0362723432,0.0482863858,...,0.0669673532,0.0113059739,-0.0116295973,-0.0242045093,-0.0578260906,0.0389332138,0.118804961,-0.0296259206,0.0369524844,-0.00953654386
4,-0.0786128864,-0.0258876905,0.0346109122,0.0558277592,-0.0387978852,-0.0556877032,0.144394651,0.0246080924,-0.0719921589,-0.0499793142,...,0.0283698123,-0.0349769071,-0.0214673597,0.0145020243,0.0576726533,0.0332759731,0.109838024,-0.0757560134,0.0223050658,-0.0470947437


Wrong data type:

In [12]:
embeddings1_df = embeddings1_df.apply(pd.to_numeric, errors='coerce')

### Code to Generate Distance Matrix:

In [13]:
import math
from scipy.spatial import distance_matrix

def generate_distance_matrix(df : pd.DataFrame,
                distance_metric : str = "euclidean") -> pd.DataFrame: # 2.5k x 2.5k
    if distance_metric == "manhattan":
        p = 1
    elif distance_metric == "euclidean":
        p = 2
    elif distance_metric == "chebychev":
        p = math.inf
    else:
        p = 2
    dis_matrix = distance_matrix(df.values, df.values, p)
    dis_df = pd.DataFrame(dis_matrix)
    return dis_df

### Code to Generate Dictionaries:

In [14]:
def generate_dict(df_dist: pd.DataFrame,
                threshold: float) -> dict:
    filt = (df_dist[:] > threshold)
    df_filt = df_dist.copy()
    df_filt[filt] = np.nan
    dict_dist = df_filt.to_dict('dict')
    for i, dic in dict_dist.items():
        to_pop = list()
        for key, value in dic.items():
            if np.isnan(value):
                to_pop.append(key)
        for target_key in to_pop:
            dic.pop(target_key)
        dict_dist[i] = dic
    return dict_dist

In [15]:
def generate_dict_match(dict_dist: dict) -> dict:   
    dict_match = dict()

    for key, value in dict_dist.items():
        for sub_key in value.keys():
            if dict_symptom[key] not in dict_match:
                dict_match[dict_symptom[key]] = [dict_symptom[sub_key]]
            else:
                dict_match[dict_symptom[key]] = [*dict_match.get(dict_symptom[key]), dict_symptom[sub_key]]

    return dict_match

In [16]:
activities_symptoms_df = pd.read_csv("../output/activities_symptoms_bool.csv")
activities_symptoms_df.drop(columns="Unnamed: 0", inplace=True)

In [17]:
filt_sym = (activities_symptoms_df["is_symptom"] == 1)
filt_sym_df = activities_symptoms_df[filt_sym]
filt_sym_list = filt_sym_df["symptomName"].values.tolist()

In [18]:
filt_not_act = (activities_symptoms_df["is_activity"] == 0)
filt_not_act_df = activities_symptoms_df[filt_not_act]
filt_not_act_list = filt_not_act_df["symptomName"].values.tolist()

In [19]:
def create_dict_sym(dict_dist):  
    dict_sym = dict()

    for sym, list_sym in dict_dist.items():
        if sym not in filt_sym_list:
            continue
        for sub_sym in list_sym:
            if sub_sym in filt_not_act_list:
                continue
            if sym not in dict_sym:
                dict_sym[sym] = [sub_sym]
            else:
                dict_sym[sym] = [*dict_sym.get(sym), sub_sym]
    
    return dict_sym

### Generating Distance Matrix and Dictionaries:

In [20]:
df_dist_1 = generate_distance_matrix(embeddings1_df)

In [21]:
dict_dist_086 = generate_dict(df_dist_1, 0.86)
dict_086 = generate_dict_match(dict_dist_086)
dict_086_sym = create_dict_sym(dict_086)

## HDBSCAN* Clustering:

This is more for DS research rather than the MVP functionality. The quality of the matching mostly seems to rely on the word embeddings, in any case.

In [22]:
import hdbscan

### Experimenting With Different Parameters and Attributes:

In [23]:
clusterer_distance = hdbscan.HDBSCAN(metric='precomputed')
clusterer_distance.fit(df_dist_1)
print(set(clusterer_distance.labels_))

{0, 1, -1}


In [24]:
clusterer_distance = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=5, metric='precomputed')
clusterer_distance.fit(df_dist_1)
print(set(clusterer_distance.labels_))

{0, 1, 2, 3, 4, 5, 6, 7, -1}


In [25]:
print(len(clusterer_distance.probabilities_))
print(set(clusterer_distance.probabilities_[::20]))

2404
{0.0, 1.0, 0.98305244711713, 0.9791146416423028, 0.9769353926578465, 0.9961216827029399, 0.9738910081588747, 0.9970175222727984, 0.9902114067769573, 0.9720453152121288, 0.972016389330309}


#### Labels and probabilities:

In [26]:
print(dict_symptom[0])
print(clusterer_distance.labels_[0])
print(clusterer_distance.probabilities_[0])

Abcess
-1
0.0


In [27]:
print(dict_symptom[1])
print(clusterer_distance.labels_[1])
print(clusterer_distance.probabilities_[1])

Abdomen
6
1.0


In [28]:
print(dict_symptom[1594])
print(clusterer_distance.labels_[1594])
print(clusterer_distance.probabilities_[1594])

Stomach
6
1.0


In [29]:
label_6 = {
    "indices": [],
    "probabilities": []
}
for i, label in enumerate(clusterer_distance.labels_):
    if label == 6:
        label_6["indices"].append(i)
        label_6["probabilities"].append(clusterer_distance.probabilities_[i])

#### Soft labels:

In [30]:
soft_label_6 = list()

for i, index in enumerate(label_6["indices"]):
    if label_6["probabilities"][i] < 1.0:
        soft_label_6.append(index)

In [31]:
len(soft_label_6)

145

In [32]:
print(dict_symptom[181])
print(clusterer_distance.labels_[181])
print(clusterer_distance.probabilities_[181])

Antimalaria
6
0.9739849522332443


#### Hard labels:

In [33]:
hard_label_6 = list()

for i, index in enumerate(label_6["indices"]):
    if label_6["probabilities"][i] == 1.0:
        hard_label_6.append(index)

In [34]:
len(hard_label_6)

813

In [35]:
len(label_6["indices"])

958

Possible to further refine/reduce number of matches by taking the probabilities value into account.

### Our Cluster Function:

The function must therefore run the HDBSCAN clustering algorithm and return a dictionary where the label groups are returned. min_sample and min_cluster_size should be parameters input into the function to quickly receive different results.

If we want, we can include parameter to make a decision whether to use hard/soft clustering filter, and at what threshold. Implementing this would help further refining the matches in case too many labels with too many items included are generated.

Return a dict of matches and also a list of outliers:

In [36]:
def cluster(df_dist: pd.DataFrame, min_sample_n: int = 5, 
                min_cluster_n: int = 5, filter: bool = False,
                min_probability: float = 1.0) -> dict:
    clusterer_distance = hdbscan.HDBSCAN(
        min_samples = min_sample_n, 
        min_cluster_size = min_cluster_n, 
        metric='precomputed')
    clusterer_distance.fit(df_dist)

    # create a dictionary:
    # a key for each label, value are indices

    labels = list(set(clusterer_distance.labels_))

    label_dict = dict()

    for label in labels:
        label_dict[label] = {}

    for i, assigned_label in enumerate(clusterer_distance.labels_):
        label_dict[assigned_label][i] = clusterer_distance.probabilities_[i]
    
    # create a dictionary:
    # keys are indices from 0 to end, values are all indices from 
    # the same label

    match_dict = dict()
    outliers = list()

    for i in range(len(clusterer_distance.labels_)):
        for label, label_group in label_dict.items():
            if label == -1 and i in label_group:
                outliers.append(i)
                continue
            if i in label_group:
                match_dict[i] = label_group

    return match_dict, outliers, labels

In [37]:
cluster_dict, outliers, labels = cluster(df_dist_1)

In [38]:
labels

[0, 1, -1]

Outliers:

In [39]:
print(len(outliers))
outliers[:5]

1076


[0, 2, 3, 4, 5]

A dictionary containing similar values:

In [40]:
print(str(cluster_dict)[:200] + "...")

{1: {1: 1.0, 6: 0.9796693383722548, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 1.0, 18: 1.0, 19: 1.0, 20: 1.0, 21: 1.0, 25: 0.9645519056822113, 27: 0.9605620807...


#### Testing out different values:

We have 422 symptoms in our symptom list.

In [41]:
len(filt_sym_list)

422

In [42]:
cluster_dict, outliers, labels = cluster(df_dist_1, min_sample_n = 2)

In [43]:
labels

[0, 1, -1]

In [44]:
print(len(outliers))
outliers[:5]

329


[0, 7, 26, 28, 29]

In [45]:
print(len(cluster_dict[2]))
print(str(cluster_dict)[:200] + "...")

2068
{1: {1: 1.0, 2: 1.0, 3: 1.0, 4: 0.9307565897715478, 5: 1.0, 6: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 1.0, 18: 1.0, 19: 1.0, 20: 1.0, 21: 1.0, 22: 0.98...


In [46]:
cluster_dict, outliers, labels = cluster(df_dist_1, min_sample_n = 1)
len(labels)

110

In [47]:
cluster_dict, outliers, labels = cluster(df_dist_1, min_sample_n = 3,
            min_cluster_n = 4)
len(labels)

59

This is the lowest we can go for min sample and min cluster:

In [48]:
cluster_dict, outliers, labels = cluster(df_dist_1, min_sample_n = 1,
            min_cluster_n = 2)
len(labels)

559

In [49]:
len(outliers)

835

In [50]:
print(str(cluster_dict)[:200] + "...")

{1: {1: 1.0, 1550: 0.9479793910211823, 1594: 1.0}, 2: {2: 1.0, 1328: 1.0}, 3: {3: 1.0, 2097: 1.0}, 5: {5: 1.0, 6: 1.0, 1329: 0.8907824978608478}, 6: {5: 1.0, 6: 1.0, 1329: 0.8907824978608478}, 8: {8: ...


In [51]:
cluster_sym_dict = generate_dict_match(cluster_dict)

In [52]:
print(str(cluster_sym_dict)[:200] + "...")

{'Abdomen': ['Abdomen', 'Stomachic', 'Stomach'], 'Abortifacient': ['Abortifacient', 'Preventitive(Abortifacient)'], 'Abortive': ['Abortive', 'Abortive?'], 'Abscess': ['Abscess', 'Abscess(Breast)', 'Pr...


In [53]:
final = create_dict_sym(cluster_sym_dict)

In [54]:
print(len(final))
print(str(final)[:200] + "...")

255
{'Acne': ['Acne', 'Pimple'], 'Alcoholism': ['Alcoholism', 'Drunkenness'], 'Allergy': ['Allergenic', 'Allergy'], 'Amblyopia': ['Amblyopia', 'Nyctalopia'], 'Amnesia': ['Amnesia', 'Dementia', 'Forgetfuln...


#### Comparison With Dictionary Generated With Distance Matrix and Distance Threshold:

The dictionaries generated with HDBSCAN* are much more restrictive:

In [55]:
print(len(dict_086_sym))
list(dict_086_sym.items())[:5]

314


[('Acne', ['Acne', 'Pimple']),
 ('Alcoholism', ['Alcoholism', 'Beer', 'Beverage', 'Drunkenness']),
 ('Allergy', ['Allergenic', 'Allergy']),
 ('Amblyopia', ['Amblyopia']),
 ('Amenorrhea', ['Amenorrhea'])]

In [56]:
print("Abdominal pain" in final.keys())
print(final["Eye pain"])
print("Common cold" in final.keys())

False
['Sore(Eye)', 'Ache(Eye)']
False


In [57]:
print(dict_086_sym["Abdominal pain"])
print(dict_086_sym["Eye pain"])
print(dict_086_sym["Common cold"])

['Abdomen', 'Ache(Stomach)', 'Cancer(Abdomen)']
['Eye', 'Eye drop', 'Sore(Eye)', 'Ache(Eye)']
['Cold']


The results are not as useful, so we will explore more options provided by the HDBSCAN* library in hopes of creating more appropriate clusters.

### Cluster Function with Leaf Clustering:
From the HDBSCAN* documentation:

"_HDBSCAN supports an extra parameter_ cluster_selection_method _to determine how it selects flat clusters from the cluster tree hierarchy. The default method is 'eom' for Excess of Mass, the algorithm described in How HDBSCAN Works. This is not always the most desireable approach to cluster selection. If you are more interested in having small homogeneous clusters then you may find Excess of Mass has a tendency to pick one or two large clusters and then a number of small extra clusters. In this situation you may be tempted to recluster just the data in the single large cluster. Instead, a better option is to select 'leaf' as a cluster selection method. This will select leaf nodes from the tree, producing many small homogeneous clusters. Note that you can still get variable density clusters via this method, and it is also still possible to get large clusters, but there will be a tendency to produce a more fine grained clustering than Excess of Mass can provide._"

In [58]:
def leaf_cluster(df_dist: pd.DataFrame, min_sample_n: int = 5, 
                min_cluster_n: int = 5, filter: bool = False,
                min_probability: float = 1.0) -> dict:
    clusterer_distance = hdbscan.HDBSCAN(
        min_samples = min_sample_n, 
        min_cluster_size = min_cluster_n, 
        cluster_selection_method = 'leaf',
        metric='precomputed')
    clusterer_distance.fit(df_dist)

    # create a dictionary:
    # a key for each label, value are indices

    labels = list(set(clusterer_distance.labels_))

    label_dict = dict()

    for label in labels:
        label_dict[label] = {}

    for i, assigned_label in enumerate(clusterer_distance.labels_):
        label_dict[assigned_label][i] = clusterer_distance.probabilities_[i]
    
    # create a dictionary:
    # keys are indices from 0 to end, values are all indices from 
    # the same label

    match_dict = dict()
    outliers = list()

    for i in range(len(clusterer_distance.labels_)):
        for label, label_group in label_dict.items():
            if label == -1 and i in label_group:
                outliers.append(i)
                continue
            if i in label_group:
                match_dict[i] = label_group

    return match_dict, outliers, labels

As a reminder, previous results:

* 559 labels
* 835 outliers
* 255 symptoms matched to activities in the final dictionary
* "Abdominal pain", "Common cold" not included amongst others

In [59]:
cluster_dict, outliers, labels = leaf_cluster(df_dist_1, min_sample_n = 1,
            min_cluster_n = 2)
print(len(labels), "labels")
print(len(outliers), "outliers")
cluster_sym_dict = generate_dict_match(cluster_dict)
final_cluster_dict = create_dict_sym(cluster_sym_dict)
print(len(final_cluster_dict), "symptoms matched to activities in the final dictionary ")
list(final_cluster_dict.items())[:5]

575 labels
863 outliers
249 symptoms matched to activities in the final dictionary 


[('Acne', ['Acne', 'Pimple']),
 ('Alcoholism', ['Alcoholism', 'Drunkenness']),
 ('Allergy', ['Allergenic', 'Allergy']),
 ('Amblyopia', ['Amblyopia', 'Nyctalopia']),
 ('Amnesia', ['Amnesia', 'Dementia', 'Forgetfulness', 'Memory'])]

In [60]:
print("Abdominal pain" in final_cluster_dict.keys())
print(final_cluster_dict["Eye pain"])
print("Common cold" in final_cluster_dict.keys())

False
['Sore(Eye)', 'Ache(Eye)']
False


Taking a closer look at the dictionary containing all label groups:

In [61]:
cluster_sym_dict = generate_dict_match(cluster_dict)
list(cluster_sym_dict.items())[:5]

[('Abdomen', ['Abdomen', 'Stomachic', 'Stomach']),
 ('Abortifacient', ['Abortifacient', 'Preventitive(Abortifacient)']),
 ('Abortive', ['Abortive', 'Abortive?']),
 ('Abscess', ['Abscess', 'Abscess(Breast)', 'Preventitive(Abscess)']),
 ('Abscess(Breast)', ['Abscess', 'Abscess(Breast)', 'Preventitive(Abscess)'])]

The results for leaf clustering are overall quite similar: while a few more labels were created, and less data points labelled as outliers, the dictionary matching only to symptoms actually ended up containing a few less entries. "Common cold" and "Abdominal pain" are still not included.

### Cluster Function with Soft Clustering:

It is important to note that, if we wish to use the soft clustering we should use the prediction_data=True option for HDBSCAN. This will ensure we generate the extra data required that will allow soft clustering to work. This __cannot__ generate prediction data for non-vectorspace inputs - access to the source data rather than mere distances is required!

Accordingly, we should use the original embeddings dataframe and likewise remove the metric = 'precomputed' parameter:

    clusterer_distance = hdbscan.HDBSCAN(
        min_samples = 1, 
        min_cluster_size = 2, 
        prediction_data= True)
    clusterer_distance.fit(df_dist_1)

There is no -1 outliers label, so one less than .labels_ returns

#### Soft Clustering Function:

For each list in soft clusters: 

* label is argmax 

* if the value at argmax index position is above threshold => add to label group

* below threshold: add to outlier dictionary, but label as key

In [62]:
def soft_cluster(df_vect: pd.DataFrame, min_sample_n: int = 5, 
                min_cluster_n: int = 5, threshold: float = 0.2025,
                filter: bool = False,
                min_probability: float = 1.0) -> dict:

    threshold = threshold / 100

    clusterer_distance = hdbscan.HDBSCAN(
        min_samples = min_sample_n, 
        min_cluster_size = min_cluster_n, 
        prediction_data= True)

    clusterer_distance.fit(df_vect)

    soft_clusters = hdbscan.all_points_membership_vectors(clusterer_distance)

    # create a dictionary:
    # a key for each label, value are indices

    labels = range(len(soft_clusters[0]))

    label_dict = dict()
    label_dict[-1] = {}

    for label in labels:
        label_dict[label] = {}

    for i, membership_vectors in enumerate(soft_clusters):
        assigned_label = np.argmax(membership_vectors)
        membership_vector = membership_vectors[assigned_label]
        if membership_vector >= threshold:
            label_dict[assigned_label][i] = membership_vector
        else:
            label_dict[-1][i] = {assigned_label: membership_vector}
    
    # create a dictionary:
    # keys are indices from 0 to end, values are all indices from 
    # the same label

    match_dict = dict()
    outliers = list()

    for i in range(len(soft_clusters)):
        for label, label_group in label_dict.items():
            if label == -1 and i in label_group:
                outliers.append(label_group)
                continue
            if i in label_group:
                match_dict[i] = label_group

    return match_dict, outliers, labels

##### For verification purposes, we are using the basic threshold and trying with distance matrix to check results:

In [63]:
cluster_dict, outliers, labels = soft_cluster(df_dist_1, min_sample_n = 1,
            min_cluster_n = 2)
print(len(labels))
print(len(outliers))

  in_cluster_probs = all_points_prob_in_some_cluster(


561
260


In [64]:
print(str(outliers)[:200] + "...")

[{0: {279: 0.0015467320559012343}, 6: {0: 0.0}, 7: {270: 0.0006325716607575951}, 25: {115: 0.0018285710047903538}, 26: {74: 0.0014173188180648247}, 39: {65: 0.0012295304792319943}, 55: {29: 0.00121941...


In [65]:
cluster_sym_dict = generate_dict_match(cluster_dict)
final_cluster_dict = create_dict_sym(cluster_sym_dict)
print(len(final_cluster_dict))
list(final_cluster_dict.items())[:5]

300


[('Acne', ['Acne', 'Pimple']),
 ('Alcoholism', ['Alcoholism', 'Drowning', 'Drunkenness']),
 ('Allergy', ['Allergy', 'Emphysema']),
 ('Amblyopia', ['Amblyopia', 'Nyctalopia', 'Nypnotic']),
 ('Amenorrhea', ['Amenorrhea', 'Enteritis', 'Enterosis', 'Enterostenosis'])]

In [66]:
print("Abdominal pain" in final_cluster_dict.keys())
print("Eye pain" in final_cluster_dict.keys())
print("Common cold" in final_cluster_dict.keys())

False
True
False


In [67]:
final_cluster_dict["Eye pain"]

['Glaucoma']

Results seem somewhat promising (reduction in outliers), but the matches seem a little scrambled. However, this test run used a distance matrix as input, which should have caused issues. 

##### Using Embeddings Dataframe for Soft Clustering:

##### Default Threshold: 

For now, using the default threshold again (which is a little lower than the max membership value discovered amongst previous outliers):

In [73]:
cluster_dict, outliers, labels = soft_cluster(embeddings1_df, min_sample_n = 1,
            min_cluster_n = 2)
print(len(labels), "labels")
print(len(outliers), "outliers")

  in_cluster_probs = all_points_prob_in_some_cluster(


558 labels
1865 outliers


In [69]:
print(str(outliers)[:200] + "...")

[{0: {23: 0.0015456918900324888}, 1: {506: 0.002015968949266312}, 4: {76: 0.0014027082077308775}, 5: {400: 0.0020198751979227762}, 7: {260: 0.0014376918096444334}, 8: {225: 0.00195581615208487}, 9: {5...


In [75]:
cluster_sym_dict = generate_dict_match(cluster_dict)
final_cluster_dict = create_dict_sym(cluster_sym_dict)
print(len(final_cluster_dict), "symptoms matched in the symptom-activity dictionary")
list(final_cluster_dict.items())[:5]

62 symptoms matched in the symptom-activity dictionary


[('Anxiety', ['Anxiety']),
 ('Arthritis', ['Arthritis', 'Arthritis?']),
 ('Asthma', ['Asthma', 'Bronchial-Asthma']),
 ('Ataxia', ['Ataxia', 'Ataxia(Locomotor)']),
 ('Bronchitis', ['Bronchitis', 'Bronchosis'])]

In [71]:
print("Abdominal pain" in final_cluster_dict.keys())
print("Eye pain" in final_cluster_dict.keys())
print("Common cold" in final_cluster_dict.keys())

False
False
False


##### Threshold 0.202:

In [76]:
cluster_dict, outliers, labels = soft_cluster(embeddings1_df, min_sample_n = 1,
            min_cluster_n = 2, threshold = 0.202)
print(len(labels), "labels")
print(len(outliers), "outliers")
cluster_sym_dict = generate_dict_match(cluster_dict)
final_cluster_dict = create_dict_sym(cluster_sym_dict)
print(len(final_cluster_dict), "symptoms matched in the symptom-activity dictionary")
print("Abdominal pain" in final_cluster_dict.keys())
print("Eye pain" in final_cluster_dict.keys())
print("Common cold" in final_cluster_dict.keys())

  in_cluster_probs = all_points_prob_in_some_cluster(


558 labels
1853 outliers
64 symptoms matched in the symptom-activity dictionary
False
False
False


In [77]:
list(final_cluster_dict.items())[:5]

[('Anxiety', ['Anxiety']),
 ('Arthritis', ['Arthritis', 'Arthritis?']),
 ('Asthma', ['Asthma', 'Bronchial-Asthma']),
 ('Ataxia', ['Ataxia', 'Ataxia(Locomotor)']),
 ('Bronchitis', ['Bronchitis', 'Bronchosis'])]

##### Threshold 0.1:

In [78]:
cluster_dict, outliers, labels = soft_cluster(embeddings1_df, min_sample_n = 1,
            min_cluster_n = 2, threshold = 0.1)
print(len(labels), "labels")
print(len(outliers), "outliers")
cluster_sym_dict = generate_dict_match(cluster_dict)
final_cluster_dict = create_dict_sym(cluster_sym_dict)
print(len(final_cluster_dict), "symptoms matched in the symptom-activity dictionary")
print("Abdominal pain" in final_cluster_dict.keys())
print("Eye pain" in final_cluster_dict.keys())
print("Common cold" in final_cluster_dict.keys())

  in_cluster_probs = all_points_prob_in_some_cluster(


558 labels
123 outliers
380 symptoms matched in the symptom-activity dictionary
True
True
True


In [79]:
list(final_cluster_dict.items())[:5]

[('Acne', ['Acne', 'Lacrimatory', 'Mastitis']),
 ('Alcoholism', ['Alcoholism', 'Drunkenness', 'Home-Remedy']),
 ('Allergy', ['Allergy', 'Fumigant']),
 ('Amblyopia',
  ['Amblyopia', 'Deafness', 'Laryngalgia', 'Laryngeal', 'Laryngitis']),
 ('Amenorrhea', ['Amenorrhea', 'Sponge'])]

In [80]:
final_cluster_dict["Common cold"]

['Dentifrice',
 'Dentition',
 'Denture',
 'Medicine (Vet)',
 'Respiratory',
 'Rope',
 'Tongue',
 'Toothstick',
 'Pockmark',
 'Mucus-Mover',
 'Scleroderma']

With this threshold, we have an acceptable amount of symptoms matched to activities as recorded in Dr. Duke's database, however they are much too broad and their relation sometimes seems far fetched.

##### Threshold 0.15:

In [81]:
cluster_dict, outliers, labels = soft_cluster(embeddings1_df, min_sample_n = 1,
            min_cluster_n = 2, threshold = 0.15)
print(len(labels), "labels")
print(len(outliers), "outliers")
cluster_sym_dict = generate_dict_match(cluster_dict)
final_cluster_dict = create_dict_sym(cluster_sym_dict)
print(len(final_cluster_dict), "symptoms matched in the symptom-activity dictionary")
print("Abdominal pain" in final_cluster_dict.keys())
print("Eye pain" in final_cluster_dict.keys())
print("Common cold" in final_cluster_dict.keys())

  in_cluster_probs = all_points_prob_in_some_cluster(


558 labels
472 outliers
323 symptoms matched in the symptom-activity dictionary
True
True
True


In [82]:
list(final_cluster_dict.items())[:5]

[('Acne', ['Acne', 'Lacrimatory', 'Mastitis']),
 ('Alcoholism', ['Alcoholism', 'Drunkenness', 'Home-Remedy']),
 ('Allergy', ['Allergy', 'Fumigant']),
 ('Amblyopia',
  ['Amblyopia', 'Deafness', 'Laryngalgia', 'Laryngeal', 'Laryngitis']),
 ('Amnesia', ['Amnesia', 'Forgetfulness'])]

In [85]:
final_cluster_dict["Common cold"]

['Dentifrice',
 'Dentition',
 'Denture',
 'Medicine (Vet)',
 'Respiratory',
 'Rope',
 'Tongue',
 'Toothstick',
 'Pockmark',
 'Scleroderma']

The soft clustering results continue to seem disappointing. While the length of the final dictionary is alright, and certain matches seem promising, aberrant matches such as "Abdominal pain" matching to "Ankle" continue to be present and cast a shadow of doubt over the results in total.

## Looking Ahead:

The final notebook will deal with visualising the data we have and hopefully yield some more insights into why the clusters sometimes seem so mismatched.