In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from eli5.sklearn import InvertableHashingVectorizer
from sklearn.decomposition import PCA

import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

%matplotlib inline



In [83]:
# Load the cleaned review data and draw a fixed-size working sample.
# random_state pins the sample so that re-running the notebook works on the
# same 20000 reviews (and therefore yields comparable clusters).
df_train = (
    pd.read_csv('../datasets/df_train_cleaned_shortened.csv')
    .sample(n=20000, random_state=42)
)

In [84]:
# Pull the columns used downstream out of the sample as plain Python lists.
drugName, condition, rating, reviews = (
    df_train[column].tolist()
    for column in ('drugName', 'condition', 'rating', 'review')
)

In [85]:
# English Snowball stemmer; tokenize_and_stem() uses this module-level instance.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [86]:
def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return the stem of each one.

    The text is split into sentences first and into words second, so that
    punctuation ends up as its own token. Tokens that contain no ASCII letter
    (numbers, bare punctuation) are dropped; the remaining tokens are stemmed
    with the notebook-level ``stemmer``.
    """
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))
    # keep only tokens with at least one letter, then stem them
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]


def tokenize_only(text):
    """Split ``text`` into lower-cased word tokens without stemming them.

    The text is split into sentences first and into words second, so that
    punctuation ends up as its own token. Tokens that contain no ASCII letter
    (numbers, bare punctuation) are dropped.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())
    # keep only tokens with at least one letter
    return [word for word in lowered if re.search('[a-zA-Z]', word)]

In [87]:
# Accumulate two vocabularies over all reviews: the stemmed tokens and the
# lower-cased, unstemmed tokens of the same text.
totalvocab_stemmed = []
totalvocab_tokenized = []
for review_text in reviews:
    totalvocab_stemmed += tokenize_and_stem(review_text)
    totalvocab_tokenized += tokenize_only(review_text)

In [88]:
# Lookup table indexed by stem; the 'words' column holds the unstemmed token.
vocab_frame = pd.DataFrame(data={'words': totalvocab_tokenized}, index=totalvocab_stemmed)
print('there are {} items in vocab_frame'.format(vocab_frame.shape[0]))

there are 1234079 items in vocab_frame


In [89]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, max_features=1000,
                                 min_df=4, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,2))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(reviews) #fit the vectorizer to synopses

print(tfidf_matrix.shape)



CPU times: user 23.9 s, sys: 167 ms, total: 24.1 s
Wall time: 24.3 s
(20000, 1000)


In [90]:
# Feature names of the tf-idf columns, in column order.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when it exists and fall back on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [91]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance (1 - cosine similarity) between all reviews.
# NOTE(review): tfidf_matrix printed as (20000, 1000), so this is presumably a
# dense 20000 x 20000 array (several GB as float64) - confirm it fits in memory.
dist = 1 - cosine_similarity(tfidf_matrix)

## K-Means

In [93]:
num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 8.13 s, sys: 200 ms, total: 8.33 s
Wall time: 2.58 s


In [94]:
import joblib

# Persist the fitted model to 'doc_cluster.pkl', then read it straight back.
# NOTE(review): both the dump and the load run on every execution, so an
# existing pickle is overwritten; comment out the dump to reuse a saved model.
# joblib.load unpickles the file - only load files you created yourself.

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [95]:
# Column data for the per-review results table; 'cluster' holds the K-Means label.
drugs = dict(
    drugName=drugName,
    condition=condition,
    cluster=clusters,
    rating=rating,
    reviews=reviews,
)

In [96]:
# One row per review. The cluster labels are also used as the index, which is
# what lets a later cell select a cluster's rows with frame.loc[k].
frame = pd.DataFrame(drugs, index = [clusters] , columns = ['drugName', 'condition', 'reviews','cluster', 'rating'])

In [97]:
# number of reviews assigned to each cluster
frame['cluster'].value_counts()

0    9514
3    3763
4    3520
2    2199
1    1004
Name: cluster, dtype: int64

In [98]:

grouped = frame['rating'].groupby(frame['cluster']) # group the review ratings by cluster label

grouped.mean() # mean review rating per cluster

cluster
0    7.230818
1    7.164343
2    6.831287
3    7.144034
4    6.322443
Name: rating, dtype: float64

In [104]:
from __future__ import print_function

# For each cluster, print its highest-weighted centroid terms followed by the
# condition of every review assigned to that cluster.
print("Top terms per cluster:")
print()
# per centroid: feature indices sorted by descending centroid weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        # a term is one stem or a two-stem bigram; map its first stem back to
        # the first unstemmed word recorded for that stem in vocab_frame
        first_stem = terms[ind].split(' ')[0]
        word = vocab_frame.loc[[first_stem], 'words'].iloc[0]
        # print the str itself: encoding it first made Python 3 print b'...' reprs
        print(' %s' % word, end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    # the loop variable must not be called `condition`: that would overwrite the
    # notebook-level `condition` list with a single string
    for cluster_condition in frame.loc[i]['condition'].values.tolist():
        print(' %s,' % cluster_condition, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: b'work', b'wa', b'day', b'effect', b'use', b'year',



Cluster 0 titles: Obesity, Ulcerative Colitis, Birth Control, Vaginal Yeast Infection, Smoking Cessation, Back Pain, Pain, Supraventricular Tachycardia, Nausea/Vomiting, Chemotherapy Induced, Crohn's Disease, Acute, Diabetes, Type 2, Allergic Rhinitis, Bipolar Disorde, Major Depressive Disorde, Cluster Headaches, Helicobacter Pylori Infection, High Blood Pressure, Diabetes, Type 2, Cold Sores, Vulvodynia, Vaginal Yeast Infection, Bowel Preparation, Acne, Perimenopausal Symptoms, Bladder Infection, Hemorrhoids, Constipation, Dysuria, Diabetes, Type 2, Rheumatoid Arthritis, Smoking Cessation, Onychomycosis, Toenail, Opiate Dependence, Migraine, Edema, Generalized Anxiety Disorde, ibromyalgia, Atopic Dermatitis, Insomnia, Hypogonadism, Male, Bipolar Disorde, Seizures, Allergic Rhinitis, Depression, Acne, ibromyalgia, Acne, Depression, Insomnia, Weight Loss, Acne, Constipation, Chronic, Diabetes, Type 1, ADHD, Diabetes, Type 2, Birth Control, Asthma, acute, Pain, Hyperuricemia Secondary to

 Diabetes, Type 2, Urinary Incontinence, Weight Loss, Cough, Bipolar Disorde, Bronchitis, Ulcerative Colitis, Sinusitis, Conjunctivitis, Bacterial, High Cholesterol, Birth Control, Bipolar Disorde, Weight Loss, Bronchiectasis, Vaginal Yeast Infection, Birth Control, Birth Control, Weight Loss, Constipation, Diabetes, Type 2, Epilepsy, Pulmonary Embolism, Acne, Vaginal Yeast Infection, ADHD, Nausea/Vomiting, Constipation, Cough, Weight Loss, Smoking Cessation, Hepatitis C, Obesity, ADHD, Major Depressive Disorde, Anal Fissure and Fistula, Urinary Tract Infection, GERD, Depression, Schizophrenia, Herpes Simplex, Chronic Pain, Nausea/Vomiting of Pregnancy, Allergic Reactions, Overactive Bladde, Seizures, Migraine Prevention, Pneumonia, Organ Transplant, Rejection Prophylaxis, Diabetes, Type 2, Bowel Preparation, High Blood Pressure, Headache, Diabetes, Type 2, Insomnia, Vaginal Yeast Infection, Allergic Rhinitis, Psoriasis, Diabetes, Type 2, ADHD, Hypercalcemia of Malignancy, Rosacea, Sin

 High Blood Pressure, Muscle Spasm, Allergic Reactions, Eye Redness/Itching, Hypertriglyceridemia, Expectoration, Abnormal Uterine Bleeding, Headache, Abnormal Uterine Bleeding, Giardiasis, Migraine Prevention, Diabetes, Type 2, Allergies, Otitis Media, Vaginal Yeast Infection, Chronic Pain, Urinary Incontinence, Birth Control, Alcohol Dependence, Weight Loss, Smoking Cessation, Anxiety, Hepatitis C, Nausea/Vomiting, Urticaria, Gastroparesis, GERD, Weight Loss, High Blood Pressure, Obesity, Women (oxybutynin), Bipolar Disorde, Acne, Ulcerative Colitis, Maintenance, Renal Cell Carcinoma, Acne, Ovulation Induction, Emergency Contraception, Weight Loss, Bacterial Vaginitis, Cough, Pulmonary Embolism, Asthma, acute, Diabetes, Type 2, Erectile Dysfunction, Benign Prostatic Hyperplasia, Hepatitis C, Psoriatic Arthritis, Smoking Cessation, ge (amlodipine / valsartan), Seizures, Anxiety, Schizoaffective Disorde, Lennox-Gastaut Syndrome, Epilepsy, Asthma, Maintenance, Acne, Anxiety, Influenza P

 Allergic Rhinitis, Diabetes, Type 2, Insomnia, High Blood Pressure, Birth Control, Diagnosis and Investigation, Migraine, Cold Sores, ADHD, Psoriatic Arthritis, Seizure Prevention, Allergic Rhinitis, Seasonal Allergic Conjunctivitis, Acne, Weight Loss, Migraine Prevention, Insomnia, Nausea/Vomiting, Chemotherapy Induced, Obesity, Hypersomnia, Weight Loss, Overactive Bladde, Vaginal Yeast Infection, Vaginal Yeast Infection, Weight Loss, Prevention of Thromboembolism in Atrial Fibrillation, Vaginal Yeast Infection, Acne, Migraine Prevention, Migraine Prevention, Birth Control, Vaginal Yeast Infection, Obesity, Pain, Nausea/Vomiting, Sjogren's Syndrome, Vaginal Yeast Infection, Bipolar Disorde, Emergency Contraception, Obesity, Pain, Erectile Dysfunction, High Blood Pressure, Diabetes, Type 2, Irritable Bowel Syndrome, Narcolepsy, Bowel Preparation, Allergic Rhinitis, Bipolar Disorde, Diabetes, Type 2, Amenorrhea, Diabetes, Type 2, Cough, Edema, Psoriatic Arthritis, Dry Eye Disease, Bron

Cluster 1 words: b'quot', b'wa', b'day', b'feel', b'year', b'effect',

Cluster 1 titles: Anxiety, Irritable Bowel Syndrome, Insomnia, Bowel Preparation, Hypothyroidism, After Thyroid Removal, ADHD, Weight Loss, Constipation, Depression, Neutropenia Associated with Chemotherapy, Hepatitis C, Insomnia, Multiple Sclerosis, Asthma, acute, Anxiety, Basal Cell Carcinoma, Irritable Bowel Syndrome, GERD, Sinusitis, Migraine Prevention, Emergency Contraception, Constipation, ADHD, Bipolar Disorde, Neuralgia, Anxiety, Plaque Psoriasis, Epilepsy, Chronic Myelogenous Leukemia, Panic Disorde, Panic Disorde, Anxiety, Birth Control, Major Depressive Disorde, Bladder Infection, Migraine Prevention, Anxiety and Stress, ADHD, Bipolar Disorde, Chlamydia Infection, Acute Coronary Syndrome, Diabetes, Type 2, Smoking Cessation, Overactive Bladde, Obesity, Anorexia, Migraine Prevention, Vaginal Yeast Infection, Acne, Panic Disorde, Birth Control, Bipolar Disorde, Alcohol Withdrawal, Nasal Congestion, Anxiety

Cluster 2 words: b'painful', b'wa', b'day', b'work', b'several', b'help',



Cluster 2 titles: Pain, Chronic Pain, Pain, Depression, ibromyalgia, Pain, Neuropathic Pain, Back Pain, Birth Control, Abnormal Uterine Bleeding, Headache, Pain, Cough, Acute Coronary Syndrome, Pain, Varicella-Zoste, Pain, Chronic Pain, Depression, Irritable Bowel Syndrome, ibromyalgia, Chronic Pain, Pain, ibromyalgia, Osteoarthritis, Endometriosis, Obesity, Diabetic Peripheral Neuropathy, Otitis Externa, Peripheral Neuropathy, Vaginal Yeast Infection, Polycystic Ovary Syndrome, ibromyalgia, Back Pain, Rheumatoid Arthritis, Chronic Idiopathic Constipation, Pain, Birth Control, Pain, Constipation, Pain, Migraine, ibromyalgia, Vitamin D Deficiency, ibromyalgia, Pain, Muscle Spasm, Pain, Anxiety, Muscle Spasm, Osteoarthritis, Birth Control, Diabetes, Type 2, Birth Control, Pain, Birth Control, Cervical Dystonia, Dental Abscess, Pain, Pain, Pain, Pain, Birth Control, Bowel Preparation, Gastroparesis, Pain, Interstitial Cystitis, Chronic Pain, Not Listed / Othe, ibromyalgia, Pain, Generaliz

 Allergic Rhinitis, Conjunctivitis, Bacterial, Irritable Bowel Syndrome, Constipation, Chronic Pain, moterol), Pain, Bacterial Skin Infection, GERD, Varicella-Zoste, Restless Legs Syndrome, Pain, Vaginal Yeast Infection, Smoking Cessation, Birth Control, Hypogonadism, Male, Tendonitis, Gout, Acute, Pain, Pain, Rheumatoid Arthritis, High Cholesterol, Chronic Pain, Osteoporosis, High Cholesterol, Pain, Rheumatoid Arthritis, Bacterial Infection, Birth Control, Osteoarthritis, Chronic Pain, High Blood Pressure, Gout, Acute, Irritable Bowel Syndrome, Pain, ADHD, Vaginal Yeast Infection, Pain, Chronic Pain, Pain, Kidney Infections, Pain, Muscle Spasm, Pain, Ectopic Pregnancy, Muscle Spasm, Pain, Dental Abscess, Migraine Prevention, Opiate Withdrawal, Pain, Pain, Rheumatoid Arthritis, Birth Control, Birth Control, Pain,

Cluster 3 words: b'wa', b'feel', b'anxiety', b'depression', b'sleeping', b'help',



Cluster 3 titles: Major Depressive Disorde, Depression, Constipation, Chronic, Hyperhidrosis, Anxiety and Stress, Birth Control, Constipation, Chronic, mance Anxiety, Depression, Anxiety, Hot Flashes, Nausea/Vomiting of Pregnancy, Anxiety, Birth Control, Anxiety and Stress, Depression, ADHD, Vaginal Yeast Infection, Anxiety, Depression, Bipolar Disorde, Bipolar Disorde, Depression, Birth Control, Anxiety, Restless Legs Syndrome, Depression, Pain, Post Traumatic Stress Disorde, Depression, Depression, Obsessive Compulsive Disorde, Insomnia, Schizophrenia, Anxiety, Migraine Prevention, ADHD, Anxiety, Anxiety, Multiple Sclerosis, Depression, Social Anxiety Disorde, Birth Control, Depression, Anxiety, ADHD, Social Anxiety Disorde, Cance, Panic Disorde, Depression, Anxiety and Stress, Insomnia, Bacterial Vaginitis, Bipolar Disorde, Anxiety, Opiate Withdrawal, Birth Control, Hot Flashes, Post Traumatic Stress Disorde, Anxiety, Sedation, Obsessive Compulsive Disorde, Migraine Prevention, Chro

 Anxiety and Stress, Urticaria, Depression, Anxiety, Anxiety, Insomnia, Obesity, Anxiety, Depression, Birth Control, Hypogonadism, Male, Depression, Bipolar Disorde, Psoriasis, Bipolar Disorde, Depression, Depression, Major Depressive Disorde, Anxiety, Anxiety and Stress, Anxiety and Stress, Tonsillitis/Pharyngitis, Bipolar Disorde, ADHD, Epilepsy, Erectile Dysfunction, Allergic Rhinitis, Depression, ADHD, Bipolar Disorde, Bipolar Disorde, Nausea/Vomiting, Insomnia, Anxiety and Stress, Allergic Rhinitis, Obesity, Insomnia, Anxiety and Stress, Insomnia, Depression, Panic Disorde, Panic Disorde, HIV Infection, Anxiety, Anxiety, Depression, Not Listed / Othe, Obsessive Compulsive Disorde, Panic Disorde, Major Depressive Disorde, Insomnia, Anxiety, Depression, Anxiety, Birth Control, Bipolar Disorde, Anxiety, Depression, Insomnia, Cough and Nasal Congestion, Insomnia, Allergic Rhinitis, Depression, Bipolar Disorde, Depression, Anxiety, Allergic Rhinitis, Depression, Anxiety, Migraine Preve

Cluster 4 titles: Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Endometriosis, Birth Control, Birth Control, Overactive Bladde, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Acne, Acne, Birth Control, Birth Control, Birth Control, Weight Loss, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Menstrual Disorders, Emergency Contraception, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Acne, Birth Control, Birth Control, Birth Control, Herpes Simplex, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Emergency Contraception, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Emergency Contracept

 Birth Control, Birth Control, Birth Control, Acne, Birth Control, Birth Control, Birth Control, Abnormal Uterine Bleeding, Birth Control, Birth Control, Birth Control, Depression, Abnormal Uterine Bleeding, Birth Control, Birth Control, Birth Control, Endometriosis, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Emergency Contraception, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Emergency Contraception, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth Control, Birth C

In [None]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

# Project the pairwise distances onto two components for plotting in a plane.
# "precomputed" because we provide a distance matrix;
# `random_state` is fixed so the layout is reproducible.
# (A stray bare `MDS()` call that built and discarded an estimator was removed.)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

# first and second component of every review
xs, ys = pos[:, 0], pos[:, 1]
print()
print()

In [None]:

#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

#set up cluster names using a dict
# The earlier labels ('Family, home, war', 'Police, killed, murders', ...) do not
# describe drug-review clusters, so the legend was mislabelled. Use neutral
# labels keyed by cluster id; replace them with hand-written summaries once the
# clusters have been inspected.
cluster_names = {label: 'Cluster %d' % label for label in cluster_colors}

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
# `titles` was never defined in this notebook (NameError). Use each review's
# condition, the same field the cluster summary prints as "titles".
titles = frame['condition'].tolist()
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')

# Axes-wide settings only need to be applied once, not once per group.
# tick_params takes booleans; the 'off' strings were deprecated in matplotlib 2.2
# and are truthy, so on current releases they leave the ticks switched on.
ax.set_aspect('auto')
ax.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)
ax.tick_params(
    axis='y',           # changes apply to the y-axis
    which='both',       # both major and minor ticks are affected
    left=False,         # ticks along the left edge are off
    right=False,        # ticks along the right edge are off
    labelleft=False)
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the review's condition
# DataFrame.ix was removed in pandas 1.0; iterate over the columns directly
for x, y, title in zip(df['x'], df['y'], df['title']):
    ax.text(x, y, title, size=8)  

    
    
plt.show() #show the plot

In [None]:
plt.close()  # close the current matplotlib figure

In [None]:

#define custom toolbar location
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure"""

    # JavaScript for the plugin registered as "toptoolbar". Its draw() first
    # draws the figure toolbar, then sets the toolbar's "x"/"y" attributes to
    # 150/400, and finally replaces the toolbar's draw function with a no-op.
    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        # "type" matches the name passed to mpld3.register_plugin in JAVASCRIPT
        self.dict_ = {"type": "toptoolbar"}

In [None]:
#define custom toolbar location
# NOTE(review): this cell repeats the TopToolbar definition from the previous
# cell; the later definition simply rebinds the name, so one of the two cells
# can be deleted.
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure"""

    # JavaScript for the plugin registered as "toptoolbar". Its draw() first
    # draws the figure toolbar, then sets the toolbar's "x"/"y" attributes to
    # 150/400, and finally replaces the toolbar's draw function with a no-op.
    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        # "type" matches the name passed to mpld3.register_plugin in JAVASCRIPT
        self.dict_ = {"type": "toptoolbar"}
Here is the actual creation of the interactive scatterplot. I won't go into much more detail about it since it's pretty much a straightforward copy of one of the mpld3 examples, though I use a pandas groupby to group by cluster, then iterate through the groups as I layer the scatterplot. Note that relative to doing this with raw D3, mpld3 is much simpler to integrate into your Python workflow. If you click around the rest of my website you'll see that I do love D3, but for basic interactives I will probably use mpld3 a lot going forward.

Note that mpld3 lets you define some custom CSS, which I use to style the font, the axes, and the left margin on the figure.

In [78]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
# `titles` was never defined in this notebook (NameError). Use each review's
# condition, the same field the cluster summary prints as "titles".
titles = frame['condition'].tolist()
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 

#group by cluster
groups = df.groupby('label')

#define custom css to format the font and to remove the axis labeling
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }

svg.mpld3-figure {
margin-left: -200px;}
"""

# Plot 
fig, ax = plt.subplots(figsize=(14,6)) #set plot size
ax.margins(0.03) # Optional, just adds 3% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, 
                     label=cluster_names[name], mec='none', 
                     color=cluster_colors[name])
    labels = group['title'].tolist()
    
    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    #connect this group's tooltip to fig
    mpld3.plugins.connect(fig, tooltip)

# The toolbar plugin and the axis settings apply to the whole figure, so they
# are set once here instead of once per cluster inside the loop.
mpld3.plugins.connect(fig, TopToolbar())
ax.set_aspect('auto')

#set tick marks as blank
ax.axes.get_xaxis().set_ticks([])
ax.axes.get_yaxis().set_ticks([])

#set axis as blank
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)

    
ax.legend(numpoints=1) #show legend with only one dot

mpld3.display() #show the plot

#uncomment the below to export to html
#html = mpld3.fig_to_html(fig)
#print(html)

## Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram

# scipy's ward() accepts either a condensed (1-D) distance vector or a 2-D array
# of observation vectors. The square `dist` matrix is 2-D, so passing it made
# ward() treat every row of distances as an observation rather than use the
# distances themselves. Ward linkage is defined for Euclidean distances, so
# cluster the tf-idf vectors directly.
linkage_matrix = ward(tfidf_matrix.toarray())

# `titles` was never defined in this notebook (NameError). Label each leaf with
# the review's condition, the same field the cluster summary prints as "titles".
titles = frame['condition'].tolist()

fig, ax = plt.subplots(figsize=(15, 20)) # set size
# draw on the axes created above; `ax` is no longer rebound to the dict that
# dendrogram() returns
dendrogram(linkage_matrix, orientation="right", labels=titles, ax=ax)

# tick_params takes booleans; the 'off' strings were deprecated in matplotlib 2.2
plt.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)

plt.tight_layout() #show plot with tight layout

# save the figure as ward_clusters.png (comment out to skip writing the file)
plt.savefig('ward_clusters.png', dpi=200)

In [None]:
plt.close()  # close the current matplotlib figure