In [8]:
from IPython.core.display import HTML
css_file = 'style.css'
HTML(open(css_file, 'r').read())

In [3]:
from pandas import read_excel, merge

In [4]:
from numpy import arange

In [5]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [31]:
# Plotly requires pip install plotly
from plotly.offline import iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# K-means clustering

## The data

In this hypothetical example we have a set of 99 patients using a smart watch, healthcare data app.  While the app continuously collects data, it does require the user to actively sync the data, so as to comply with healthcare regulatiuons.<br/>
To remind them to do so, a campaign consisting of e-mails, short-message services, WhatsApp messages, pamphlets, telephone and long letters are send out during the year (six types of campaigns).  These campaigns are numbered, 1 through 32, and a campaign consists only of a single modality.<br/>
A record is kept of which campaign the patient responded to.  The goal is to cluster the patients, so as to see which campaigns they respond to and which not, so as to target future campaigns.  Thus can be achieved through k-means clustering.  This machine learning technique creates centroids for each cluster based on geometrical distance between an individual data point and the calculated centroid.<br>
The example works just as well for a hypothetical business with a marketing campaign and looking at the respons with respect to sales.

In [9]:
# Importing an Excel spreadsheet with two sheets as two DataFrames
df_campaign = read_excel("PatientResponse.xlsx", sheetname = 0)


The `sheetname` keyword is deprecated, use `sheet_name` instead



In [10]:
df_response = read_excel("PatientResponse.xlsx", sheetname = 1)
# Adding a column of value 1 to act as a count for that instance
df_response["n"] = 1


The `sheetname` keyword is deprecated, use `sheet_name` instead



In [11]:
df_campaign.tail()

Unnamed: 0,CampaignID,Type,Month
27,28,Long letter,November
28,29,SMS,November
29,30,email,December
30,31,WhatsApp,December
31,32,Long letter,December


In [12]:
df_response.tail()

Unnamed: 0,Patient,CampaignID,n
319,99,31,1
320,16,32,1
321,29,32,1
322,46,32,1
323,99,32,1


In [13]:
# Merge on the CampaignID columns
df = merge(df_campaign, df_response, on = "CampaignID")

In [14]:
df.tail()

Unnamed: 0,CampaignID,Type,Month,Patient,n
319,31,WhatsApp,December,99,1
320,32,Long letter,December,16,1
321,32,Long letter,December,29,1
322,32,Long letter,December,46,1
323,32,Long letter,December,99,1


In [15]:
# Create a pivot table to count each of the 32 campaigns
table = df.pivot_table(index = ["Patient"], columns = ["CampaignID"], values = "n")

In [16]:
table.tail()

CampaignID,1,2,3,4,5,6,7,8,9,10,...,23,24,25,26,27,28,29,30,31,32
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
95,,,,,,,,,,,...,,,1.0,,,,,1.0,,
96,,,,,,,,1.0,,,...,,,,,,,,1.0,1.0,
97,1.0,,,,,,,,,1.0,...,,,,,,,,,1.0,
98,,,,1.0,,1.0,,,,,...,,,,,1.0,,,,,
99,,,,,,1.0,,,1.0,,...,,,,,,,,,1.0,1.0


In [17]:
# Fill NA values with 0 and reset the index to CampaignID
table = table.fillna(0).reset_index()

In [18]:
table.tail()

CampaignID,Patient,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
91,95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
92,96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
93,97,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
94,98,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
95,99,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [19]:
# Extracting the columns (32 campaigns)
cols = table.columns[1:]

In [20]:
cols

Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],
      dtype='object', name='CampaignID')

## K-mean clustering

In [21]:
cluster = KMeans(n_clusters = 5) # At least 7-times times cluster = patients

In [22]:
# Predict the cluster from first patient down all the rows
table["cluster"] = cluster.fit_predict(table[table.columns[2:]])

In [23]:
table.tail()

CampaignID,Patient,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,cluster
91,95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4
92,96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3
93,97,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
94,98,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4
95,99,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4


In [24]:
# Principal component separation to create a 2-dimensional picture
pca = PCA(n_components = 2)
table['x'] = pca.fit_transform(table[cols])[:,0]
table['y'] = pca.fit_transform(table[cols])[:,1]
table = table.reset_index()

In [25]:
table.tail()

CampaignID,index,Patient,1,2,3,4,5,6,7,8,...,26,27,28,29,30,31,32,cluster,x,y
91,91,95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4,-0.121409,-0.437478
92,92,96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3,0.666059,-0.843051
93,93,97,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,-0.474608,-0.368972
94,94,98,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4,-0.485692,-0.393428
95,95,99,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4,-0.83159,-1.119408


In [26]:
patient_clusters = table[["Patient", "cluster", "x", "y"]]

In [27]:
patient_clusters.tail()

CampaignID,Patient,cluster,x,y
91,95,4,-0.121409,-0.437478
92,96,3,0.666059,-0.843051
93,97,1,-0.474608,-0.368972
94,98,4,-0.485692,-0.393428
95,99,4,-0.83159,-1.119408


In [28]:
final = merge(df_response, patient_clusters)
final = merge(df_campaign, final);

In [29]:
final.tail()

Unnamed: 0,CampaignID,Type,Month,Patient,n,cluster,x,y
319,31,WhatsApp,December,64,1,1,-0.105954,-0.267206
320,32,Long letter,December,16,1,0,-0.433448,-0.020029
321,32,Long letter,December,46,1,0,-0.436308,-0.522012
322,32,Long letter,December,99,1,4,-0.83159,-1.119408
323,32,Long letter,December,29,1,0,-0.449028,0.029626


In [32]:
trace0 = go.Scatter(x = patient_clusters[patient_clusters.cluster == 0]["x"],
                    y = patient_clusters[patient_clusters.cluster == 0]["y"],
                    name = "Cluster 1",
                    mode = "markers",
                    marker = dict(size = 10,
                                 color = "rgba(15, 152, 152, 0.5)",
                                 line = dict(width = 1, color = "rgb(0,0,0)")))
trace1 = go.Scatter(x = patient_clusters[patient_clusters.cluster == 1]["x"],
                    y = patient_clusters[patient_clusters.cluster == 1]["y"],
                    name = "Cluster 2",
                    mode = "markers",
                    marker = dict(size = 10,
                                 color = "rgba(180, 18, 180, 0.5)",
                                 line = dict(width = 1, color = "rgb(0,0,0)")))
trace2 = go.Scatter(x = patient_clusters[patient_clusters.cluster == 2]["x"],
                    y = patient_clusters[patient_clusters.cluster == 2]["y"],
                    name = "Cluster 3",
                    mode = "markers",
                    marker = dict(size = 10,
                                 color = "rgba(132, 132, 132, 0.8)",
                                 line = dict(width = 1, color = "rgb(0,0,0)")))
trace3 = go.Scatter(x = patient_clusters[patient_clusters.cluster == 3]["x"],
                    y = patient_clusters[patient_clusters.cluster == 3]["y"],
                    name = "Cluster 4",
                    mode = "markers",
                    marker = dict(size = 10,
                                 color = "rgba(122, 122, 12, 0.8)",
                                 line = dict(width = 1, color = "rgb(0,0,0)")))
trace4 = go.Scatter(x = patient_clusters[patient_clusters.cluster == 4]["x"],
                    y = patient_clusters[patient_clusters.cluster == 4]["y"],
                    name = "Cluster 5",
                    mode = "markers",
                    marker = dict(size = 10,
                                 color = "rgba(230, 20, 30, 0.5)",
                                 line = dict(width = 1, color = "rgb(0,0,0)")))

data = [trace0, trace1, trace2, trace3, trace4]
fig=go.Figure(data)
iplot(fig)

In [33]:
# e-mails, short-message services, WhatsApp messages, pamphlets, telephone and long letters
final["0"] = final.cluster == 0
final.groupby("0").Type.value_counts()

0      Type       
False  WhatsApp       82
       email          69
       SMS            56
       Telephone      41
       Long letter    16
       Pamphlet       12
True   WhatsApp       19
       Long letter    16
       email           7
       Pamphlet        3
       Telephone       2
       SMS             1
Name: Type, dtype: int64

In [34]:
# Number of patients in this cluster
final[final.cluster == 0]["Patient"].count()

48

In [35]:
# List of patients
final[final.cluster == 0]["Patient"]

7      82
8      89
27     82
28     89
31     13
32     16
34     81
38     82
39     81
42     82
49     46
94     82
95     89
96     89
112    82
113    89
116    13
119    29
122    66
135    89
137    81
144    81
149    89
150    16
152    66
180    82
181    16
182    81
183    46
184    29
185    66
200    81
228    82
229    89
230    29
235    82
236    89
241    81
242    81
261    13
275    66
284    89
307    13
309    46
318    66
320    16
321    46
323    29
Name: Patient, dtype: int64