# Introduction & set-up
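This notebook loads a Stata survey dataset, drops identifier-like and randomly generated columns, imputes missing values, standardizes the features, and clusters the observations with HDBSCAN.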

## Package imports

In [1]:
import numpy as np
import pandas as pd
from fancyimpute import MICE, SoftImpute, KNN  #if speed desired, KNN or SoftImpute
from sklearn.preprocessing import StandardScaler
import hdbscan

# (Optional) To get a progress bar on processing
from tqdm import tqdm

ImportError: No module named 'hdbscan'

## Data import and preparation

In [5]:
# Read the Stata dataset; conversion of categoricals and dates is switched off in the reader
DATA_PATH = r"X:\Box\a_dataverse_JJ\IPA Studies\88 - Robinson - Savings Constraints, Kenya\Robinson - Savings Constraints\dataset_savingsAEJ.dta"
data = pd.read_stata(
    DATA_PATH,
    convert_categoricals=False,
    convert_dates=False,
)

In [6]:
# (rows, columns) of the freshly loaded dataset
data.shape

(395, 65)

In [8]:
# Flag columns that should be excluded before modelling:
#   cols_other - columns whose name contains 'other' (consider making these categoricals instead)
#   cols_rand  - columns whose name contains 'rand' (but not 'brand'), i.e. randomly generated values
#   cols_uid   - identifier-like columns in which every row holds a different value
cols_other = [c for c in data.columns if 'other' in c]
cols_rand = [c for c in data.columns if 'rand' in c and 'brand' not in c]

# Series.is_unique counts repeated missing values as duplicates. The previous
# len(set(col)) == len(col) test treated every NaN as a distinct value
# (NaN != NaN), so a column that was mostly missing was flagged as a unique ID.
cols_uid = [c for c in data.columns if data[c].is_unique]

droppable_cols = set(cols_other + cols_rand + cols_uid)

In [9]:
# Remove the flagged columns. errors='ignore' skips labels that are no longer
# present, so re-running this cell is harmless instead of raising a KeyError
# the way a repeated `del data[dc]` would.
data = data.drop(labels=list(droppable_cols), axis=1, errors='ignore')

In [10]:
# (rows, columns) after removing the flagged columns
data.shape

(395, 64)

In [11]:
# Turn datetime columns into integers so they are in a form that works for training models.
# Columns are picked by dtype rather than by inspecting the first value, so a column
# whose first entry is missing (NaT) is still converted and no row labelled 0 is needed.
for c in data.select_dtypes(include=['datetime', 'datetimetz']).columns:
    as_int = data[c].astype(np.int64)
    # Keep missing dates as NaN rather than letting NaT become a sentinel integer,
    # so the imputation step can treat them like any other missing value.
    data[c] = as_int.where(data[c].notnull())

In [12]:
# Convert string variables to numeric indicator (dummy) columns.
# `data` is rebound to the encoded frame, so later cells see the encoded version.
data = pd.get_dummies(data)
# (rows, columns) after encoding
data.shape

(395, 64)

In [14]:
# Impute missing data: MICE is best choice for statistical accuracy; if speed desired, KNN or SoftImpute
## DO NOT use sklearn's imputation function (introduces bias)

if data.isnull().values.any():  # is any value in the frame missing?
    # SoftImpute is handed the raw value matrix (column labels are not passed through)
    data_imputed = SoftImpute().complete(data.values)
else:
    # Use the raw value matrix here as well, so data_imputed is built from the
    # same kind of input whichever branch runs (previously this branch kept the
    # DataFrame while the other branch worked on data.values).
    data_imputed = data.values
    print('There appears to be no missing values.')

[SoftImpute] Max Singular Value of X_init = 289184.522124
[SoftImpute] Iter 1: observed MAE=41.524098 rank=15
[SoftImpute] Iter 2: observed MAE=41.918620 rank=15
[SoftImpute] Iter 3: observed MAE=42.264384 rank=15
[SoftImpute] Iter 4: observed MAE=42.525934 rank=15
[SoftImpute] Iter 5: observed MAE=42.718912 rank=15
[SoftImpute] Iter 6: observed MAE=42.857858 rank=15
[SoftImpute] Iter 7: observed MAE=42.956129 rank=15
[SoftImpute] Iter 8: observed MAE=43.025989 rank=15
[SoftImpute] Iter 9: observed MAE=43.078216 rank=15
[SoftImpute] Iter 10: observed MAE=43.117332 rank=15
[SoftImpute] Iter 11: observed MAE=43.146743 rank=15
[SoftImpute] Iter 12: observed MAE=43.168116 rank=15
[SoftImpute] Iter 13: observed MAE=43.183626 rank=15
[SoftImpute] Iter 14: observed MAE=43.195236 rank=15
[SoftImpute] Iter 15: observed MAE=43.204137 rank=15
[SoftImpute] Iter 16: observed MAE=43.210296 rank=15
[SoftImpute] Iter 17: observed MAE=43.214496 rank=15
[SoftImpute] Iter 18: observed MAE=43.218202 rank=

In [15]:
# Standardize the imputed values with StandardScaler and wrap the result in a DataFrame
feature_scaler = StandardScaler()
scaled_values = feature_scaler.fit_transform(data_imputed)
data_normalized = pd.DataFrame(scaled_values)

# Algorithm implementation on new data

In [16]:
# Preview only the first rows instead of displaying the whole frame
data_normalized.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,-1.326375,1.640603,-0.655298,-0.844841,-0.990017,-0.463168,-1.186469,-2.680549,-0.708450,-0.556439,...,0.012989,0.170923,-0.088526,-0.488078,-0.150081,-1.573524,-0.439645,-0.636206,-0.526153,-0.787000
1,-1.324603,1.640603,-0.655298,-0.844841,-0.990017,-0.442763,0.761296,0.606963,-0.708450,-0.556439,...,0.012488,-0.642221,0.969456,0.141784,-0.143522,0.190869,0.150895,0.790986,0.073397,0.824675
2,-1.323717,1.640603,-0.655298,-0.844841,1.012542,2.868536,-1.318651,-2.677802,-0.708450,-0.556439,...,0.012993,0.170644,-0.088584,-0.487801,-0.150136,-1.572213,-0.439406,-0.635662,-0.525832,-0.786387
3,-1.322831,1.640603,-0.655298,-0.844841,-0.990017,-1.565971,0.761296,0.606963,-0.708450,-0.556439,...,0.012488,-0.257234,-0.081599,0.223512,-0.123931,-0.799182,1.958203,-0.152179,4.017436,-0.752117
4,-1.321945,1.640603,-0.655298,-0.844841,-0.990017,-0.528295,0.761296,0.338897,-0.708450,-0.556439,...,-0.184431,-0.273238,-0.117335,0.187267,0.148664,0.918507,-0.225074,-0.115226,-0.407792,-0.446224
5,-1.321059,1.640603,-0.655298,-0.844841,-0.990017,-0.875785,0.761296,0.566753,-0.708450,-0.556439,...,0.555934,-0.004265,0.019924,-0.059305,-0.143522,0.982274,-0.036572,0.178873,-0.151798,0.229900
6,-1.320173,1.640603,-0.655298,-0.844841,-0.990017,-0.380371,0.761296,0.524716,1.411533,1.801969,...,0.048879,0.439621,-0.081599,-0.521514,-0.143522,-0.033424,-0.452379,-0.663107,-0.464706,-0.596544
7,-1.318401,1.640603,-0.655298,-0.844841,-0.990017,-0.389533,0.761296,0.606963,1.411533,1.801969,...,-0.110981,0.174434,0.249660,-0.213820,0.573452,1.188804,-0.453005,-0.664616,-0.405745,-0.440818
8,-1.317515,1.640603,-0.655298,-0.844841,1.012542,-0.543591,0.761296,0.566301,1.411533,1.801969,...,-0.175620,2.089260,-0.081599,0.178664,-0.101541,0.559586,-0.450641,-0.658919,-0.462419,-0.590503
9,-1.315743,1.640603,-0.655298,-0.844841,1.012542,-0.543591,0.761296,0.281670,1.411533,1.801969,...,-0.023870,0.661805,-0.081599,0.486520,0.084521,0.677845,-0.446048,-0.647849,-0.434502,-0.516770


In [74]:
# Fit HDBSCAN on the standardized data and keep the label assigned to each row.
# (Passing allow_single_cluster=True is an optional variation of this call.)
MIN_CLUSTER_SIZE = 50
clusterer = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE)
clusterer.fit(data_normalized)
cluster_labels = clusterer.labels_

In [75]:
# Number of distinct label values in cluster_labels
len(np.unique(cluster_labels))

1

In [76]:
# Print each label followed by the number of rows carrying it.
# np.unique tallies all labels in a single call and returns them sorted, so the
# output order is stable (the previous set-based loop had no defined order and
# rebuilt a list from the label array for every label).
labels, counts = np.unique(cluster_labels, return_counts=True)
for label, count in zip(labels, counts):
    print(label)
    print(count)

-1
395


In [100]:
# Count how many rows carry the label -1
int(np.sum(cluster_labels == -1))

7