In [94]:
import pandas as pd
import numpy as np

df = pd.read_csv('diabetes_012_health_indicators_BRFSS2015.csv')
df.shape

(253680, 22)

# Recoding and Clustering

In [95]:
# FEATURE SELECTION: drop column based on random forest result
cols_to_save = ['Diabetes_012','GenHlth', 'HighBP', 'BMI', 'Age', 'HighChol',
                'DiffWalk', 'Income', 'PhysHlth', 'HeartDiseaseorAttack']
df_fs = df.filter(cols_to_save)
df_fs.shape

(253680, 10)

In [96]:
age_recode = {
    1: 1, #18-24
    2: 2, 3: 2, 4: 2,  # 25-39
    5: 3, 6: 3, 7: 3, # 40-54
    8: 4, 9: 4,# 55-64
    10: 5, 11: 5, 12: 5, 13: 5 # 65+
}

df_fs['Age'] = df_fs['Age'].map(age_recode)
df_fs['Age'].value_counts()

Age
5    89070
4    64076
3    62290
2    32544
1     5700
Name: count, dtype: int64

In [97]:
income_recode = {
    1: 1, 2: 1, # proverty $15000- 
    3: 2, 4: 2, # low income $15000-$24999
    5: 3, 6: 3, # medium-low income 25000-$49999
    7: 4, # medium income $50000 - $75000
    8: 5 # high income $75000+
}

df_fs['Income'] = df_fs['Income'].map(income_recode)
df_fs['Income'].value_counts()

Income
5    90385
3    62353
4    43219
2    36129
1    21594
Name: count, dtype: int64

In [98]:
# Prep for DBSCAN: drop response variable and numerical predictors
df_cluster = df_fs.drop(['Diabetes_012','BMI','PhysHlth'], axis = 1)
df_cluster = df_cluster.astype(int)
print(df_cluster.shape)

(253680, 7)


pip install kmodes

In [100]:
from kmodes.kmodes import KModes

# init='Cao: initial centroids based on density of points
# n_clusters=44: all possible cluster = 2^4 * 5^3 = 2000, use sqrt
km = KModes(n_clusters=44, init='Cao', n_init=3, verbose=0)
clusters = km.fit_predict(df_cluster)

# Add cluster membership to DataFrame
df_cluster['Membership'] = clusters
df_cluster.head()

Unnamed: 0,GenHlth,HighBP,Age,HighChol,DiffWalk,Income,HeartDiseaseorAttack,Membership
0,5,1,4,1,1,2,0,41
1,3,0,3,0,0,1,0,20
2,5,1,4,1,1,5,0,5
3,2,1,5,0,0,3,0,17
4,2,1,5,1,0,2,0,26


In [101]:
# Summary DataFrame on Membership
cluster_summary = df_cluster['Membership'].value_counts().reset_index()
cluster_summary.columns = ['Cluster', 'Size']
cluster_summary['Percentage(%)'] = (cluster_summary['Size'] / df_cluster.shape[0]) * 100

cluster_summary.head()

Unnamed: 0,Cluster,Size,Percentage(%)
0,0,36606,14.429991
1,1,23868,9.408704
2,9,12466,4.914065
3,6,11925,4.700804
4,8,10551,4.159177


In [102]:
# Concatenate back response variable and numerical predictors
df_cluster = df_cluster.join(df_fs[['Diabetes_012', 'BMI', 'PhysHlth']])
print(df_cluster.shape)
df_cluster.head()

(253680, 11)


Unnamed: 0,GenHlth,HighBP,Age,HighChol,DiffWalk,Income,HeartDiseaseorAttack,Membership,Diabetes_012,BMI,PhysHlth
0,5,1,4,1,1,2,0,41,0.0,40.0,15.0
1,3,0,3,0,0,1,0,20,0.0,25.0,0.0
2,5,1,4,1,1,5,0,5,0.0,28.0,30.0
3,2,1,5,0,0,3,0,17,0.0,27.0,0.0
4,2,1,5,1,0,2,0,26,0.0,24.0,0.0


In [105]:
# Calculate the count and percentage of Diabetes_012 == 1 within each cluster
diabetes_rates = df_cluster.groupby('Membership').apply(
    lambda g: pd.Series({
        'Diabetes_Count': (g['Diabetes_012'] == 1).sum(),
        'Diabetes_Percentage': 100 * (g['Diabetes_012'] == 1).sum() / len(g)
    })
).reset_index()

# add to cluster_summary
cluster_summary = cluster_summary.merge(diabetes_rates, left_on='Cluster', right_on='Membership')

In [106]:
cluster_summary

Unnamed: 0,Cluster,Size,Percentage(%),Membership,Diabetes_Count,Diabetes_Percentage
0,0,36606,14.429991,0,348.0,0.950664
1,1,23868,9.408704,1,723.0,3.02916
2,9,12466,4.914065,9,70.0,0.561527
3,6,11925,4.700804,6,150.0,1.257862
4,8,10551,4.159177,8,324.0,3.070799
5,19,9582,3.7772,19,51.0,0.532248
6,16,9078,3.578524,16,34.0,0.374532
7,17,8604,3.391675,17,209.0,2.429103
8,13,8455,3.332939,13,136.0,1.608516
9,10,8280,3.263955,10,223.0,2.693237


In [107]:
# Merge cluster with similar Diabetes_Percentage within a range of ±0.1%
cluster_summary = cluster_summary.sort_values(by='Diabetes_Percentage')

# Func to regroup based on Diabetes_Percentage range
def group_clusters(data, tolerance=0.1): #±0.1%: the data is with %
    groups = []
    temp_group = []
    last_percentage = -1

    for _, row in data.iterrows():
        if last_percentage < 0 or abs(row['Diabetes_Percentage'] - last_percentage) <= tolerance:
            temp_group.append(row)
        else:
            groups.append(temp_group)
            temp_group = [row]
        last_percentage = row['Diabetes_Percentage']
    groups.append(temp_group)  # add  last group

    # Assign new cluster ID to each group
    new_cluster_id = 0
    for group in groups:
        for item in group:
            data.loc[item.name, 'New_Cluster_ID'] = new_cluster_id
        new_cluster_id += 1

    return data

new_cluster_summary = group_clusters(cluster_summary)
new_cluster_summary

Unnamed: 0,Cluster,Size,Percentage(%),Membership,Diabetes_Count,Diabetes_Percentage,New_Cluster_ID
11,36,7286,2.872122,36,16.0,0.219599,0.0
6,16,9078,3.578524,16,34.0,0.374532,1.0
5,19,9582,3.7772,19,51.0,0.532248,2.0
2,9,12466,4.914065,9,70.0,0.561527,2.0
24,38,3454,1.361558,38,24.0,0.694847,3.0
29,29,2258,0.890098,29,21.0,0.930027,4.0
0,0,36606,14.429991,0,348.0,0.950664,4.0
28,39,2806,1.106118,39,27.0,0.962224,4.0
21,18,4268,1.682435,18,46.0,1.077788,5.0
19,28,4845,1.909886,28,55.0,1.135191,5.0


In [109]:
# Merge to assign new cluster IDs based on the Diabetes_Percentage similarities
df_cluster = df_cluster.merge(cluster_summary[['Membership', 'New_Cluster_ID']], on='Membership', how='left')

# Replace the old cluster IDs with new IDs
df_cluster['Membership'] = df_cluster['New_Cluster_ID']
df_cluster.drop('New_Cluster_ID', axis=1, inplace=True)

In [112]:
df_cluster['Membership'] = df_cluster['Membership'].astype(int)
df_cluster

Unnamed: 0,GenHlth,HighBP,Age,HighChol,DiffWalk,Income,HeartDiseaseorAttack,Membership,Diabetes_012,BMI,PhysHlth
0,5,1,4,1,1,2,0,16,0.0,40.0,15.0
1,3,0,3,0,0,1,0,10,0.0,25.0,0.0
2,5,1,4,1,1,5,0,14,0.0,28.0,30.0
3,2,1,5,0,0,3,0,12,0.0,27.0,0.0
4,2,1,5,1,0,2,0,13,0.0,24.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
253675,3,1,3,1,0,4,0,13,0.0,45.0,5.0
253676,4,1,5,1,1,2,0,14,2.0,18.0,0.0
253677,1,0,2,0,0,1,0,2,0.0,28.0,0.0
253678,3,1,3,0,0,1,0,7,0.0,23.0,0.0


# Outlier Detection

In [158]:
def remove_outliers_IQR(group):
    Q1 = group['BMI'].quantile(0.25)
    Q3 = group['BMI'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return group[(group['BMI'] >= lower_bound) & (group['BMI'] <= upper_bound)]

# Apply the function to each group and concatenate the results
cleaned_df = df_cluster.groupby('Membership').apply(remove_outliers_IQR).reset_index(drop=True)

In [159]:
cleaned_df

Unnamed: 0,GenHlth,HighBP,Age,HighChol,DiffWalk,Income,HeartDiseaseorAttack,Membership,Diabetes_012,BMI,PhysHlth,is_outlier
0,1,0,3,0,0,5,0,0,0.0,23.0,0.0,1
1,1,0,3,0,0,5,0,0,0.0,23.0,0.0,1
2,1,0,3,0,0,5,0,0,0.0,22.0,0.0,1
3,1,0,3,0,0,5,0,0,0.0,22.0,0.0,1
4,1,0,3,0,0,5,0,0,0.0,25.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
245562,5,1,4,1,1,1,0,16,0.0,29.0,4.0,1
245563,5,1,2,1,1,2,0,16,0.0,26.0,30.0,1
245564,3,1,4,1,1,2,1,16,0.0,37.0,2.0,1
245565,4,1,4,1,0,3,1,16,2.0,27.0,5.0,1


In [164]:
cleaned_df = cleaned_df.drop(['is_outlier'], axis = 1)
cleaned_df

Unnamed: 0,GenHlth,HighBP,Age,HighChol,DiffWalk,Income,HeartDiseaseorAttack,Membership,Diabetes_012,BMI,PhysHlth
0,1,0,3,0,0,5,0,0,0.0,23.0,0.0
1,1,0,3,0,0,5,0,0,0.0,23.0,0.0
2,1,0,3,0,0,5,0,0,0.0,22.0,0.0
3,1,0,3,0,0,5,0,0,0.0,22.0,0.0
4,1,0,3,0,0,5,0,0,0.0,25.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
245562,5,1,4,1,1,1,0,16,0.0,29.0,4.0
245563,5,1,2,1,1,2,0,16,0.0,26.0,30.0
245564,3,1,4,1,1,2,1,16,0.0,37.0,2.0
245565,4,1,4,1,0,3,1,16,2.0,27.0,5.0


In [139]:
# Drop prediabetes
cleaned_df = cleaned_df[cleaned_df['Diabetes_012'] != 1]
Diabetes_012_recode = {0: 0, 2: 1}

cleaned_df['Diabetes_012'] = cleaned_df['Diabetes_012'].map(Diabetes_012_recode)
cleaned_df['Diabetes_012'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Diabetes_012'] = cleaned_df['Diabetes_012'].map(Diabetes_012_recode)


Diabetes_012
0    208141
1     33016
Name: count, dtype: int64

In [143]:
count = cleaned_df['Membership'].value_counts()
percent = count/cleaned_df.shape[0]
percent

Membership
14    0.203187
4     0.166846
7     0.099097
2     0.087955
13    0.086728
12    0.048582
6     0.047600
10    0.041367
1     0.036325
5     0.036288
8     0.033870
15    0.033518
0     0.029213
16    0.022404
3     0.013692
9     0.011615
11    0.001713
Name: count, dtype: float64

In [144]:
# member 11 is too small (<0.01, drop it)
cleaned_df = cleaned_df[cleaned_df['Membership'] != 11]
count = cleaned_df['Membership'].value_counts()
percent = count/cleaned_df.shape[0]
percent

Membership
14    0.203536
4     0.167132
7     0.099267
2     0.088106
13    0.086877
12    0.048666
6     0.047681
10    0.041438
1     0.036387
5     0.036350
8     0.033928
15    0.033575
0     0.029263
16    0.022443
3     0.013716
9     0.011635
Name: count, dtype: float64

In [165]:
# Data set 1: balance by equal sampling from each group

groups = cleaned_df.groupby('Membership')

# Sampling from each group
df_balanced = groups.apply(lambda x: x.sample(n=1000, random_state=42) if len(x) > 1000 else x)
df_balanced = df_balanced.reset_index(drop=True)
df_balanced.shape

(16423, 11)

In [146]:
# write file
filename = 'balanced.csv'
df_balanced.to_csv(filename, index=False)

In [147]:
# Data set 2: with 3 variable

# Randomly sample 16000 observations
df_rand_3 = cleaned_df.sample(n=16000, random_state=42)

# Keep 3 predictors only
cols_to_save = ['Diabetes_012', 'BMI','PhysHlth', 'Membership']
df_rand_3 = df_rand_3.filter(cols_to_save)
df_rand_3.shape

(16000, 4)

In [148]:
# write file
filename = 'random.csv'
df_rand_3.to_csv(filename, index=False)

In [153]:
# Test set

# Prevant overlapping
excluded_indices = df_rand_3.index.union(df_balanced.index)
remaining_df = cleaned_df.loc[~cleaned_df.index.isin(excluded_indices)]

# 30% of 160000: 4800 obs from remaining df
if len(remaining_df) >= 4800:
    test_set = remaining_df.sample(n=4800, random_state=42)
else:
    print("Not enough data to sample 4800 unique entries.")

test_set.shape

(4800, 12)

In [166]:
test_set

Unnamed: 0,GenHlth,HighBP,Age,HighChol,DiffWalk,Income,HeartDiseaseorAttack,Membership,Diabetes_012,BMI,PhysHlth,is_outlier
107072,3,1,3,0,0,4,0,7,0,25.0,0.0,1
83184,2,0,1,0,0,2,0,5,0,37.0,0.0,1
189371,4,1,5,0,1,4,0,14,0,24.0,20.0,1
126477,2,0,4,1,0,4,0,8,0,28.0,0.0,1
218555,3,1,5,1,1,3,1,14,0,31.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
29220,1,0,4,0,0,5,0,2,0,24.0,0.0,1
137859,3,0,3,0,0,3,0,10,0,28.0,0.0,1
40357,2,0,3,0,0,3,0,3,0,32.0,1.0,1
46256,1,1,4,1,0,5,0,4,0,29.0,0.0,1


In [168]:
test_set_balanced = test_set.drop(['is_outlier'],axis = 1)
test_set_balanced

Unnamed: 0,GenHlth,HighBP,Age,HighChol,DiffWalk,Income,HeartDiseaseorAttack,Membership,Diabetes_012,BMI,PhysHlth
107072,3,1,3,0,0,4,0,7,0,25.0,0.0
83184,2,0,1,0,0,2,0,5,0,37.0,0.0
189371,4,1,5,0,1,4,0,14,0,24.0,20.0
126477,2,0,4,1,0,4,0,8,0,28.0,0.0
218555,3,1,5,1,1,3,1,14,0,31.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
29220,1,0,4,0,0,5,0,2,0,24.0,0.0
137859,3,0,3,0,0,3,0,10,0,28.0,0.0
40357,2,0,3,0,0,3,0,3,0,32.0,1.0
46256,1,1,4,1,0,5,0,4,0,29.0,0.0


In [169]:
# write file
filename = 'balanced_test.csv'
test_set_balanced.to_csv(filename, index=False)

In [170]:
cols_to_save = ['Diabetes_012', 'BMI','PhysHlth', 'Membership']
test_set_rand_3 = test_set.filter(cols_to_save)
test_set_rand_3.shape

(4800, 4)

In [172]:
# write file
filename = 'random_test.csv'
test_set_rand_3.to_csv(filename, index=False)