# K Means Clustering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
cust_df = pd.read_csv('Cust_Segmentation.csv')
cust_df.head()

Unnamed: 0,Customer Id,Age,Edu,Years Employed,Income,Card Debt,Other Debt,Defaulted,Address,DebtIncomeRatio
0,1,41,2,6,19,0.124,1.073,0.0,NBA001,6.3
1,2,47,1,26,100,4.582,8.218,0.0,NBA021,12.8
2,3,33,2,10,57,6.111,5.802,1.0,NBA013,20.9
3,4,29,2,4,19,0.681,0.516,0.0,NBA009,6.3
4,5,47,1,31,253,9.308,8.908,0.0,NBA008,7.2


In [3]:
df = cust_df.drop('Address', axis = 1)
df.head()

Unnamed: 0,Customer Id,Age,Edu,Years Employed,Income,Card Debt,Other Debt,Defaulted,DebtIncomeRatio
0,1,41,2,6,19,0.124,1.073,0.0,6.3
1,2,47,1,26,100,4.582,8.218,0.0,12.8
2,3,33,2,10,57,6.111,5.802,1.0,20.9
3,4,29,2,4,19,0.681,0.516,0.0,6.3
4,5,47,1,31,253,9.308,8.908,0.0,7.2


In [4]:
df.dtypes

Customer Id          int64
Age                  int64
Edu                  int64
Years Employed       int64
Income               int64
Card Debt          float64
Other Debt         float64
Defaulted          float64
DebtIncomeRatio    float64
dtype: object

In [5]:
X = df.values
X[0:5]

array([[1.000e+00, 4.100e+01, 2.000e+00, 6.000e+00, 1.900e+01, 1.240e-01,
        1.073e+00, 0.000e+00, 6.300e+00],
       [2.000e+00, 4.700e+01, 1.000e+00, 2.600e+01, 1.000e+02, 4.582e+00,
        8.218e+00, 0.000e+00, 1.280e+01],
       [3.000e+00, 3.300e+01, 2.000e+00, 1.000e+01, 5.700e+01, 6.111e+00,
        5.802e+00, 1.000e+00, 2.090e+01],
       [4.000e+00, 2.900e+01, 2.000e+00, 4.000e+00, 1.900e+01, 6.810e-01,
        5.160e-01, 0.000e+00, 6.300e+00],
       [5.000e+00, 4.700e+01, 1.000e+00, 3.100e+01, 2.530e+02, 9.308e+00,
        8.908e+00, 0.000e+00, 7.200e+00]])

## Preprocessing

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
X = np.nan_to_num(X)
X = StandardScaler().fit_transform(X)
X[0:5]

array([[-1.7300143 ,  0.74291541,  0.31212243, -0.37878978, -0.71845859,
        -0.68381116, -0.59048916, -0.52379654, -0.57652509],
       [-1.72593888,  1.48949049, -0.76634938,  2.5737211 ,  1.38432469,
         1.41447366,  1.51296181, -0.52379654,  0.39138677],
       [-1.72186347, -0.25251804,  0.31212243,  0.2117124 ,  0.26803233,
         2.13414111,  0.80170393,  1.90913822,  1.59755385],
       [-1.71778805, -0.75023477,  0.31212243, -0.67404087, -0.71845859,
        -0.42164323, -0.75446707, -0.52379654, -0.57652509],
       [-1.71371263,  1.48949049, -0.76634938,  3.31184882,  5.35624866,
         3.63890032,  1.71609424, -0.52379654, -0.44250653]])

## Training the model

In [8]:
from sklearn import cluster

In [9]:
clusterNum = 3
model = cluster.KMeans()
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [10]:
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [11]:
category_labels = model.labels_

## Saving the output into a new CSV File

In [12]:
cust_df['Category'] = category_labels

In [13]:
cust_df.to_csv('Cust_Segmentation_output.csv')

In [14]:
cust_df.groupby('Category').mean()

Unnamed: 0_level_0,Customer Id,Age,Edu,Years Employed,Income,Card Debt,Other Debt,Defaulted,DebtIncomeRatio
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,430.142857,29.266667,1.771429,2.847619,26.27619,0.914629,1.638971,1.0,10.220952
1,371.980769,41.429487,1.416667,15.666667,62.987179,1.243635,2.521769,0.008403,6.623077
2,686.066265,32.126506,1.289157,6.180723,31.572289,0.842361,1.760289,0.0,8.522892
3,421.098765,34.246914,3.469136,5.333333,43.740741,1.061062,2.409827,0.03125,8.538272
4,435.461538,42.923077,1.846154,16.551282,92.820513,3.719641,8.499256,0.031746,15.076923
5,228.542857,31.057143,1.308571,5.457143,27.817143,0.661806,1.475349,0.0,7.896571
6,406.106667,36.013333,2.026667,6.893333,42.84,3.38932,5.541053,0.902778,21.862667
7,433.285714,46.285714,1.928571,22.642857,213.142857,11.737786,16.239857,0.727273,16.821429
