# Cluster Analysis

In [99]:
import warnings
warnings.filterwarnings('ignore')

from helper_funcs import *
from sqlalchemy import create_engine

import numpy as np
import scipy.stats as st
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# SQLAlchemy Engine
engine = create_engine(generate_url())

In [46]:
df = pd.read_sql_table(table_name='features', con=engine.connect())
df.head(5)

Unnamed: 0,OrderYear,OrderMonth,DaysToShip,Segment,Region,State,Department,Division,Profit
0,2015,11,4,Corporate,East,Vermont,Furniture,Bookcases,1013.13
1,2016,1,4,Corporate,Central,Wisconsin,Furniture,Bookcases,407.13
2,2017,5,4,Consumer,South,Georgia,Furniture,Bookcases,374.63
3,2016,6,2,Consumer,South,Georgia,Furniture,Bookcases,291.38
4,2016,10,0,Corporate,East,Connecticut,Furniture,Bookcases,271.42


In [4]:
df.shape

(8058, 9)

In [59]:
# Detect columns as either numeric or categorical
num_cols = df.select_dtypes(exclude='object').columns.tolist()
cat_cols = df.select_dtypes(include='object').columns.tolist()

print('Categorical Features:', cat_cols)
#cat_cols.tolist()

Categorical Features: ['Segment', 'Region', 'State', 'Department', 'Division']


#### KMeans with Recoded Categorical Variables

SKLearn's OneHotEncoder may work too, but for our purposes, we're gonna do this quick and dirty.

In [128]:
# Encode categorical data with dummy variables, normalize numerical features, then concatenate back together.
# For consistency among feature column labels, make quick adjustments.
norm_df = pd.concat([df[num_cols].apply(st.zscore), pd.get_dummies(df[cat_cols])], axis=1)
norm_df = norm_df.rename(mapper={s:str(s).replace(' ', '_').strip() for s in norm_df.columns})
norm_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8058 entries, 0 to 8057
Data columns (total 80 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   OrderYear                   8058 non-null   float64
 1   OrderMonth                  8058 non-null   float64
 2   DaysToShip                  8058 non-null   float64
 3   Profit                      8058 non-null   float64
 4   Segment_Consumer            8058 non-null   uint8  
 5   Segment_Corporate           8058 non-null   uint8  
 6   Segment_Home Office         8058 non-null   uint8  
 7   Region_Central              8058 non-null   uint8  
 8   Region_East                 8058 non-null   uint8  
 9   Region_South                8058 non-null   uint8  
 10  Region_West                 8058 non-null   uint8  
 11  State_Alabama               8058 non-null   uint8  
 12  State_Arizona               8058 non-null   uint8  
 13  State_Arkansas              8058 

In [139]:
# Correct a TypeError
norm_df.columns = norm_df.columns.astype(str)

In [137]:
inertial_vals = []

K_vals  = range(1, 10)
for k in K_vals:
    kmeans = KMeans(k)
    kmeans.fit(norm_df)
    inertial_vals.append(kmeans.inertia_)

plt.plot(K_vals, inertial_vals, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Sum of Squared Distances (Inertia)')
plt.show()

In [156]:
kmeans = KMeans(n_clusters=5, random_state=101).fit(norm_df)
df['kMeans Cluster Label'] = kmeans.labels_

In [157]:
df['kMeans Cluster Label'].value_counts()

1    2411
2    2059
0    1989
3    1581
4      18
Name: kMeans Cluster Label, dtype: int64

In [158]:
# TODO: Explore possible patterns among clusters

In [159]:
engine.connect().close()
engine.dispose()