## Imports

In [1]:
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import math

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes

## Reading in Data

In [6]:
# Load the cleaned dataset; the path is resolved relative to the current working directory.
db = pd.read_csv(filepath_or_buffer="./datasets/cleaned.csv")
# Preview the first five rows as a quick sanity check of the load.
db.head(n=5)

Unnamed: 0,complaint_number,status,date_entered,house_number,zip_code,house_street,special_district,complaint_category,unit,inspection_date,days_until_inspection,med_inc_zip
0,1245555,CLOSED,2009-01-02,930,10025,WEST END AVENUE,NOT SPECIAL,58,BOILR,2009-06-02,151,82352
1,1245608,CLOSED,2009-01-02,428,10013,BROADWAY,NOT SPECIAL,23,SCFLD,2009-10-09,280,106056
2,1245621,CLOSED,2009-01-02,146,10001,WEST 28 STREET,NOT SPECIAL,63,ELEVR,2009-01-22,20,85221
3,1245634,CLOSED,2009-01-03,388,10013,BROADWAY,NOT SPECIAL,56,BOILR,2009-01-07,4,106056
4,1245648,CLOSED,2009-01-03,375,10016,3 AVENUE,NOT SPECIAL,59,ELCTR,2009-01-08,5,109250


## Preprocessing

In [18]:
# Columns left out of the feature matrix that is passed to the clustering step.
drop_cols = [
    "complaint_number", "days_until_inspection", "status",
    "house_number", "house_street",
]
# `db` itself is not modified; `X` is a new frame without the columns above.
X = db.drop(labels=drop_cols, axis="columns")

## Clustering

Since the dataset contains mostly categorical data along with some numerical data, a typical clustering algorithm like KMeans will not work. While we could one-hot encode the categorical columns, the resulting 0/1 indicators do not give a meaningful range of values on which to base distances, so a more advanced method, KPrototypes, will be used instead. This algorithm splits the categorical and numerical features apart and then applies KModes and KMeans, respectively, to them to create the most appropriate clusters.

In [20]:
# Quick look at the remaining features to recall which ones are categorical
X.head(n=5)

Unnamed: 0,date_entered,zip_code,special_district,complaint_category,unit,inspection_date,med_inc_zip
0,2009-01-02,10025,NOT SPECIAL,58,BOILR,2009-06-02,82352
1,2009-01-02,10013,NOT SPECIAL,23,SCFLD,2009-10-09,106056
2,2009-01-02,10001,NOT SPECIAL,63,ELEVR,2009-01-22,85221
3,2009-01-03,10013,NOT SPECIAL,56,BOILR,2009-01-07,106056
4,2009-01-03,10016,NOT SPECIAL,59,ELCTR,2009-01-08,109250


In [22]:
# Trying k-prototypes: five clusters, with a fixed random_state so reruns are repeatable
kp = KPrototypes(
    n_clusters=5,
    random_state=42,
)

In [23]:
# Name the categorical features explicitly and look their positions up in X,
# instead of hard-coding [0, 1, 2, 3, 4, 5]. Positional indices silently point
# at the wrong columns if X is ever reordered or gains a column.
categorical_cols = [
    "date_entered",
    "zip_code",
    "special_district",
    "complaint_category",
    "unit",
    "inspection_date",
]
# NOTE(review): assumes these are the first six columns of X, in this order,
# as the X.head() preview suggests -- in that case this matches the original
# [0, 1, 2, 3, 4, 5]. Confirm against X.columns.
# get_loc raises KeyError for a missing column rather than mis-indexing.
categorical_idx = [X.columns.get_loc(col) for col in categorical_cols]

# Columns of X not listed in `categorical` are handled as numerical features.
kp_model = kp.fit(X, categorical=categorical_idx)

In [None]:
# The original cell was an unfinished `kp_model.` expression, which is a
# SyntaxError and stops a Restart & Run All. Show how many rows were assigned
# to each cluster instead.
# NOTE(review): the intended attribute is a guess -- swap in
# kp_model.cluster_centroids_ or kp_model.cost_ if that was the goal.
pd.Series(kp_model.labels_, index=X.index, name="cluster").value_counts().sort_index()