# K-Prototype in Clustering Mixed attributes

- dissimilarity measure 
- prototype: center of the cluster 
<img src="img_clustering/prototype.png" alt="datetime" style="width: 400px;"/>


   - $1$: $1^{st}$ cluster
   - $r$: Numerical attributes
   - $c$: Categorical attributes
   - $y_{i1}$: Object $X_i$ belongs to cluster 1
   - $x_{i1}$: $j^{th}$ attribute
   - $q_{ij}$: $j^{th}$ attribute of the prototype in cluster 1
   - $m_r$: No.of Numerical attributes
   - $m_c$: No.of categorical attributes
   - $m$ = $m_r$ + $m_c$
   - $r_1$: Weight of the categorical attribute in cluster 1
   - If $r_1$ is small, it indicates that the clustering is dominated by numerical attributes
   - If $r_1$ is large, it indicates that the clustering is dominated by categorical attributes
   - $E_1$ = Minimum sum of the difference of all the elements and the prototypes in cluster 1
   - $E_1^r$= Minimum sum of the difference of the numerical attributes of all the elements and the prototypes in cluster 1
   - $E_1^c$= Minimum sum of the difference of the categorical attributes of all the elements and the prototypes in cluster 1

# AFSNT.csv

In [1]:
import pandas as pd

In [2]:
df_afsnt = pd.read_csv("data/AFSNT.csv", encoding="CP949")

In [3]:
def time_dicrete(stt):
    time_dic = []
    for i in range(len(stt)):
        st = int(stt[i].split(':')[0])*60 + int(stt[i].split(':')[1])
        if st >= 300 and st < 480:
            time_dic.append('5시-8시')
        elif st >= 480 and st < 660:
            time_dic.append('8시-11시')
        elif st >= 660 and st < 840:
            time_dic.append('11시-14시')
        elif st >= 840 and st < 1020:
            time_dic.append('14시-17시')
        elif st >= 1020 and st < 1200:
            time_dic.append('17시-20시')
        elif st >= 1200 and st < 1380:
            time_dic.append('20시-23시')
        elif st >=1380 or st<300:
            time_dic.append('23시-3시')
    return time_dic 

In [4]:
df_afsnt['STT_DIC'] = time_dicrete(df_afsnt.STT)

In [5]:
# 시간 format 변경
df_afsnt["STT"] = pd.to_datetime(df_afsnt["STT"], format = "%H:%M").dt.time
df_afsnt["ATT"] = pd.to_datetime(df_afsnt["ATT"], format = "%H:%M").dt.time

In [6]:
# df.rename( columns= { '변경전': '변경후'} )
df_afsnt = df_afsnt.rename(columns={ 
    "SDT_YY": "year", "SDT_MM":"month", "SDT_DD":"day",  "SDT_DY":"wday", 
    "ARP":"origin", "ODP":"dest", 
    "FLO":"airline", "FLT":"flight" ,"REG":"tailnum", 
    "AOD":"is_arrive", 
    "IRR":"is_regular", 
    "STT":"sched_time", "ATT":"real_time", 
    "DLY":"is_delay", "DRR":"cause_delay", 
    "CNL":"is_cancel", "CNR":"cause_cancel"
})

In [7]:
# year, month, day를 합쳐서 date 변수 생성
df_afsnt["date"] = pd.to_datetime(df_afsnt[
    ["year", "month", "day"]])

In [8]:
# sched_datetime, real datetime 변수 생성 
df_afsnt["sched_datetime"] = pd.to_datetime(df_afsnt['date'].astype(str) + " " + df_afsnt["sched_time"].astype(str))
df_afsnt["real_datetime"] = pd.to_datetime(df_afsnt['date'].astype(str) + " " + df_afsnt["real_time"].astype(str))

In [12]:
# delay 변수 생성
# 양수면 늦게 도착, 음수면 빨리 도착 
temp = df_afsnt["real_datetime"] - df_afsnt["sched_datetime"] 

In [13]:
# dataframe화 
temp = pd.DataFrame(temp.dt.components)

In [14]:
df_afsnt['delay'] = temp['minutes'] + 60 * temp['hours'] + 1440 * temp['days']
del temp

In [15]:
import numpy as np
df_afsnt['delay2'] = np.where(abs(df_afsnt['delay']) < 30, 0, df_afsnt['delay'])

In [16]:
 df_afsnt['key'] = df_afsnt["origin"].astype(str)+" "+df_afsnt["is_arrive"].astype(str) + " " + df_afsnt['date'].astype(str) + " " + df_afsnt["STT_DIC"].astype(str) 
    

In [17]:
b = pd.DataFrame(df_afsnt.key.value_counts())
b = b.reset_index()
b.columns = ['key','busy']

In [18]:
df_afsnt_busy = pd.merge(df_afsnt, b, on = 'key')

# feature 선택

In [19]:
# flight와 tailnum은 아주아주 많군 
# delay를 넣을지 delay2를 넣을지
c = df_afsnt_busy[['wday','airline','is_arrive','is_regular',
                   'is_delay','cause_delay','is_cancel','cause_cancel','STT_DIC','delay','busy']]
origin = df_afsnt_busy['origin']
dest = df_afsnt_busy['dest']

In [24]:
c_sim = df_afsnt_busy[['airline','busy','delay']]

In [35]:
c_sim_drop = c_sim.drop('airline', axis = 1)

In [36]:
c_sim_drop

Unnamed: 0,busy,delay,airlineID
0,47,27,9
1,47,13,9
2,47,20,9
3,47,11,9
4,47,15,9
5,47,20,9
6,47,16,9
7,47,17,9
8,47,18,9
9,47,17,1


# Standardising for continuous variables

In [45]:
columns_to_normalize    = ['busy', 'delay']
c_sim_drop[columns_to_normalize] = c_sim_drop[columns_to_normalize].apply(lambda x: (x - x.mean())/(x.max() - x.min()))


# Encoding
## Label encoding

In [29]:
from sklearn.preprocessing import LabelEncoder

In [31]:
le = LabelEncoder()
# LabelEncoder은 fit 후 transform
le.fit(c_sim.airline)  
le.transform(c_sim.airline)
c_sim['airlineID'] = le.transform(c_sim.airline)

# class 확인 
print('인코딩 클래스:', le.classes_) 

인코딩 클래스: ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


## One hot encoding


In [23]:
c_en = pd.get_dummies(c)
c_en.iloc[:, 0:2] = c_en.iloc[:, 0:2].astype(float)

In [25]:
c_sim_en = pd.get_dummies(c_sim)
c_sim_en.iloc[:, 0:2] = c_sim_en.iloc[:, 0:2].astype(float)

In [40]:
c_sim_en

Unnamed: 0,busy,delay,airline_A,airline_B,airline_C,airline_D,airline_E,airline_F,airline_G,airline_H,airline_I,airline_J,airline_K,airline_L
0,47.0,27.0,0,0,0,0,0,0,0,0,0,1,0,0
1,47.0,13.0,0,0,0,0,0,0,0,0,0,1,0,0
2,47.0,20.0,0,0,0,0,0,0,0,0,0,1,0,0
3,47.0,11.0,0,0,0,0,0,0,0,0,0,1,0,0
4,47.0,15.0,0,0,0,0,0,0,0,0,0,1,0,0
5,47.0,20.0,0,0,0,0,0,0,0,0,0,1,0,0
6,47.0,16.0,0,0,0,0,0,0,0,0,0,1,0,0
7,47.0,17.0,0,0,0,0,0,0,0,0,0,1,0,0
8,47.0,18.0,0,0,0,0,0,0,0,0,0,1,0,0
9,47.0,17.0,0,1,0,0,0,0,0,0,0,0,0,0


In [42]:
columns_to_normalize    = ['busy', 'delay']
c_sim_en[columns_to_normalize] = c_sim_en[columns_to_normalize].apply(lambda x: (x - x.mean())/(x.max() - x.min()))



# Clustering

In [22]:
import numpy as np
from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen']



In [37]:
kproto = KPrototypes(n_clusters=3, init='Cao', verbose=2)
clusters = kproto.fit_predict(c_sim_drop, categorical=[2])
# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)
# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 336448, ncost: 7788600180.523233
Run: 1, iteration: 2/100, moves: 301846, ncost: 6075128818.033095
Run: 1, iteration: 3/100, moves: 59000, ncost: 1141049333.6768756
Run: 1, iteration: 4/100, moves: 13938, ncost: 1135351990.7790496
Run: 1, iteration: 5/100, moves: 12216, ncost: 1133031476.1586125
Run: 1, iteration: 6/100, moves: 10611, ncost: 1131736458.1469867
Run: 1, iteration: 7/100, moves: 8738, ncost: 1131027881.851043
Run: 1, iteration: 8/100, moves: 7045, ncost: 1130639021.7078032
Run: 1, iteration: 9/100, moves: 5332, ncost: 1130434711.0934799
Run: 1, iteration: 10/100, moves: 4255, ncost: 1130312352.7288983
Run: 1, iteration: 11/100, moves: 2972, ncost: 1130258544.9266942
Run: 1, iteration: 12/100, moves: 2465, ncost: 1130218932.7189233
Run: 1, iteration: 13/100, moves: 2532, ncost: 1130183700.9224029
Run: 1, iteration: 14/100, moves: 1352, ncost: 1130173139.4988549


KeyboardInterrupt: 

In [46]:
kproto = KPrototypes(n_clusters=3, init='Cao', verbose=1, max_iter= 1)
clusters = kproto.fit_predict(c_sim_drop, categorical=[2])
# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)
# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/1, moves: 17120, ncost: 79555.33708248437
Run: 1, iteration: 2/1, moves: 2167, ncost: 79553.4961585071
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/1, moves: 139944, ncost: 76642.51111630406
Run: 2, iteration: 2/1, moves: 31184, ncost: 76529.50182202719
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/1, moves: 356114, ncost: 80786.8057497249
Run: 3, iteration: 2/1, moves: 71072, ncost: 79582.29660773957
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 4, iteration: 1/1, moves: 324707, ncost: 72506.39227309344
Run: 4, iteration: 2/1, moves: 51613, ncost: 72290.27763340427
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 5, iteration: 1/1, moves: 128063, ncost: 76586.15369745222
Run: 5, iteration: 2/1, moves: 1

In [52]:
kproto.labels_

array([0, 0, 0, ..., 2, 2, 2], dtype=uint16)

In [49]:
for s, c in zip(origin, clusters):
    print("Result: {}, cluster:{}".format(s, c))
# Plot the results
for i in set(kproto.labels_):
    index = kproto.labels_ == i
    plt.plot(c_sim_drop[index, 0], c_sim_drop[index, 1], 'o')
    plt.suptitle('Data points categorized with category score', fontsize=18)
    plt.xlabel('Category Score', fontsize=16)
    plt.ylabel('Category Type', fontsize=16)
plt.show()
# Clustered result
fig1, ax3 = plt.subplots()
scatter = ax3.scatter(origin, clusters, c=clusters, s=50)
ax3.set_xlabel('Data points')
ax3.set_ylabel('Cluster')
plt.colorbar(scatter)
ax3.set_title('Data points classifed according to known centers')
plt.show()
result = zip(origin, kproto.labels_)
sortedR = sorted(result, key=lambda x: x[1])
print(sortedR)

TypeError: unhashable type: 'numpy.ndarray'