In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
raw = pd.read_excel('Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls', sheet_name=None)

In [3]:
raw.keys()

dict_keys(['Information', 'Training_Data', 'Test_Data'])

In [4]:
data = pd.concat([raw['Training_Data'], raw['Test_Data']], ignore_index=True)

In [5]:
data.head()

Unnamed: 0,STG,SCG,STR,LPR,PEG,UNS,Unnamed: 6,Unnamed: 7,Attribute Information:
0,0.0,0.0,0.0,0.0,0.0,very_low,,,STG (The degree of study time for goal object ...
1,0.08,0.08,0.1,0.24,0.9,High,,,SCG (The degree of repetition number of user f...
2,0.06,0.06,0.05,0.25,0.33,Low,,,STR (The degree of study time of user for rela...
3,0.1,0.1,0.15,0.65,0.3,Middle,,,LPR (The exam performance of user for related ...
4,0.08,0.08,0.08,0.98,0.24,Low,,,PEG (The exam performance of user for goal obj...


In [6]:
data = data.iloc[:,:6]

In [7]:
data.columns = data.columns.str.strip()

In [8]:
data['UNS'] = data.UNS.str.lower().replace('very low', 'very_low')

In [9]:
data.UNS = data.UNS.astype(pd.CategoricalDtype(categories=['high', 'middle', 'low', 'very_low'], ordered=True))

In [10]:
km_model = KMeans(n_clusters=2, max_iter=100, random_state=123).fit(data.drop('UNS', axis=1))

In [11]:
data['cluster'] = km_model.labels_ + 1
data

Unnamed: 0,STG,SCG,STR,LPR,PEG,UNS,cluster
0,0.00,0.00,0.00,0.00,0.00,very_low,2
1,0.08,0.08,0.10,0.24,0.90,high,1
2,0.06,0.06,0.05,0.25,0.33,low,2
3,0.10,0.10,0.15,0.65,0.30,middle,2
4,0.08,0.08,0.08,0.98,0.24,low,2
...,...,...,...,...,...,...,...
398,0.90,0.78,0.62,0.32,0.89,high,1
399,0.85,0.82,0.66,0.83,0.83,high,1
400,0.56,0.60,0.77,0.13,0.32,low,1
401,0.66,0.68,0.81,0.57,0.57,middle,1


In [12]:
data.cluster.value_counts()

2    210
1    193
Name: cluster, dtype: int64

In [13]:
temp = data.pivot_table(index='UNS', columns='cluster', values='STG', aggfunc='count', fill_value=0)
temp

cluster,1,2
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1
high,102,0
middle,90,32
low,1,128
very_low,0,50


In [14]:
round(temp / temp.sum() * 100, 2)

cluster,1,2
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1
high,52.85,0.0
middle,46.63,15.24
low,0.52,60.95
very_low,0.0,23.81


In [15]:
for i in range(2, 11):
    print(i)
    km_model = KMeans(n_clusters=i, max_iter=100, random_state=123).fit(data.drop('UNS', axis=1))
    data[f'c_{i}'] = km_model.labels_
    temp = data.pivot_table(index='UNS', columns=f'c_{i}', values='STG', aggfunc='count', fill_value=0)
    display(round(temp / temp.sum() * 100, 2))

2


c_2,0,1
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1
high,52.85,0.0
middle,46.63,15.24
low,0.52,60.95
very_low,0.0,23.81


3


c_3,0,1,2
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
high,36.76,0.0,91.23
middle,62.5,15.24,8.77
low,0.74,60.95,0.0
very_low,0.0,23.81,0.0


4


c_4,0,1,2,3
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high,0.0,36.76,91.23,0.0
middle,31.91,62.5,8.77,1.72
low,61.7,0.74,0.0,60.34
very_low,6.38,0.0,0.0,37.93


5


c_5,0,1,2,3,4
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
high,91.23,33.82,0.0,0.0,39.71
middle,8.77,64.71,31.91,1.72,60.29
low,0.0,1.47,61.7,60.34,0.0
very_low,0.0,0.0,6.38,37.93,0.0


6


c_6,0,1,2,3,4,5
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
high,39.71,33.82,0.0,0.0,91.23,0.0
middle,60.29,64.71,2.63,31.91,8.77,0.0
low,0.0,1.47,60.53,61.7,0.0,60.0
very_low,0.0,0.0,36.84,6.38,0.0,40.0


7


c_7,0,1,2,3,4,5,6
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high,0.0,0.0,39.71,91.23,0.0,33.82,0.0
middle,30.23,2.63,60.29,8.77,0.0,64.71,33.33
low,62.79,60.53,0.0,0.0,60.0,1.47,60.78
very_low,6.98,36.84,0.0,0.0,40.0,0.0,5.88


8


c_8,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,0.0,33.82,93.75,0.0,39.71,0.0,0.0,88.0
middle,30.23,64.71,6.25,2.63,60.29,0.0,33.33,12.0
low,62.79,1.47,0.0,60.53,0.0,60.0,60.78,0.0
very_low,6.98,0.0,0.0,36.84,0.0,40.0,5.88,0.0


9


c_9,0,1,2,3,4,5,6,7,8
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
high,0.0,93.75,39.71,48.57,0.0,0.0,0.0,88.0,18.18
middle,33.33,6.25,60.29,51.43,2.63,0.0,30.23,12.0,78.79
low,60.78,0.0,0.0,0.0,60.53,60.0,62.79,0.0,3.03
very_low,5.88,0.0,0.0,0.0,36.84,40.0,6.98,0.0,0.0


10


c_10,0,1,2,3,4,5,6,7,8,9
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
high,0.0,0.0,45.0,0.0,0.0,48.57,18.18,93.75,88.0,37.5
middle,0.0,30.23,55.0,33.33,2.63,51.43,78.79,6.25,12.0,62.5
low,60.0,62.79,0.0,60.78,60.53,0.0,3.03,0.0,0.0,0.0
very_low,40.0,6.98,0.0,5.88,36.84,0.0,0.0,0.0,0.0,0.0


In [16]:
from sklearn.metrics import silhouette_samples, silhouette_score

In [17]:
data.columns

Index(['STG', 'SCG', 'STR', 'LPR', 'PEG', 'UNS', 'cluster', 'c_2', 'c_3',
       'c_4', 'c_5', 'c_6', 'c_7', 'c_8', 'c_9', 'c_10'],
      dtype='object')

In [18]:
# iris 의 모든 개별 데이터에 실루엣 계수값을 구함. 
score_samples = silhouette_samples(data[['STG', 'SCG', 'STR', 'LPR', 'PEG']], data['c_2'])
print('silhouette_samples( ) return 값의 shape' , score_samples.shape)

silhouette_samples( ) return 값의 shape (403,)


In [19]:
for i in range(2, 11):
    average_score = silhouette_score(data[['STG', 'SCG', 'STR', 'LPR', 'PEG']], data[f'c_{i}'])
    print(f'Cluster #{i} Silhouette Analysis Score:{average_score:.3f}')

Cluster #2 Silhouette Analysis Score:0.207
Cluster #3 Silhouette Analysis Score:0.203
Cluster #4 Silhouette Analysis Score:0.193
Cluster #5 Silhouette Analysis Score:0.177
Cluster #6 Silhouette Analysis Score:0.172
Cluster #7 Silhouette Analysis Score:0.173
Cluster #8 Silhouette Analysis Score:0.182
Cluster #9 Silhouette Analysis Score:0.175
Cluster #10 Silhouette Analysis Score:0.173


In [28]:
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(data.groupby('c_8').mean()[['STG', 'SCG', 'STR', 'LPR', 'PEG']], marker='o')
ax.set_ylim(0,1)
plt.show()

In [27]:
for i in range(10):
    km_model = KMeans(n_clusters=8, max_iter=100, random_state=i).fit(data[['STG', 'SCG', 'STR', 'LPR', 'PEG']])
    data[f'val_{i}'] = km_model.labels_
    print(i)
    display(data.pivot_table(index='UNS', columns=f'val_{i}', values='STG', aggfunc='count'))

0


val_0,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,20,0,0,24,36,20,2,0
middle,18,13,0,32,4,37,18,0
low,0,36,28,0,0,0,28,37
very_low,0,8,16,0,0,0,3,23


1


val_1,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,0,40,1,21,16,0,0,24
middle,0,3,20,34,20,0,11,34
low,39,0,30,0,1,28,31,0
very_low,23,0,1,0,0,17,9,0


2


val_2,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,15,0,0,41,21,1,24,0
middle,17,14,0,2,34,18,37,0
low,1,36,37,0,0,28,0,27
very_low,0,8,23,0,0,3,0,16


3


val_3,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,31,0,0,25,23,0,21,2
middle,5,0,0,31,16,11,42,17
low,0,28,43,0,0,31,0,27
very_low,0,16,26,0,0,5,0,3


4


val_4,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,15,19,18,20,0,0,0,30
middle,17,34,36,7,24,0,3,1
low,2,0,0,1,47,27,52,0
very_low,0,0,0,0,4,17,29,0


5


val_5,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,0,19,0,21,36,3,23,0
middle,0,17,0,36,4,18,34,13
low,28,1,37,0,0,27,0,36
very_low,16,0,23,0,0,3,0,8


6


val_6,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,16,0,1,21,24,0,0,40
middle,17,0,18,34,36,0,14,3
low,1,37,28,0,0,27,36,0
very_low,0,23,3,0,0,16,8,0


7


val_7,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,0,21,23,23,0,33,0,2
middle,0,39,16,32,13,4,0,18
low,37,0,0,0,36,0,28,28
very_low,23,0,0,0,8,0,16,3


8


val_8,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,25,0,0,0,22,26,3,26
middle,5,11,0,1,40,41,18,6
low,1,30,28,43,0,0,27,0
very_low,0,5,16,26,0,0,3,0


9


val_9,0,1,2,3,4,5,6,7
UNS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,15,0,20,0,24,25,0,18
middle,16,20,38,0,33,6,1,8
low,1,49,0,29,0,1,49,0
very_low,0,6,0,17,0,0,27,0
