In [40]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors

In [2]:
ROOT_PATH = './'

In [3]:
event_space_dataset_dir = ROOT_PATH + 'Dataset/event_space_data/'
user_data = []

In [4]:
# 把对解决问题无关的列去掉
for i in range(3):
    user_data.append(pd.read_csv(event_space_dataset_dir + '44-' + str(i + 1) + '.csv'))
    user_data[i].drop('l_space_min', axis=1, inplace=True)
    user_data[i].drop('l_space_mean', axis=1, inplace=True)
    user_data[i].drop('r_space_min', axis=1, inplace=True)
    user_data[i].drop('r_space_mean', axis=1, inplace=True)

In [5]:
# 看缺省值
for i in range(3):
    print(user_data[i].isnull().sum())

event          0
l_space_max    0
r_space_max    0
dtype: int64
event          0
l_space_max    0
r_space_max    0
dtype: int64
event          0
l_space_max    0
r_space_max    0
dtype: int64


In [6]:
for i in range(3):
    df = user_data[i]
    print('第' + str(i + 1) + '次')
    # 填充缺省值
    for j in range(1, 5):
        print('事件' + str(j) + '的各列平均值：')
        print(df[df['event'] == j].median())
        # 使用相同event下的平均值来填充对应列的缺省值
        # 疑问：用平均值填充还是用最大值填充呢？
        df[df['event'] == j] = df[df['event'] == j].fillna(df[df['event'] == j].median())

第1次
事件1的各列平均值：
event          1.000000
l_space_max    0.479270
r_space_max    0.518556
dtype: float64
事件2的各列平均值：
event          2.000000
l_space_max    0.444185
r_space_max    0.494773
dtype: float64
事件3的各列平均值：
event          3.000000
l_space_max    0.488147
r_space_max    0.510657
dtype: float64
事件4的各列平均值：
event          4.000000
l_space_max    0.491215
r_space_max    0.512136
dtype: float64
第2次
事件1的各列平均值：
event          1.000000
l_space_max    0.494975
r_space_max    0.506952
dtype: float64
事件2的各列平均值：
event          2.000000
l_space_max    0.470106
r_space_max    0.518556
dtype: float64
事件3的各列平均值：
event          3.000000
l_space_max    0.496396
r_space_max    0.481508
dtype: float64
事件4的各列平均值：
event          4.000000
l_space_max    0.503633
r_space_max    0.498393
dtype: float64
第3次
事件1的各列平均值：
event          1.000000
l_space_max    0.514296
r_space_max    0.546443
dtype: float64
事件2的各列平均值：
event          2.000000
l_space_max    0.490408
r_space_max    0.552268
dtype: float64
事件3的各列平均

In [7]:
# 3次求平均值
merge_dataset = (user_data[0] + user_data[1] + user_data[2]) / 3

In [8]:
# event转格式成int
# merge_dataset['event'] = merge_dataset['event'].astype(int)
# print(merge_dataset)

In [9]:
# 保留左右手的最大值，因为左右手的类别是合并在一起的
# TO DO

In [10]:
# 按照事件进行平均值运算后保留表
event_data = merge_dataset[merge_dataset['event'] == 1].median()
df = pd.DataFrame({
    'event': event_data['event'],
    'l_space_max': event_data['l_space_max'],
    'r_space_max': event_data['r_space_max']
}, index=[0])
for i in range(2, 5):
    ds = merge_dataset[merge_dataset['event'] == i].median()
    df = df.append(ds, ignore_index=True)

  df = df.append(ds, ignore_index=True)
  df = df.append(ds, ignore_index=True)
  df = df.append(ds, ignore_index=True)


In [11]:
df['event'] = df['event'].astype(int)
df

Unnamed: 0,event,l_space_max,r_space_max
0,1,0.504587,0.527243
1,2,0.461935,0.522902
2,3,0.512463,0.504878
3,4,0.50955,0.50702


In [12]:
# 左右手取最大值(先只取右手代替)
df = df.drop('l_space_max', axis=1)
df

Unnamed: 0,event,r_space_max
0,1,0.527243
1,2,0.522902
2,3,0.504878
3,4,0.50702


In [13]:
# 插入用户编号
USER_ID = 1
df.insert(0, 'uid', USER_ID)
df

Unnamed: 0,uid,event,r_space_max
0,1,1,0.527243
1,1,2,0.522902
2,1,3,0.504878
3,1,4,0.50702


In [14]:
df = df.rename(columns={'r_space_max':'avg_max'})

In [15]:
df

Unnamed: 0,uid,event,avg_max
0,1,1,0.527243
1,1,2,0.522902
2,1,3,0.504878
3,1,4,0.50702


经过一系列处理，整合得到处理过后的数据集：
接下来尝试一下先拼接用户特性与事件动作幅度，后进行聚类

In [149]:
event_df_list = []
PATH = './Dataset/czm_sorted_data/'
for i in range(4):
    event_df_list.append(pd.read_csv(PATH + 'user_scale_by_event_' + str(i + 1) + '.csv'))

In [150]:
event_df_list[0].head()

Unnamed: 0,uid,event,avg_max
0,3,1,0.530834
1,4,1,0.480312
2,5,1,0.565333
3,7,1,0.478278
4,8,1,0.359157


In [151]:
user_character = pd.read_csv(ROOT_PATH + 'Dataset/user.csv')
user_character.head()

Unnamed: 0,uid,gender,age,height,weight,side,VR_exp,game_fre,sport_fre,difficulty,enjoyment,fatigue,personality,familiarity
0,3,0,0,2,1,1,0,0,1,0,1,0,0,0
1,4,0,0,2,0,1,0,0,0,1,1,1,1,0
2,5,1,1,1,0,1,1,0,0,0,1,0,1,1
3,7,1,1,1,0,1,1,0,1,1,0,1,0,0
4,8,1,1,2,1,1,0,0,1,1,1,1,0,0


In [175]:
########################
# 尝试以每个事件为单位，把用户特征与动作幅度进行拼接
EVENT_NUM = 1
curr_event_df = pd.merge(user_character, event_df_list[EVENT_NUM - 1], how='left', on='uid').copy()
# 删掉无关的列
curr_event_df.drop('event', axis=1, inplace=True)
curr_event_df.drop('uid', axis=1, inplace=True)
curr_event_df.drop('difficulty', axis=1, inplace=True)
curr_event_df.drop('enjoyment', axis=1, inplace=True)
curr_event_df.drop('fatigue', axis=1, inplace=True)

In [176]:
curr_event_df

# X = event_df
# kmeans_model = KMeans(n_clusters=2, random_state=1).fit(X)
# labels = kmeans_model.labels_
# metrics.silhouette_score(X, labels, metric='euclidean')

Unnamed: 0,gender,age,height,weight,side,VR_exp,game_fre,sport_fre,personality,familiarity,avg_max
0,0,0,2,1,1,0,0,1,0,0,0.530834
1,0,0,2,0,1,0,0,0,1,0,0.480312
2,1,1,1,0,1,1,0,0,1,1,0.565333
3,1,1,1,0,1,1,0,1,0,0,0.478278
4,1,1,2,1,1,0,0,1,0,0,0.359157
5,1,1,0,1,1,0,1,0,0,0,0.422268
6,0,1,2,1,1,0,1,0,1,0,0.459557
7,0,1,2,3,1,0,0,0,0,0,0.407063
8,0,1,3,2,1,1,0,1,1,0,0.490426
9,0,1,2,2,1,0,1,1,0,0,0.534859


In [177]:
# 尝试把距离变成类别
column_num = curr_event_df.shape[1]
curr_event_df.insert(column_num - 1,'class_id', -1)
curr_event_df.head()

Unnamed: 0,gender,age,height,weight,side,VR_exp,game_fre,sport_fre,personality,familiarity,class_id,avg_max
0,0,0,2,1,1,0,0,1,0,0,-1,0.530834
1,0,0,2,0,1,0,0,0,1,0,-1,0.480312
2,1,1,1,0,1,1,0,0,1,1,-1,0.565333
3,1,1,1,0,1,1,0,1,0,0,-1,0.478278
4,1,1,2,1,1,0,0,1,0,0,-1,0.359157


In [172]:
# 导入聚类后定义的边界
event_boundary_df = pd.read_csv('./Dataset/imgs/event_centroid_boundary.csv')
event = event_boundary_df[event_boundary_df['event'] == 1]

In [178]:
# 给类别按条件赋值
class_0_df = curr_event_df[curr_event_df['avg_max'] < event['class_boundary_1'].item()]
class_1_df = curr_event_df[(event['class_boundary_1'].item() <= curr_event_df['avg_max']) & (curr_event_df['avg_max'] < event['class_boundary_2'].item())]
class_2_df = curr_event_df[(event['class_boundary_2'].item() <= curr_event_df['avg_max']) & (curr_event_df['avg_max'] < event['class_boundary_3'].item())]
class_3_df = curr_event_df[event['class_boundary_3'].item() < curr_event_df['avg_max']]
for i in range(class_0_df.shape[0]):
    curr_event_df.loc[class_0_df.index[i], 'class_id'] = 0
for i in range(class_1_df.shape[0]):
    curr_event_df.loc[class_1_df.index[i], 'class_id'] = 1
for i in range(class_2_df.shape[0]):
    curr_event_df.loc[class_2_df.index[i], 'class_id'] = 2
for i in range(class_3_df.shape[0]):
    curr_event_df.loc[class_3_df.index[i], 'class_id'] = 3

print(curr_event_df)
curr_event_df.drop('avg_max', axis=1, inplace=True)

Unnamed: 0,gender,age,height,weight,side,VR_exp,game_fre,sport_fre,personality,familiarity,class_id,avg_max
0,0,0,2,1,1,0,0,1,0,0,3,0.530834
1,0,0,2,0,1,0,0,0,1,0,2,0.480312
2,1,1,1,0,1,1,0,0,1,1,3,0.565333
3,1,1,1,0,1,1,0,1,0,0,2,0.478278
4,1,1,2,1,1,0,0,1,0,0,0,0.359157
5,1,1,0,1,1,0,1,0,0,0,1,0.422268
6,0,1,2,1,1,0,1,0,1,0,2,0.459557
7,0,1,2,3,1,0,0,0,0,0,1,0.407063
8,0,1,3,2,1,1,0,1,1,0,2,0.490426
9,0,1,2,2,1,0,1,1,0,0,3,0.534859


In [174]:
curr_event_df.to_csv('./Dataset/czm_sorted_data/data_for_prediction/character+class_id+event_' + str(EVENT_NUM) + '.csv', index=False)