In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors

In [2]:
ROOT_PATH = '../../'

In [3]:
event_space_dataset_dir = ROOT_PATH + 'Dataset/event_space_data/'
user_data = []

In [4]:
# 把对解决问题无关的列去掉
for i in range(3):
    user_data.append(pd.read_csv(event_space_dataset_dir + '1-' + str(i + 1) + '.csv'))
    user_data[i].drop('l_space_min', axis=1, inplace=True)
    user_data[i].drop('l_space_mean', axis=1, inplace=True)
    user_data[i].drop('r_space_min', axis=1, inplace=True)
    user_data[i].drop('r_space_mean', axis=1, inplace=True)

In [5]:
# 看缺省值
for i in range(3):
    print(user_data[i].isnull().sum())

event          0
l_space_max    0
r_space_max    0
dtype: int64
event           0
l_space_max    17
r_space_max    17
dtype: int64
event           0
l_space_max    54
r_space_max    54
dtype: int64


In [6]:
for i in range(3):
    df = user_data[i]
    print('第' + str(i + 1) + '次')
    # 填充缺省值
    for j in range(1, 5):
        print('事件' + str(j) + '的各列平均值：')
        print(df[df['event'] == j].median())
        # 使用相同event下的平均值来填充对应列的缺省值
        # 疑问：用平均值填充还是用最大值填充呢？
        df[df['event'] == j] = df[df['event'] == j].fillna(df[df['event'] == j].median())

第1次
事件1的各列平均值：
event          1.000000
l_space_max    0.270185
r_space_max    0.449444
dtype: float64
事件2的各列平均值：
event          2.000000
l_space_max    0.266271
r_space_max    0.420119
dtype: float64
事件3的各列平均值：
event          3.000000
l_space_max    0.266965
r_space_max    0.407615
dtype: float64
事件4的各列平均值：
event          4.000000
l_space_max    0.298149
r_space_max    0.434426
dtype: float64
第2次
事件1的各列平均值：
event          1.000000
l_space_max    0.325576
r_space_max    0.509117
dtype: float64
事件2的各列平均值：
event          2.000000
l_space_max    0.294109
r_space_max    0.486004
dtype: float64
事件3的各列平均值：
event          3.000000
l_space_max    0.317015
r_space_max    0.508959
dtype: float64
事件4的各列平均值：
event          4.000000
l_space_max    0.368917
r_space_max    0.478017
dtype: float64
第3次
事件1的各列平均值：
event          1.000000
l_space_max    0.460509
r_space_max    0.513559
dtype: float64
事件2的各列平均值：
event          2.000000
l_space_max    0.399685
r_space_max    0.506211
dtype: float64
事件3的各列平均

In [9]:
# 3次求平均值
merge_dataset = (user_data[0] + user_data[1] + user_data[2]) / 3

     event  l_space_max  r_space_max
0      4.0     0.263634     0.289551
1      2.0     0.239821     0.376116
2      2.0     0.328899     0.509152
3      4.0     0.398803     0.368063
4      2.0     0.284081     0.363254
..     ...          ...          ...
211    2.0     0.381470     0.445119
212    4.0     0.358669     0.559078
213    4.0     0.400983     0.475154
214    4.0     0.373665     0.540463
215    2.0     0.396206     0.456882

[216 rows x 3 columns]


In [14]:
# event转格式成int
merge_dataset['event'] = merge_dataset['event'].astype(int)
print(merge_dataset)

     event  l_space_max  r_space_max
0        4     0.263634     0.289551
1        2     0.239821     0.376116
2        2     0.328899     0.509152
3        4     0.398803     0.368063
4        2     0.284081     0.363254
..     ...          ...          ...
211      2     0.381470     0.445119
212      4     0.358669     0.559078
213      4     0.400983     0.475154
214      4     0.373665     0.540463
215      2     0.396206     0.456882

[216 rows x 3 columns]


In [17]:
# 保留左右手的最大值，因为左右手的类别是合并在一起的
# TO DO

event          1.000000
l_space_max    0.359067
r_space_max    0.494004
dtype: float64

In [81]:
# 按照事件进行平均值运算后保留表
event_data = merge_dataset[merge_dataset['event'] == 1].median()
df = pd.DataFrame({
    'event': event_data['event'],
    'l_space_max': event_data['l_space_max'],
    'r_space_max': event_data['r_space_max']
}, index=[0])
for i in range(2, 5):
    ds = merge_dataset[merge_dataset['event'] == i].median()
    df = df.append(ds, ignore_index=True)

  df = df.append(ds, ignore_index=True)
  df = df.append(ds, ignore_index=True)
  df = df.append(ds, ignore_index=True)


In [83]:
df['event'] = df['event'].astype(int)
df

Unnamed: 0,event,l_space_max,r_space_max
0,1,0.359067,0.494004
1,2,0.322218,0.463438
2,3,0.34105,0.456071
3,4,0.374459,0.467006
