In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(37) # 使得每次运行得到的随机数都一样

In [2]:
# 1 准备数据集
# 从文件中加载数据集
feature_data_path='E:\PyProjects\DataSet\BuildingInOut/Dodgers.data'
feature_set=pd.read_csv(feature_data_path,header=None)
print(feature_set.info())
# print(feature_set.head()) 
# print(feature_set.tail()) # 检查没有问题

label_data_path='E:\PyProjects\DataSet\BuildingInOut/Dodgers_utf8.events'
label_set=pd.read_csv(label_data_path,header=None)
print(label_set.info())
# print(label_set.head())
# print(label_set.tail()) 
# 读取没有问题，虽然最后一列有？，但我们用不到这一列，故不予理睬

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50400 entries, 0 to 50399
Data columns (total 2 columns):
0    50400 non-null object
1    50400 non-null int64
dtypes: int64(1), object(1)
memory usage: 787.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 6 columns):
0    81 non-null object
1    81 non-null object
2    81 non-null object
3    81 non-null int64
4    81 non-null object
5    81 non-null object
dtypes: int64(1), object(5)
memory usage: 3.9+ KB
None


In [3]:
# 删除缺失数据
feature_set2=feature_set[feature_set[1]!=-1] # 只获取不是-1的DataFrame即可。
# print(feature_set2) # 没有问题

feature_set2=feature_set2.reset_index(drop=True)
print(feature_set2.head())
# 第0列既包含日期，又包含时间，故要拆分成两列
need_split_col=feature_set2[0].copy()
feature_set2[0]=need_split_col.map(lambda x: x.split()[0].strip())
feature_set2[2]=need_split_col.map(lambda x: x.split()[1].strip())
print(feature_set2.head()) # 拆分没有问题

                0   1
0  4/11/2005 7:35  23
1  4/11/2005 7:40  42
2  4/11/2005 7:45  37
3  4/11/2005 7:50  24
4  4/11/2005 7:55  39
           0   1     2
0  4/11/2005  23  7:35
1  4/11/2005  42  7:40
2  4/11/2005  37  7:45
3  4/11/2005  24  7:50
4  4/11/2005  39  7:55


In [4]:
# 将两个DataFrame中的日期格式统一，两个DataFrame中的日期目前还是String类型，格式不统一无法比较
feature_set2[0]=pd.to_datetime(feature_set2[0])
print(feature_set2[0][:5]) # 打印第0列的前5行

label_set[0]=pd.to_datetime(label_set[0])
print(label_set[0][:5])


0   2005-04-11
1   2005-04-11
2   2005-04-11
3   2005-04-11
4   2005-04-11
Name: 0, dtype: datetime64[ns]
0   2005-04-12
1   2005-04-13
2   2005-04-15
3   2005-04-16
4   2005-04-17
Name: 0, dtype: datetime64[ns]


In [5]:
# 合并两个文件到一个数据集中
feature_set2[3]='NoName' # 对手球队名称暂时用NoName来初始化 
feature_set2[4]=0 # 是否比赛期间暂时用否来代替

def calc_mins(time_str):
    nums=time_str.split(':')
    return 60*int(nums[0])+int(nums[1]) # 将时间转换为分钟数

for row_id,date in enumerate(label_set[0]): # 先取出label中的日期
    temp_df=feature_set2[feature_set2[0]==date]
    if temp_df is None:
        continue
    
    # 只要这一天有比赛，不管是不是正在比赛，都把对手球队名称写入第3列
    rows=temp_df.index.tolist()
    feature_set2.loc[rows,3]=label_set.iloc[row_id,4]
    start_min=calc_mins(label_set.iloc[row_id,1])
    stop_min=calc_mins(label_set.iloc[row_id,2])
    for row in temp_df[2]: # 在逐一判断时间是否位于label中时间之间
        feature_min=calc_mins(row)
        if feature_min>=start_min and feature_min<=stop_min: 
            feature_row=temp_df[temp_df[2]==row].index.tolist()
            feature_set2.loc[feature_row,4]=1 
        
# feature_set2.to_csv('d:/feature_set2_Dodgers.csv') # 保存后打印查看没有问题

In [6]:
feature_set3=feature_set2[feature_set2[3]!='NoName'].reset_index(drop=True) # 去掉NoName的样本

# 进一步处理，由于日期在以后的日子里不可重复，作为feature并不合适，而可以用星期数来代替，
feature_set3[5]=feature_set3[0].map(lambda x: x.strftime('%w')) # 将日期转换为星期数
feature_set3=feature_set3.reindex(columns=[0,2,5,3,4,1])
print(feature_set3.tail()) # 查看转换没有问题

# feature_set3.to_csv('E:\PyProjects\DataSet\BuildingInOut/Dodgers_Sorted_Set.txt') # 将整理好的数据集保存，下次可以直接读取

               0      2  5        3  4   1
22411 2005-09-29  23:35  4  Arizona  0   9
22412 2005-09-29  23:40  4  Arizona  0  13
22413 2005-09-29  23:45  4  Arizona  0  11
22414 2005-09-29  23:50  4  Arizona  0  14
22415 2005-09-29  23:55  4  Arizona  0  17


In [7]:
# 由于第0列只是包含日期，作为特征向量并不合适，故而需要删除
feature_set3.drop([0],axis=1,inplace=True)
# 而第3列明显是字符串类型，里面的内容对机器学习而言如同天书，故需要编码
from sklearn import preprocessing
player_encoder=preprocessing.LabelEncoder()
feature_set3[3]=player_encoder.fit_transform(feature_set3[3])

time_encoder=preprocessing.LabelEncoder()
feature_set3[2]=time_encoder.fit_transform(feature_set3[2])

print(feature_set3.tail())

         2  5  3  4   1
22411  187  4  0  0   9
22412  188  4  0  0  13
22413  189  4  0  0  11
22414  190  4  0  0  14
22415  191  4  0  0  17


In [8]:
dataset_X,dataset_y=feature_set3.iloc[:,:-1].values,feature_set3.iloc[:,-1].values
# 拆分数据集为train set和test set
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y=train_test_split(dataset_X,dataset_y,
                                                  test_size=0.3,random_state=42)
# print(train_X.shape) # (15691, 4)
# print(train_y.shape) # (15691,)
# print(test_X.shape) # (6725, 4)

In [9]:
from sklearn.svm import SVR # 此处不一样，导入的是SVR而不是SVC
regressor = SVR(kernel='rbf',C=10.0,epsilon=0.2) # 这些参数是优化得来
regressor.fit(train_X, train_y)


SVR(C=10.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [10]:
y_predict_test=regressor.predict(test_X)
# 使用评价指标来评估模型的好坏
import sklearn.metrics as metrics
print('平均绝对误差：{}'.format(
    round(metrics.mean_absolute_error(y_predict_test,test_y),2)))
print('均方误差MSE：{}'.format(
    round(metrics.mean_squared_error(y_predict_test,test_y),2)))
print('中位数绝对误差：{}'.format(
    round(metrics.median_absolute_error(y_predict_test,test_y),2)))
print('解释方差分：{}'.format(
    round(metrics.explained_variance_score(y_predict_test,test_y),2)))
print('R方得分：{}'.format(
    round(metrics.r2_score(y_predict_test,test_y),2)))


平均绝对误差：5.16
均方误差MSE：50.45
中位数绝对误差：3.75
解释方差分：0.63
R方得分：0.62
