In [30]:
import pandas as pd
import numpy as np

data_filename = "NBA15-16_dataset/basketball.csv"

dataset = pd.read_csv(data_filename)
#查看前5行数据
dataset.head()
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316 entries, 0 to 1315
Data columns (total 10 columns):
Date               1316 non-null object
Start (ET)         1316 non-null object
Visitor/Neutral    1316 non-null object
PTS                1316 non-null int64
Home/Neutral       1316 non-null object
PTS.1              1316 non-null int64
Unnamed: 6         1316 non-null object
Unnamed: 7         82 non-null object
Attend.            1316 non-null int64
Notes              2 non-null object
dtypes: int64(3), object(7)
memory usage: 102.9+ KB


In [5]:
# 通过上面的输出我们发现一些问题：
# （1）Date属性不是Date对象而是String对象
# （2）第一行标题列不完整或是部分列对应的属性名不正确
# ——
# 清洗数据集
# 我们可以通过pd.read_csv函数来解决上述问题。
dataset = pd.read_csv(data_filename, parse_dates=["Date"])
#修改每列名称
dataset.columns = ["Date", "Start (ET)", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Score Type", "Attend.", "Notes"]
dataset.head()


Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes
0,2015-10-27,8:00 pm,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
1,2015-10-27,8:00 pm,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
2,2015-10-27,10:30 pm,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,
3,2015-10-28,7:30 pm,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,18624,
4,2015-10-28,7:30 pm,Chicago Bulls,115,Brooklyn Nets,100,Box Score,,17732,


In [7]:
# 查看column的属性
print(dataset.dtypes)

Date            datetime64[ns]
Start (ET)              object
Visitor Team            object
VisitorPts               int64
Home Team               object
HomePts                  int64
OT?                     object
Score Type              object
Attend.                  int64
Notes                   object
dtype: object


In [9]:
#抽取新特征
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes,HomeWin
0,2015-10-27,8:00 pm,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,,False
1,2015-10-27,8:00 pm,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,,True
2,2015-10-27,10:30 pm,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,,True
3,2015-10-28,7:30 pm,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,18624,,True
4,2015-10-28,7:30 pm,Chicago Bulls,115,Brooklyn Nets,100,Box Score,,17732,,False


In [11]:
# 由于Pandas和scikit-learn没有完美整合，而Numpy和scikit-learn能很好地协同工作
# ，因此，可以先将Pandas中的值转化为Numpy，然后再将Numpy配合scikit-learn工作。
# 这里我们抽取属性“HomeWin”列为类别特征列y_true，这样就能转化为scikit-learn能识别的形式。

y_true = dataset["HomeWin"].values

# 此外，体育赛事的预测基线一个更好的选择是预测在每场比赛中主队获胜情况，众所周知，
# 主队几乎在所有比赛中都会有一定的优势。那么我们的数据集中主队有多大的优势呢，
# 我们可以通过查询主队获胜的平均概率获得：

dataset["HomeWin"].mean()

0.5942249240121581

In [24]:
#We first create a (default) dictionary to store the team's last result
from collections import defaultdict

won_last = defaultdict(int)

#We then create a new feature on our dataset to store the results of our new features
#gives a false value to all teams (including the previous year's champion!) when they are firstly seen

dataset["HomeLastWin"] = 0
dataset["VisitorLastWin"] = 0

#The key of this dictionary will be the team and the value will be whether they won
#their previous game. We can then iterate over all the rows and update the current 
#row with the team's last result

for index, row in dataset.iterrows():
    home_team = row['Home Team'] #当场比赛主队
    visitor_team = row['Visitor Team']
#     row["HomeLastWin"] = won_last[home_team]

    dataset.loc[index,"HomeLastWin"] = won_last[home_team]
    dataset.loc[index,"VisitorLastWin"] = won_last[visitor_team]
#     dataset.set_value(index, "HomeLastWin", won_last[home_team])
#     dataset.set_value(index, "VisitorLastWin", won_last[visitor_team])
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])

#查看前6行数据    
dataset.head(40)
dataset.iloc[1000:1005]


Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin
1000,2016-03-15,7:00 pm,Denver Nuggets,110,Orlando Magic,116,Box Score,,16988,,True,0,0
1001,2016-03-15,8:30 pm,Los Angeles Clippers,87,San Antonio Spurs,108,Box Score,,18418,,True,1,0
1002,2016-03-16,7:00 pm,Oklahoma City Thunder,130,Boston Celtics,109,Box Score,,18624,,False,0,1
1003,2016-03-16,7:00 pm,Orlando Magic,99,Charlotte Hornets,107,Box Score,,16148,,True,0,1
1004,2016-03-16,7:00 pm,Dallas Mavericks,98,Cleveland Cavaliers,99,Box Score,,20562,,True,0,1


In [29]:
from sklearn.tree import DecisionTreeClassifier
# from sklearn.cross_validation import cross_val_score
import numpy as np

clf = DecisionTreeClassifier(random_state=14)
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
print(X_previouswins.shape)
# scores = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))


(1316, 2)
