In [1]:
#import package
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as m
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
#讀取csv檔
df,df_test=pd.read_csv("train.csv"),pd.read_csv("test.csv")
df=df.fillna(value=0) #將遺漏值補0

In [3]:
#將分鐘*60+秒數 並且區分成7群
df["seconds_bin"]=pd.cut(df["minutes_remaining"]*60+df["seconds_remaining"],7,labels=range(7))
df["seconds_bin"]=df.seconds_bin.astype(int) 
df["seconds_bin"].head()

0    2
1    1
2    1
3    5
4    2
Name: seconds_bin, dtype: int64

In [4]:
#利用loc_x和loc_y的值來計算出投籃的仰角
df["angle_bin"]=pd.cut(df.apply(lambda row: 90 if row["loc_y"]==0 else m.degrees(m.atan(row["loc_x"]/abs(row["loc_y"]))),axis=1),7,labels=range(7))
df["angle_bin"]=df.angle_bin.astype(int)
df["angle_bin"].head()

0    6
1    4
2    6
3    5
4    4
Name: angle_bin, dtype: int64

In [5]:
#將分鐘*60+秒數
df["time"] = df.minutes_remaining*60+df.seconds_remaining
df["time"].head()

0    299
1    112
2    125
3    534
4    269
Name: time, dtype: int64

In [6]:
#將game_date的資料把斜線部分刪掉且把缺失值補零讓位數相等
month=[]
for i in range(len(df["game_date"])):
    if df["game_date"][i][6]=="/":
        if df["game_date"][i][-2]=="/":
            month.append(df["game_date"][i][0:4]+"0"+df["game_date"][i][5]+"0"+df["game_date"][i][-1])
        if df["game_date"][i][-2]!="/":
            month.append(df["game_date"][i][0:4]+"0"+df["game_date"][i][5]+df["game_date"][i][-2:])        
    if df["game_date"][i][6]!="/":
        if df["game_date"][i][-2]=="/":
            month.append(df["game_date"][i][0:4]+df["game_date"][i][5:7]+"0"+df["game_date"][i][-1])
        if df["game_date"][i][-2]!="/":
            month.append(df["game_date"][i][0:4]+df["game_date"][i][5:7]+df["game_date"][i][-2:])
for i in range(len(month)):
    month[i]=int(month[i])
df["game_date"]=month
df["game_date"].head()

0    20120217
1    19990420
2    20091229
3    20000410
4    20030218
Name: game_date, dtype: int64

In [7]:
#選取出投籃次數最高的前十名
df = df[df.action_type.isin(['Jump Shot', 'Layup Shot', 'Driving Layup Shot', 'Turnaround Jump Shot',
       'Fadeaway Jump Shot', 'Running Jump Shot', 'Turnaround Fadeaway shot',
       'Pullup Jump shot', 'Slam Dunk Shot', 'Reverse Layup Shot'])]

In [8]:
#把投籃距離分成兩部分（一部份大於十，一部份小於十）
greater10,smaller10 = df[df.shot_distance>10],df[df.shot_distance<10]

In [9]:
#取目標值和特徵值
labels1=greater10.action_type.values
labels2=smaller10.action_type.values
features_greater10=greater10[["shot_id","lat","loc_x","loc_y","lon","shot_distance",
                              "period","shot_made_flag","game_date","opponent","seconds_bin","angle_bin","time"]]
features_smaller10=smaller10[["shot_id","lat","loc_x","loc_y","lon","shot_distance",
                              "period","shot_made_flag","game_date","opponent","seconds_bin","angle_bin","time"]]

In [10]:
#讓特徵值中是屬於字串的部分編碼
features1=pd.get_dummies(features_greater10)
features2=pd.get_dummies(features_smaller10)

In [11]:
#把兩格的資料切割成測試資料與訓練資料
train_x1,test_x1,train_y1,test_y1=train_test_split(features1,labels1,test_size=0.25,random_state=12)
train_x2,test_x2,train_y2,test_y2=train_test_split(features2,labels2,test_size=0.25,random_state=12)

In [13]:
#以RandomForestClassifier當作預測模型
rf1=RandomForestClassifier(max_depth=8,n_estimators=150).fit(train_x1,train_y1)
rf1.score(test_x1,test_y1)

0.8488126649076517

In [14]:
#以RandomForestClassifier當作預測模型
rf2 = RandomForestClassifier(max_depth=12,n_estimators=200
                             ,min_samples_split=150,min_samples_leaf=8).fit(X=train_x2,y=train_y2)
rf2.score(test_x2,test_y2)

0.5863874345549738

In [16]:
#分別將測試資料的秒數、投籃仰角做處理 並將遺漏值補0
df_test["seconds_bin"]=pd.cut(df_test["minutes_remaining"]*60+df_test["seconds_remaining"],7,labels=range(7))
df_test["seconds_bin"]=df_test.seconds_bin.astype(int)
df_test["angle_bin"]=pd.cut(df_test.apply(lambda row: 90 if row["loc_y"]==0 else m.degrees(m.atan(row["loc_x"]/abs(row["loc_y"]))),axis=1),7,labels=range(7))
df_test["angle_bin"]=df_test.angle_bin.astype(int)
df_test["time"] = df_test.minutes_remaining*60+df_test.seconds_remaining
df_test=df_test.fillna(value=0)

In [17]:
#將game_date的資料把斜線部分刪掉且把缺失值補零讓位數相等
month=[]
for i in range(len(df_test["game_date"])):
    if df_test["game_date"][i][6]=="/":
        if df_test["game_date"][i][-2]=="/":
            month.append(df_test["game_date"][i][0:4]+"0"+df_test["game_date"][i][5]+"0"+df_test["game_date"][i][-1])
        if df_test["game_date"][i][-2]!="/":
            month.append(df_test["game_date"][i][0:4]+"0"+df_test["game_date"][i][5]+df_test["game_date"][i][-2:])
            
    if df_test["game_date"][i][6]!="/":
        if df_test["game_date"][i][-2]=="/":
            month.append(df_test["game_date"][i][0:4]+df_test["game_date"][i][5:7]+"0"+df_test["game_date"][i][-1])
        if df_test["game_date"][i][-2]!="/":
            month.append(df_test["game_date"][i][0:4]+df_test["game_date"][i][5:7]+df_test["game_date"][i][-2:])
for i in range(len(month)):
    month[i]=int(month[i])
df_test["game_date"]=month

In [18]:
#把投籃距離分成兩部分（一部份大於十，一部份小於十）
testgreater10,testsmaller10 = df_test[df_test.shot_distance>=10],df_test[df_test.shot_distance<10]
#取出index和特徵值
tid1,tid2 = testgreater10.index,testsmaller10.index
tfeatures_greater10=testgreater10[["shot_id","lat","loc_x","loc_y","lon","shot_distance","period","shot_made_flag","game_date","opponent","seconds_bin","angle_bin","time"]]
tfeatures_smaller10=testsmaller10[["shot_id","lat","loc_x","loc_y","lon","shot_distance","period","shot_made_flag","game_date","opponent","seconds_bin","angle_bin","time"]]

In [19]:
#讓特徵值中是屬於字串的部分編碼
fea_test1=pd.get_dummies(tfeatures_greater10)
fea_test2=pd.get_dummies(tfeatures_smaller10)

In [20]:
#將所預測的投籃動作寫成一個Series 並依index排列
res1 = pd.Series(rf1.predict(fea_test1),index=tid1) 

In [21]:
#將所預測的投籃動作寫成一個Series 並依index排列
res2 = pd.Series(rf2.predict(fea_test2),index=tid2) 

In [None]:
#將兩個Series合併成一個
ans = pd.concat([res1,res2])

In [None]:
#將shot_id和action_type寫成DataFrame
pred=pd.DataFrame({"ID":df_test["shot_id"],"dd":ans})

In [None]:
#將DataFrame匯出csvv
pred.to_csv("test32.csv",index=False,header=False)