### 目的：遺伝的アルゴリズムを用いて、バスケ選手の組み合わせ最適化を試してみる

In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import optimization  # 遺伝的アルゴリズムによる最適化用

In [2]:
# Load the pre-processed 2013 player stats.
nba_data=pd.read_csv("in_data.csv",engine="python")

In [3]:
# Preview the first rows of the loaded stats table.
nba_data.head()

Unnamed: 0,player,pos,age,g,gs,mp,fg,fga,fg.,x3p,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,Quincy Acy_TOT,2,23,63,0,847,66,141,0.468,4,...,0.66,72,144,216,28,23,26,30,122,171
1,Steven Adams_OKC,4,20,81,20,1197,93,185,0.503,0,...,0.581,142,190,332,43,40,57,71,203,265
2,Jeff Adrien_TOT,3,27,53,12,961,143,275,0.52,0,...,0.639,102,204,306,38,24,36,39,108,362
3,Arron Afflalo_ORL,1,28,73,73,2552,464,1011,0.459,128,...,0.815,32,230,262,248,35,3,146,136,1330
4,Alexis Ajinca_NOP,4,25,56,30,951,136,249,0.546,0,...,0.836,94,183,277,40,23,46,63,187,328


In [4]:
# "pos"を目的変数、それ以外の項目を説明変数として、CVで評価

# 最適なポジション、その他のポジションの適性をスタッツから推定する。
# Model_selectionツールを用いて、スコアを算出  （コマンドラインで実行）
# その際、年齢・出場機会等はポジションに直接関係ないと考えられるため特徴量から除外する
#del nba_data["age"]
#del nba_data["g"]
#del nba_data["gs"]
#del nba_data["mp"]


# Load the model-selection results and keep only the player name together
# with the mean predicted probability for each of the five position classes.
score_data = pd.read_csv("prob_score.csv")
score_data2 = score_data.loc[
    :,
    [
        "player",
        "prob_0_mean",
        "prob_1_mean",
        "prob_2_mean",
        "prob_3_mean",
        "prob_4_mean",
    ],
]

In [5]:
# Index the scores by player, then take each player's highest estimated
# position score (row-wise maximum).
score_data3 = score_data2.set_index("player")
max_score = score_data3.max(axis=1)
score_data3.head()

Unnamed: 0_level_0,prob_0_mean,prob_1_mean,prob_2_mean,prob_3_mean,prob_4_mean
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A.J. Price_MIN,0.139841,0.200381,0.324685,0.165974,0.169119
Aaron Brooks_TOT,0.475572,0.428453,0.070188,0.017948,0.007839
Aaron Gray_TOT,0.02391,0.042359,0.119462,0.363834,0.450435
Adonis Thomas_TOT,0.155477,0.270785,0.189228,0.119619,0.264891
Al Harrington_WAS,0.200176,0.364564,0.326,0.083879,0.025381


In [6]:
# Divide every position score by the player's best position score, so the
# best position becomes 1.0 and the others become ratios relative to it.
ad_score_data = score_data3.divide(max_score, axis="index")
# Attach the position recorded in the stats table (aligned on player name).
ad_score_data["pos"] = nba_data.set_index("player").loc[:, "pos"]

ad_score_data.head()

Unnamed: 0_level_0,prob_0_mean,prob_1_mean,prob_2_mean,prob_3_mean,prob_4_mean,pos
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A.J. Price_MIN,0.430699,0.617156,1.0,0.511185,0.520872,1
Aaron Brooks_TOT,1.0,0.900922,0.147586,0.03774,0.016483,0
Aaron Gray_TOT,0.053081,0.09404,0.265215,0.80774,1.0,4
Adonis Thomas_TOT,0.574171,1.0,0.698813,0.441751,0.978233,2
Al Harrington_WAS,0.549082,1.0,0.894219,0.230081,0.06962,3


In [7]:
# Sort by recorded position; both names refer to the same sorted frame.
ad_score_data_sort = ad_score_data2 = ad_score_data.sort_values(by="pos")

In [8]:
# Add derived features: points per 48 minutes played and points per game.
nba_data["ptsp48m"] = nba_data["pts"].div(nba_data["mp"]).mul(48)
nba_data["ptspg"] = nba_data["pts"].div(nba_data["g"])

In [9]:
# Join the normalised position scores with the per-player stats.
ad_score_data_sort_merge = pd.merge(
    ad_score_data_sort, nba_data, left_index=True, right_on="player"
)

scoring_data = ad_score_data_sort_merge.set_index("player")

# Keep only players who appeared in at least 20 games.
# Columns are renamed by name instead of overwriting `.columns` with a
# positional list, so a change in the CSV column order cannot silently
# mislabel the data. Both frames carry "pos", so the merge produces
# "pos_x"/"pos_y"; they hold the same values and only one copy is kept.
scoring_data_over20g = (
    scoring_data[scoring_data["g"] >= 20]
    .rename(
        columns={
            "prob_0_mean": "prob_PG",
            "prob_1_mean": "prob_SG",
            "prob_2_mean": "prob_SF",
            "prob_3_mean": "prob_PF",
            "prob_4_mean": "prob_C",
            "pos_x": "pos",
        }
    )
    .drop("pos_y", axis=1)
    .sort_values(["pos", "ptsp48m"])
)
# Average minutes played per game.
scoring_data_over20g["mppg"] = scoring_data_over20g["mp"] / scoring_data_over20g["g"]
scoring_data_over20g.head()

Unnamed: 0_level_0,prob_PG,prob_SG,prob_SF,prob_PF,prob_C,pos,age,g,gs,mp,...,trb,ast,stl,blk,tov,pf,pts,ptsp48m,ptspg,mppg
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Earl Watson_POR,0.865837,1.0,0.247823,0.059786,0.044963,0,34,24,0,161,...,15,28,5,1,17,33,12,3.57764,0.5,6.708333
Phil Pressey_BOS,1.0,0.876011,0.084232,0.035552,0.017169,0,22,75,11,1132,...,105,241,68,5,87,97,213,9.031802,2.84,15.093333
Pablo Prigioni_NYK,0.507572,1.0,0.566807,0.185084,0.115539,0,36,66,27,1283,...,129,228,67,2,60,132,252,9.427903,3.818182,19.439394
Diante Garrett_UTA,0.929917,1.0,0.223897,0.078305,0.015071,0,25,71,0,1048,...,97,120,41,5,75,75,248,11.358779,3.492958,14.760563
Gal Mekel_DAL,0.883408,1.0,0.341756,0.116725,0.064556,0,25,31,1,292,...,27,63,4,1,31,31,73,12.0,2.354839,9.419355


In [11]:
# Derive each player's model-estimated position and re-sort by it.
pos_dict={"prob_PG":0,"prob_SG":1,"prob_SF":2,"prob_PF":3,"prob_C":4,}
# idxmax(axis=1) returns the *column label* of the highest score in each row.
# The previous `.T.apply(np.argmax)` yields integer positions on current
# pandas, which `.map(pos_dict)` (keyed by column name) would turn into NaN.
scoring_data_over20g["cur_pos"] = (
    scoring_data_over20g[["prob_PG", "prob_SG", "prob_SF", "prob_PF", "prob_C"]]
    .idxmax(axis=1)
    .map(pos_dict)
)
# Reorder the columns so that `cur_pos` sits right after `pos`.
scoring_data_over20g=scoring_data_over20g[['prob_PG','prob_SG','prob_SF','prob_PF','prob_C','pos','cur_pos','age','g','gs','mp','fg','fga','fg.','x3p','x3pa','x3p.','x2p','x2pa','x2p.','efg.',  'ft', 'fta', 'ft.', 'orb','drb','trb','ast','stl','blk','tov','pf','pts','ptsp48m','ptspg','mppg']]
scoring_data_over20g=scoring_data_over20g.sort_values(["cur_pos","ptspg"])
scoring_data_over20g.head()

Unnamed: 0_level_0,prob_PG,prob_SG,prob_SF,prob_PF,prob_C,pos,cur_pos,age,g,gs,...,trb,ast,stl,blk,tov,pf,pts,ptsp48m,ptspg,mppg
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Donald Sloan_IND,1.0,0.925918,0.565129,0.241126,0.08173,1,0,26,48,1,...,44,50,10,1,22,19,110,13.469388,2.291667,8.166667
Shane Larkin_DAL,1.0,0.764472,0.330018,0.117455,0.03066,0,0,21,48,0,...,42,71,26,1,39,46,132,12.957055,2.75,10.1875
Phil Pressey_BOS,1.0,0.876011,0.084232,0.035552,0.017169,0,0,22,75,11,...,105,241,68,5,87,97,213,9.031802,2.84,15.093333
Dennis Schröder_ATL,1.0,0.724518,0.1322,0.045538,0.0129,0,0,20,49,0,...,60,93,17,0,61,54,183,13.703588,3.734694,13.081633
Matthew Dellavedova_CLE,1.0,0.735398,0.213636,0.049228,0.012163,1,0,23,72,4,...,123,187,33,5,60,121,339,12.802518,4.708333,17.652778


In [35]:
def point(x):
    """Score a five-player line-up, one player per position slot.

    Each player's points per game ("ptspg") is multiplied by that player's
    fit score for the slot they occupy (a penalty when it is below 1.0),
    and the products are summed.

    Args:
        x: Sequence of integer row positions into ``scoring_data_over20g``,
            in slot order PG, SG, SF, PF, C.

    Returns:
        The negated total, so that a lower value means a better line-up.
        Returns 0 when ``x`` is None or does not contain exactly five
        distinct players.
    """
    if x is None:
        return 0
    # The line-up must consist of five distinct players.
    if len(np.unique(x)) != 5:
        return 0
    slot_cols = ("prob_PG", "prob_SG", "prob_SF", "prob_PF", "prob_C")
    ptspg = scoring_data_over20g["ptspg"]
    total = 0
    # Rows are addressed by position (.iloc) because the frame is indexed by
    # player name; `.ix` no longer exists in current pandas.
    for col, row in zip(slot_cols, x):
        total = total + ptspg.iloc[row] * scoring_data_over20g[col].iloc[row]
    return total * -1

def point10(x):
    """Score a ten-player rotation with two players per position slot.

    The slots are (PG, PG, SG, SG, SF, SF, PF, PF, C, C). Each player's
    points per game ("ptspg") is multiplied by that player's fit score for
    the slot they occupy, and the products are summed.

    The optimiser passes only the candidate array, so the stats table is
    read from the module-level ``scoring_data_over20g``.

    Args:
        x: Sequence of integer row positions into ``scoring_data_over20g``,
            in the slot order given above.

    Returns:
        The negated total, so that a lower value means a better rotation.
        Returns 0 when ``x`` is None, does not contain exactly ten distinct
        players, or when the two players sharing a position average more
        than 48 minutes per game combined.
    """
    if x is None:
        return 0
    # The rotation must consist of ten distinct players.
    if len(np.unique(x)) != 10:
        return 0
    slot_cols = ("prob_PG", "prob_SG", "prob_SF", "prob_PF", "prob_C")
    mppg = scoring_data_over20g["mppg"]
    # Two players sharing one position cannot play more than 48 minutes in total.
    for i in range(0, 10, 2):
        if mppg.iloc[x[i]] + mppg.iloc[x[i + 1]] > 48:
            return 0
    ptspg = scoring_data_over20g["ptspg"]
    total = 0
    for i in range(10):
        # Floor division maps slots 0..9 onto the five positions; plain `/`
        # yields a float on Python 3 and cannot select a column.
        fit = scoring_data_over20g[slot_cols[i // 2]]
        total = total + ptspg.iloc[x[i]] * fit.iloc[x[i]]
    return total * -1


In [36]:
# Search domain: each of the 10 slots may hold any row position of the
# player table (0 .. number of players - 1).
domain10 = [(0, len(scoring_data_over20g) - 1) for _ in range(10)]

In [37]:
# Run the genetic optimiser from the local `optimization` module on point10
# over domain10. The result `s` is used later as a list of row positions into
# scoring_data_over20g. NOTE(review): presumably the optimiser minimises the
# cost, which is why point10 returns a negated score; confirm in optimization.py.
s=optimization.geneticoptimize(domain10,point10,popsize=100000,maxiter=35,elite=0.3,mutprob=0.5)
# The score of each GA generation is printed; check that it gradually approaches the optimum.

-89.1554456433
-89.1554456433
-89.6867292782
-92.6577035938
-96.112099588
-103.515781976
-112.379546663
-117.823875826
-121.263373252
-122.724722314
-129.717633626
-131.940981476
-136.025439448
-138.250196068
-139.075594102
-141.018473568
-144.525497819
-144.525497819
-144.525497819
-146.189941735
-146.189941735
-146.479888331
-147.160039156
-148.345301731
-148.345301731
-148.345301731
-149.627164854
-149.627164854
-149.627164854
-149.680707234
-149.680707234
-149.680707234
-149.680707234
-149.680707234
-149.680707234


In [38]:
# Add one more player using the playing time the 10-man rotation leaves over.
# 240 = 48 minutes x 5 positions. `s` is assumed to be a sequence of integer
# row positions, so rows are selected with .iloc (`.ix` no longer exists).
limited_time = 240 - scoring_data_over20g["mppg"].iloc[list(s)].sum()
bench_pool = scoring_data_over20g[scoring_data_over20g["mppg"] <= limited_time]
# Players already in the rotation must not be picked a second time.
bench_pool = bench_pool.drop(scoring_data_over20g.index[list(s)], errors="ignore")
# idxmax returns the index label (player name) of the highest scorer who fits.
last_one = bench_pool["ptspg"].idxmax()

In [39]:
# The 11 members chosen from position fit and points per game:
# the 10-man rotation (row positions in `s`) plus the extra player
# (`last_one`, expected to be an index label, i.e. the player name).
# Selecting with .loc[[label]] keeps a one-row DataFrame with the original
# column dtypes, instead of converting every value to object via DataFrame(row).T.
best11 = pd.concat(
    [scoring_data_over20g.iloc[list(s)], scoring_data_over20g.loc[[last_one]]]
)
best11

Unnamed: 0_level_0,prob_PG,prob_SG,prob_SF,prob_PF,prob_C,pos,cur_pos,age,g,gs,...,trb,ast,stl,blk,tov,pf,pts,ptsp48m,ptspg,mppg
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Stephen Curry_GSW,1.0,0.947142,0.539805,0.440297,0.301665,0.0,0.0,25.0,78.0,78.0,...,334.0,666.0,128.0,14.0,294.0,194.0,1873.0,31.589599,24.012821,36.487179
Jannero Pargo_CHA,0.839137,1.0,0.264212,0.13704,0.06895,0.0,1.0,34.0,29.0,0.0,...,20.0,52.0,14.0,1.0,26.0,17.0,136.0,26.975207,4.689655,8.344828
MarShon Brooks_TOT,0.478733,1.0,0.411506,0.57991,0.189014,1.0,1.0,25.0,35.0,0.0,...,54.0,25.0,14.0,4.0,24.0,21.0,159.0,24.151899,4.542857,9.028571
Kevin Durant_OKC,0.432591,1.0,0.691338,0.820112,0.606119,2.0,1.0,25.0,81.0,81.0,...,598.0,445.0,103.0,59.0,285.0,174.0,2593.0,39.866752,32.012346,38.54321
Charlie Villanueva_DET,0.270128,0.860255,1.0,0.279695,0.183024,3.0,2.0,29.0,20.0,0.0,...,34.0,6.0,4.0,5.0,7.0,16.0,92.0,24.533333,4.6,9.0
Paul George_IND,0.580082,0.898667,1.0,0.701187,0.368561,2.0,2.0,23.0,80.0,80.0,...,542.0,283.0,151.0,22.0,224.0,198.0,1737.0,28.770186,21.7125,36.225
Byron Mullens_TOT,0.18117,0.722419,0.997649,1.0,0.2395,4.0,3.0,24.0,45.0,0.0,...,92.0,14.0,15.0,11.0,27.0,64.0,189.0,21.913043,4.2,9.2
Carmelo Anthony_NYK,0.309774,0.714683,0.824974,1.0,0.6912,3.0,3.0,29.0,77.0,77.0,...,622.0,242.0,95.0,51.0,198.0,224.0,2112.0,33.995976,27.428571,38.727273
Marreese Speights_GSW,0.014282,0.062895,0.396222,0.942112,1.0,4.0,4.0,26.0,79.0,3.0,...,290.0,32.0,10.0,34.0,66.0,148.0,508.0,24.830957,6.43038,12.43038
Anthony Davis_NOP,0.264092,0.429497,0.489184,0.783563,1.0,4.0,4.0,20.0,67.0,66.0,...,673.0,105.0,89.0,189.0,109.0,200.0,1394.0,28.37659,20.80597,35.19403


In [40]:
# Average points per game of the 11-man squad (position penalty applied to
# the 10-man rotation). point10 returns a negated total, hence the final * -1.
# `last_one` is expected to be an index label (player name).
(point10(s) - scoring_data_over20g.loc[last_one, "ptspg"]) * -1

153.41241455107709

In [41]:
# Sum of the average minutes per game over all selected players.
best11["mppg"].sum()

239.64388533750179

### 各ポジション２名ずつで計48分以下、全員の出場時間の合計が48×5以下になっているので、プレータイムの不整合はなく、理論上は平均153点とれる恐ろしいチームができた。実際にはチームスポーツで、１つのボールをシェアすることになるので、単純な足し算ではないが、このチームをみてみたい。

In [42]:
## 番外編
### バランスの良い（各ポジションのスコアが高い）選手を探してみる
### 各ポジションの予測値の合計値を算出

In [43]:
# Total of the five position scores per player, highest first. The single
# unnamed column of the resulting frame is labelled 0.
pos_sum = (
    scoring_data_over20g[["prob_PG", "prob_SG", "prob_SF", "prob_PF", "prob_C"]]
    .sum(axis=1)
    .to_frame()
    .sort_values(by=0, ascending=False)
)
pos_sum.head(10)

Unnamed: 0_level_0,0
player,Unnamed: 1_level_1
Tony Mitchell_DET,3.848197
Rasual Butler_IND,3.753229
Landry Fields_TOR,3.739885
Kevin Love_MIN,3.709292
Josh Smith_DET,3.658339
Josh McRoberts_CHA,3.644019
LeBron James_MIA,3.553422
Kevin Durant_OKC,3.550159
Paul George_IND,3.548497
Carmelo Anthony_NYK,3.540631


In [44]:
# Narrow the ranking down to starter-level players: append the summed score
# (column 0) to the stats, rank by it, and keep rows with "gs" above 40.
pos_sum2 = pd.concat([scoring_data_over20g, pos_sum], axis=1).sort_values(
    by=0, ascending=False
)
pos_sum2.loc[pos_sum2["gs"] > 40].head(10)

Unnamed: 0,prob_PG,prob_SG,prob_SF,prob_PF,prob_C,pos,cur_pos,age,g,gs,...,ast,stl,blk,tov,pf,pts,ptsp48m,ptspg,mppg,0
Kevin Love_MIN,0.352552,0.779375,0.763066,1.0,0.814299,3,3,25,77,77,...,341,59,35,196,136,2010,34.494101,26.103896,36.324675,3.709292
Josh Smith_DET,0.314345,0.478686,1.0,0.993935,0.871374,2,2,28,77,76,...,252,105,110,199,197,1264,22.224176,16.415584,35.454545,3.658339
Josh McRoberts_CHA,0.724323,0.840972,1.0,0.585198,0.493527,3,2,26,78,78,...,333,58,46,83,189,661,13.444068,8.474359,30.25641,3.644019
LeBron James_MIA,0.555907,1.0,0.778336,0.741021,0.478157,3,1,29,77,77,...,488,121,26,270,126,2089,34.552722,27.12987,37.688312,3.553422
Kevin Durant_OKC,0.432591,1.0,0.691338,0.820112,0.606119,2,1,25,81,81,...,445,103,59,285,174,2593,39.866752,32.012346,38.54321,3.550159
Paul George_IND,0.580082,0.898667,1.0,0.701187,0.368561,2,2,23,80,80,...,283,151,22,224,198,1737,28.770186,21.7125,36.225,3.548497
Carmelo Anthony_NYK,0.309774,0.714683,0.824974,1.0,0.6912,3,3,29,77,77,...,242,95,51,198,224,2112,33.995976,27.428571,38.727273,3.540631
Rudy Gay_TOT,0.326308,0.708148,1.0,0.852536,0.630404,2,2,27,73,73,...,209,95,56,224,171,1457,27.631766,19.958904,34.671233,3.517397
James Harden_HOU,0.696282,1.0,0.772216,0.649429,0.392196,1,1,24,73,73,...,446,115,29,265,177,1851,31.994238,25.356164,38.041096,3.510122
DeAndre Jordan_LAC,0.388527,0.775872,0.659658,0.67516,1.0,4,4,25,82,82,...,74,80,203,123,264,856,14.316376,10.439024,35.0,3.499217


### 各チームのエース、スター選手が並んだ。ここに上がっている選手は、一人で何でもやれる印象が強い。DeAndre Jordanはオールラウンダーというよりは、プレースタイルが独特のため、スタッツからはポジションが決められなかったということかもしれない。まあ、それはそれで上位10人に入ったことへの納得はいく。