In [722]:
import sqlite3
import pandas as pd
from pandas import DataFrame, Series
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [723]:
con = sqlite3.connect('race.db')

In [724]:
#テーブル名を把握する
cursor = con.cursor()
cursor.execute('select * from sqlite_master WHERE type="table"')
cursor.fetchall()



[('table',
  'race_info',
  'race_info',
  2,
  'CREATE TABLE race_info (\n  id integer primary key autoincrement,\n\n  race_name     text    not null,\n  surface       text    not null,\n  distance      integer not null,\n  weather       text    not null,\n  surface_state text    not null,\n\n  race_start    text    not null,\n  race_number   integer not null,\n\n  surface_score integer,\n  date          text    not null,\n  place_detail  text    not null,\n  race_class    text    not null\n)'),
 ('table',
  'sqlite_sequence',
  'sqlite_sequence',
  3,
  'CREATE TABLE sqlite_sequence(name,seq)'),
 ('table',
  'race_result',
  'race_result',
  6,
  'CREATE TABLE race_result (\n  race_id integer not null,\n\n  order_of_finish text    not null,\n  frame_number       integer not null,\n  horse_number       integer not null,\n  horse_id           text    not null,\n  sex                text    not null,\n  age                integer not null,\n  basis_weight       real    not null,\n  jock

In [725]:
# SQL文を文字列として書く。複数行になっても良いように、引用符を3つ書いておく。

sql_query = '''SELECT race_id,horse_number,grade, order_of_finish, age, dhweight,distance, enterTimes,
                                        hweight, jwinper, odds, owinper, sex, surface, twinper, weather,
                                        weight, jEps, month, course, placeCode, headCount
                                        FROM feature'''

df = pd.read_sql(sql_query,con)
df.to_csv('feature.csv')
#df_result = pd.read_sql(query,con)

In [726]:
df

Unnamed: 0,race_id,horse_number,grade,order_of_finish,age,dhweight,distance,enterTimes,hweight,jwinper,...,sex,surface,twinper,weather,weight,jEps,month,course,placeCode,headCount
0,1,6,4,1.0,2.0,0.0,1500.0,0.0,474.0,0.266667,...,牝,芝,0.000000,晴,54.0,410.413333,8.0,右,札幌,10.0
1,1,8,4,2.0,2.0,-8.0,1500.0,0.0,448.0,0.000000,...,牡,芝,0.000000,晴,54.0,64.923077,8.0,右,札幌,10.0
2,1,7,4,3.0,2.0,0.0,1500.0,0.0,476.0,0.111111,...,牡,芝,0.000000,晴,54.0,77.777778,8.0,右,札幌,10.0
3,1,2,4,4.0,2.0,0.0,1500.0,0.0,484.0,0.000000,...,牡,芝,0.000000,晴,51.0,32.500000,8.0,右,札幌,10.0
4,1,5,4,5.0,2.0,6.0,1500.0,0.0,518.0,0.000000,...,牡,芝,0.166667,晴,54.0,40.000000,8.0,右,札幌,10.0
5,1,3,4,6.0,2.0,2.0,1500.0,0.0,466.0,0.176471,...,牡,芝,0.142857,晴,54.0,237.711765,8.0,右,札幌,10.0
6,1,10,4,7.0,2.0,8.0,1500.0,0.0,444.0,0.000000,...,牡,芝,0.000000,晴,53.0,76.500000,8.0,右,札幌,10.0
7,1,9,4,8.0,2.0,0.0,1500.0,0.0,442.0,0.066667,...,牡,芝,0.000000,晴,54.0,138.326667,8.0,右,札幌,10.0
8,1,1,4,9.0,2.0,0.0,1500.0,0.0,424.0,0.125000,...,牡,芝,0.000000,晴,54.0,62.500000,8.0,右,札幌,10.0
9,1,4,4,10.0,2.0,-2.0,1500.0,1.0,500.0,0.100000,...,牡,芝,0.083333,晴,54.0,242.600000,8.0,右,札幌,10.0


In [727]:
pd.value_counts(df['sex'])

牡    43701
牝    26963
セ     2336
Name: sex, dtype: int64

In [728]:
pd.value_counts(df['placeCode'])

東京    10095
中山     9503
京都     9365
阪神     8124
中京     7594
札幌     6896
福島     5913
新潟     5785
小倉     5486
函館     4239
Name: placeCode, dtype: int64

In [729]:
#null値を持つレコードを消しておく
df=df.dropna()
df.reset_index(drop=True, inplace=True) #drop=Trueにしないと古いindexが残る
df.head()

Unnamed: 0,race_id,horse_number,grade,order_of_finish,age,dhweight,distance,enterTimes,hweight,jwinper,...,sex,surface,twinper,weather,weight,jEps,month,course,placeCode,headCount
0,1,6,4,1.0,2.0,0.0,1500.0,0.0,474.0,0.266667,...,牝,芝,0.0,晴,54.0,410.413333,8.0,右,札幌,10.0
1,1,10,4,7.0,2.0,8.0,1500.0,0.0,444.0,0.0,...,牡,芝,0.0,晴,53.0,76.5,8.0,右,札幌,10.0
2,1,9,4,8.0,2.0,0.0,1500.0,0.0,442.0,0.066667,...,牡,芝,0.0,晴,54.0,138.326667,8.0,右,札幌,10.0
3,1,1,4,9.0,2.0,0.0,1500.0,0.0,424.0,0.125,...,牡,芝,0.0,晴,54.0,62.5,8.0,右,札幌,10.0
4,1,4,4,10.0,2.0,-2.0,1500.0,1.0,500.0,0.1,...,牡,芝,0.083333,晴,54.0,242.6,8.0,右,札幌,10.0


In [730]:
df[df.race_id==11]

Unnamed: 0,race_id,horse_number,grade,order_of_finish,age,dhweight,distance,enterTimes,hweight,jwinper,...,sex,surface,twinper,weather,weight,jEps,month,course,placeCode,headCount
84,11,8,2,5.0,6.0,-8.0,1200.0,1.0,480.0,0.111111,...,牝,芝,0.166667,晴,55.0,77.777778,8.0,右,札幌,10.0
85,11,7,2,6.0,4.0,6.0,1200.0,0.0,512.0,0.125,...,牡,芝,0.0,晴,57.0,244.63125,8.0,右,札幌,10.0
86,11,3,2,7.0,3.0,0.0,1200.0,0.0,466.0,0.0,...,牡,芝,0.0,晴,54.0,32.5,8.0,右,札幌,10.0
87,11,9,2,8.0,3.0,-4.0,1200.0,0.0,468.0,0.0,...,牡,芝,0.0,晴,54.0,110.825,8.0,右,札幌,10.0


In [731]:
#dropnaされたレコードが一着だった場合そのレースデータを全削除する(意味のない予測になるので)
race = pd.value_counts(df.race_id).index
race=race.sort_values()

for n in race:
    tmp = df[df.race_id==n]
    if tmp.min()['order_of_finish'] != 1:
        df = df[df.race_id!=n]

df.reset_index(drop=True, inplace=True)

In [732]:
df[df.order_of_finish==1].head()

Unnamed: 0,race_id,horse_number,grade,order_of_finish,age,dhweight,distance,enterTimes,hweight,jwinper,...,sex,surface,twinper,weather,weight,jEps,month,course,placeCode,headCount
0,1,6,4,1.0,2.0,0.0,1500.0,0.0,474.0,0.266667,...,牝,芝,0.0,晴,54.0,410.413333,8.0,右,札幌,10.0
5,3,1,4,1.0,3.0,0.0,1700.0,0.0,532.0,0.0,...,牡,ダ,0.0,晴,56.0,110.825,8.0,右,札幌,12.0
12,4,6,5,1.0,2.0,0.0,1200.0,0.0,442.0,0.111111,...,牝,芝,0.333333,晴,54.0,151.466667,8.0,右,札幌,14.0
19,5,16,4,1.0,3.0,0.0,2000.0,0.0,456.0,0.0,...,牝,芝,0.0,晴,54.0,40.0,8.0,右,札幌,16.0
31,6,9,3,1.0,5.0,-4.0,1700.0,0.0,424.0,0.1,...,牡,ダ,0.0,晴,57.0,242.6,8.0,右,札幌,12.0


In [733]:
#ダミー変数を作成, 元の属性を削除
sexes=pd.get_dummies(df['sex'])
df = pd.concat((df,sexes), axis=1)
df.drop('sex', axis=1, inplace=True)

sur = pd.get_dummies(df['surface'])
df= pd.concat((df,sur), axis=1)
df.drop('surface', axis=1, inplace=True)

weather = pd.get_dummies(df['weather'])
df= pd.concat((df,weather), axis=1)
df.drop('weather', axis=1, inplace=True)

course = pd.get_dummies(df['course'])
df= pd.concat((df,course), axis=1)
df.drop('course', axis=1, inplace=True)

place = pd.get_dummies(df['placeCode'])
df= pd.concat((df,place), axis=1)
df.drop('placeCode', axis=1, inplace=True)

mon=pd.get_dummies(df['month'])
df=pd.concat((df,mon), axis=1)
df.drop('month', axis=1, inplace=True)

gra=pd.get_dummies(df['grade'])
df=pd.concat((df,gra), axis=1)
df.drop('grade', axis=1, inplace=True)

In [734]:
#順位を優勝有無に変換
def order_of_finish_towinloss(race):
    order = race
    if order ==1:
    #if order <=3:
        return 1
    else:
        return 0

    
df['winloss']=df['order_of_finish'].apply(order_of_finish_towinloss)
df.drop('order_of_finish', axis=1, inplace=True)

In [735]:
#データ整形完了。データ数を確認
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68976 entries, 0 to 68975
Data columns (total 58 columns):
race_id         68976 non-null int64
horse_number    68976 non-null int64
age             68976 non-null float64
dhweight        68976 non-null float64
distance        68976 non-null float64
enterTimes      68976 non-null float64
hweight         68976 non-null float64
jwinper         68976 non-null float64
odds            68976 non-null float64
owinper         68976 non-null float64
twinper         68976 non-null float64
weight          68976 non-null float64
jEps            68976 non-null float64
headCount       68976 non-null float64
セ               68976 non-null float64
牝               68976 non-null float64
牡               68976 non-null float64
ダ               68976 non-null float64
芝               68976 non-null float64
小雨              68976 non-null float64
晴               68976 non-null float64
曇               68976 non-null float64
雨               68976 non-null float6

In [736]:
#不均衡データすぎだな。
pd.value_counts(df['winloss'])

0    63914
1     5062
Name: winloss, dtype: int64

In [737]:
#最小オッズ戦略のためのdfを作り、dfからrace_idを消す
odds_df = pd.concat([DataFrame(df.race_id), DataFrame(df.odds), DataFrame(df.winloss)], axis=1)
df_raceid = DataFrame(df.race_id)  #race_idもとっておく
df.drop('race_id',axis=1, inplace=True)
odds_df.head()

Unnamed: 0,race_id,odds,winloss
0,1,3.2,1
1,1,77.4,0
2,1,53.4,0
3,1,43.4,0
4,1,10.5,0


In [738]:
# temp=odds_df[odds_df.race_id==5]
# temp['predict']=0
# temp.ix[temp.odds==temp.min()['odds'], 'predict']=1
# ret = temp
# temp=odds_df[odds_df.race_id==6]
# temp['predict']=0
# temp.ix[temp.odds==temp.min()['odds'], 'predict']=1
# ret=pd.concat([ret, temp])
# ret.ix[:,'predict']


In [739]:
# #最少オッズの予測を1、それ以外を0にする
#race=pd.value_counts(odds_df.race_id).index
#race=rac.sort_values()
# for n in race:
#     tmp = odds_df[odds_df.race_id==n]
#     tmp['predict'] = 0
#     tmp.ix[tmp.odds==tmp.min()['odds'], 'predict']=1
    
#     if n==0:
#         new_odds_df = tmp
#     else:
#         new_odds_df=pd.concat([new_odds_df,tmp])

# new_odds_df.reset_index(drop=True,inplace=True)


In [740]:
new_odds_df.shape

(61089, 4)

In [741]:
# #最少オッズ戦略の正解率
# accuracy=0
# for key,row in new_odds_df.iterrows():
#     if row['predict'] == row['winloss']:
#         accuracy=accuracy+1

# accuracy /= new_odds_df.shape[0]
# accuracy

In [742]:
#支持率のみでデータフレームを作成しておく(対抗手法)
support = DataFrame(0.788/(df['odds']-0.1))
support.rename(columns={'odds':'support'}, inplace=True)

sup_df = pd.concat([support,DataFrame(df['winloss'])], axis=1)
sup_df.head()

Unnamed: 0,support,winloss
0,0.254194,1
1,0.010194,0
2,0.014784,0
3,0.018199,0
4,0.075769,0


In [743]:
#まずテストデータと訓練データに分ける
train = df.ix[0:df.shape[0]-5001]
test = df.ix[df.shape[0]-5000:]
print(train.shape, test.shape)
print(test[test.winloss==1].shape)

sup_train = sup_df.ix[0:sup_df.shape[0]-5001]
sup_test = sup_df.ix[sup_df.shape[0]-5000:]

#確率最大馬割り当て法(レースで一着だと予測される馬が複数あった場合、確率最大の馬だけに1を割り当てて他を0にする)のためのid
race_id_train=DataFrame(df_raceid.ix[0:df.shape[0]-5001])
race_id_train.reset_index(drop=True,inplace=True)

race_id_test=DataFrame(df_raceid.ix[df.shape[0]-5000:])
race_id_test.reset_index(drop=True,inplace=True)

race_id_sup_train = DataFrame(df_raceid.ix[0:sup_df.shape[0]-5001])
race_id_sup_train.reset_index(drop=True, inplace=True)

race_id_sup_test = DataFrame(df_raceid.ix[sup_df.shape[0]-5000:])
race_id_sup_test.reset_index(drop=True, inplace=True)
test

(63976, 57) (5000, 57)
(400, 57)


Unnamed: 0,horse_number,age,dhweight,distance,enterTimes,hweight,jwinper,odds,owinper,twinper,...,10.0,11.0,12.0,0,1,2,3,4,5,winloss
63976,1,3.0,-2.0,1700.0,3.0,414.0,0.050000,22.9,0.117647,0.08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
63977,7,3.0,2.0,1700.0,6.0,458.0,0.090000,48.1,0.000000,0.08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
63978,4,3.0,0.0,1700.0,0.0,466.0,0.100000,13.4,0.062500,0.08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
63979,9,3.0,14.0,1700.0,5.0,510.0,0.090000,29.0,0.061856,0.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
63980,15,3.0,-6.0,1700.0,4.0,482.0,0.070000,50.6,0.070000,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
63981,11,3.0,-14.0,1700.0,3.0,438.0,0.040000,232.4,0.000000,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
63982,3,3.0,0.0,1700.0,0.0,492.0,0.020000,228.5,0.000000,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
63983,4,3.0,4.0,1200.0,5.0,430.0,0.020000,7.8,0.032258,0.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
63984,11,3.0,0.0,1200.0,10.0,468.0,0.080000,4.4,0.050000,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
63985,10,3.0,12.0,1200.0,6.0,466.0,0.030000,21.7,0.030000,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [744]:
#訓練データについて、優勝と非優勝でデータを分ける
train_1 = train[train['winloss']==1]
train_0 = train[train['winloss']!=1]
train_0.reset_index(drop=True, inplace=True)
print(train_1.shape, train_0.shape)

suptrain_1 = sup_train[sup_train['winloss']==1]
suptrain_0 = sup_train[sup_train['winloss']!=1]
suptrain_0.reset_index(drop=True, inplace=True)

(4662, 57) (59314, 57)


In [745]:
#以下均衡データを作る。方針はダウンサンプリング
#非優勝データから優勝データ数分だけランダムに取り出す
def perm(df_0, df_1):
    idx = np.random.permutation(df_0.shape[0])
    for n in range(df_1.shape[0]):
        if n==0:
            temp_df = DataFrame(df_0.ix[idx[n]]).T
        
        else:
            temp_df=pd.concat([temp_df, DataFrame(df_0.ix[idx[n]]).T])

    return  temp_df

In [746]:
#取り出された非優勝データと優勝データを結合し均衡データを作る

#提案手法
new_train_0 = perm(train_0, train_1)
train = pd.concat([train_1,new_train_0], ignore_index=True)

#均衡になってる？
print(pd.value_counts(train.winloss))

#オッズなしにする
#new_train.drop('odds', axis=1, inplace=True)
#test.drop('odds', axis=1, inplace=True)

#対抗手法
new_suptrain_0  = perm(suptrain_0, suptrain_1)
sup_train = pd.concat([suptrain_1,new_suptrain_0], ignore_index=True)
sup_train.info()

1.0    4662
0.0    4662
Name: winloss, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9324 entries, 0 to 9323
Data columns (total 2 columns):
support    9324 non-null float64
winloss    9324 non-null float64
dtypes: float64(2)
memory usage: 218.5 KB


In [747]:
#グリッドサーチで最適モデルを返す
def randomforest(dframe):
    params =  [{'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150, 200], 
           'max_features':['auto', 'sqrt', 'log2', None]}]
    model = GridSearchCV(RandomForestClassifier(oob_score=True),
                     params, cv=3, scoring='accuracy', n_jobs=-1)
    if dframe.shape[1]>2:
        model.fit(dframe.ix[:, 0:dframe.shape[1]-2], dframe.ix[: ,dframe.shape[1]-1])
    else:
        model.fit(DataFrame(dframe.ix[:,0]), dframe.ix[:, 1])
    
    return model.best_estimator_

In [748]:
#学習
model = randomforest(train)


#注意：model.fitとpredictはXの引数がdataframeでないとエラーになる
model_sup = randomforest(sup_train)

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OO

In [749]:
#予測
output_train = model.predict(train.ix[:, 0:train.shape[1]-2])
output_test = model.predict(test.ix[:, 0:test.shape[1]-2])

output_suptrain = model_sup.predict(DataFrame(sup_train.ix[: , 0]))
output_suptest =  model_sup.predict(DataFrame(sup_test.ix[:, 0]))

In [None]:
#最大確率馬割り当て法 
#モデル、訓練データorテストデータ、モデルの出力、race_idを引数
def maximum_prob(model, df, output,race_id):

    predict_df=DataFrame(model.predict_proba(df.ix[:, 0:df.shape[1]-2]), columns=['prob0', 'prob1'])
    predict_df=pd.concat([predict_df, DataFrame(output, columns=['predict'])], axis=1)
    winloss = DataFrame(df.winloss)
    winloss.reset_index(drop=True,inplace=True)
    predict_df=pd.concat([predict_df, winloss], axis=1)
    predict_df=pd.concat([predict_df, race_id], axis=1)
    predict_df['predict_odds'] = predict_df.prob1/predict_df.prob0 #これが最大の馬に一着とする
    raceid=pd.value_counts(predict_df.race_id).index
    raceid=raceid.sort_values()
    for n in raceid:
        tmp = predict_df[predict_df.race_id==n]
        tmp.predict=0
        tmp.ix[tmp.predict_odds==tmp.max()['predict_odds'], 'predict']=1
        if n==raceid[0]:
            output_df = tmp
        else:
            output_df=pd.concat([output_df, tmp])
        
    return output_df.predict
   
output_train = maximum_prob(model, train,output_train, race_id_train)   
output_test = maximum_prob(model, test, output_test, race_id_test)
output_suptrain = maximum_prob(model_sup, sup_train, output_suptrain,race_id_sup_train)
output_suptest = maximum_prob(model_sup, sup_test, output_suptest,race_id_sup_test)
# predict_df=DataFrame(model.predict_proba(test.ix[:, 0:test.shape[1]-2]), columns=['prob0', 'prob1'])
# predict_df=pd.concat([predict_df, DataFrame(output_test, columns=['predict'])], axis=1)
# winloss = DataFrame(test.winloss)
# winloss.reset_index(drop=True,inplace=True)
# predict_df=pd.concat([predict_df, winloss], axis=1)
# predict_df=pd.concat([predict_df, race_id_test], axis=1)
# predict_df['predict_odds'] = predict_df.prob1/predict_df.prob0 #これが最大の馬に一着とする

# raceid=pd.value_counts(predict_df.race_id).index
# raceid=raceid.sort_values()
# for n in raceid:
#     tmp = predict_df[predict_df.race_id==n]
#     tmp.predict=0
#     tmp.ix[tmp.predict_odds==tmp.max()['predict_odds'], 'predict']=1
    
#     if n==raceid[0]:
#         output_df = tmp
#     else:
#         output_df=pd.concat([output_df, tmp])
        


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
#正解率、適合率、再現率を見る
print('学習', accuracy_score(output_train,train.ix[:, train.shape[1]-1]))
print('予測', accuracy_score(output_test, test.ix[:, test.shape[1]-1]))

print('学習', accuracy_score(output_suptrain,sup_train.ix[:, 1]))
print('予測',accuracy_score(output_suptest, sup_test.ix[:, 1]))

target_names = [0,1]
print(classification_report(test.ix[:, test.shape[1]-1], output_test,target_names))
print(classification_report(sup_test.ix[:, 1], output_suptest,target_names))

In [None]:
#ロジスティック回帰もしてみる

log_model=LogisticRegression()
log_model.fit(train.ix[:, 0:train.shape[1]-2], train.winloss.values)
print('学習',log_model.score(train.ix[:, 0:train.shape[1]-2], train.winloss.values))
predict = log_model.predict(test.ix[:, 0:test.shape[1]-2])
print('汎化',accuracy_score(predict,test.winloss.values),'\n')

log_model_s=LogisticRegression()
log_model_s.fit(DataFrame(sup_train.support), sup_train.winloss.values)
print('学習', log_model_s.score(DataFrame(sup_train.support), sup_train.winloss.values))
predict_s = log_model_s.predict(DataFrame(sup_test.support))
print('汎化', accuracy_score(predict_s,sup_test.winloss))

target_names = [0,1]
print(classification_report(test.winloss.values, predict,target_names))
print(classification_report(sup_test.winloss, predict_s,target_names))


In [None]:
#重要度フレームを降順で作成
imp=Series(model.feature_importances_)
col = Series(train.columns[0:train.shape[1]-1])
imp = pd.concat([col, imp], axis=1)
imp.rename(columns={0: 'feature', 1:'importance'}, inplace=True)
imp.sort_values('importance', ascending=False)

In [402]:
#これでrace_idマスタができる
rac=pd.value_counts(odds_df.race_id).index
rac=rac.sort_values()


In [671]:
dataf = DataFrame([[4,3],[4,0],[4,1],[1,3]], columns={'aa','y'})

In [672]:
dataf

Unnamed: 0,aa,y
0,4,3
1,4,0
2,4,1
3,1,3


In [673]:
dataf.min()['y']

0

In [674]:
#一個しか消えない
dataf.drop(dataf.aa==4)

Unnamed: 0,aa,y
2,4,1
3,1,3


In [675]:
dataf

Unnamed: 0,aa,y
0,4,3
1,4,0
2,4,1
3,1,3


In [676]:
dataf.ix[dataf.aa==dataf.max()['aa'], 'aa']='max!!'

In [677]:
dataf

Unnamed: 0,aa,y
0,max!!,3
1,max!!,0
2,max!!,1
3,1,3
