# Racing Baseline Script

In [1]:
import numpy as np
import pandas as pd

# Load Data

In [2]:
df = pd.read_csv("../Dataset/hr_race.csv")
df = df.drop(['Unnamed: 0', 'id'], axis=1)
df.head(3)

Unnamed: 0,rcDate,meet,rcNo,ord,rcTime,age,chulNo,corner_txt,differ,hrName,jkName,owName,plc,prdCtyNm,sex,trName,wgBudam,wgHr,win
0,2000-01-30,서울,1,3.0,01:20:01,4,1,- - - 10 - 9 -,4,웨스턴타이틀,함완식,이근우,1.1,호,암,김문갑,54.0,461.0,1.6
1,2000-01-30,서울,1,11.0,01:22:00,4,2,- - - 7 - 7 -,2,두루미,박수홍,범양해운,31.5,한,암,이왕언,53.0,428.0,610.6
2,2000-01-30,서울,1,12.0,01:22:01,5,3,- - - 12 - 11 -,2½,서천,박동랑,어울림네트웍스,5.8,뉴,암,김문갑,52.0,391.0,52.1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466783 entries, 0 to 466782
Data columns (total 19 columns):
rcDate        466783 non-null object
meet          466783 non-null object
rcNo          466783 non-null int64
ord           453232 non-null float64
rcTime        454858 non-null object
age           466783 non-null int64
chulNo        466783 non-null int64
corner_txt    466783 non-null object
differ        418181 non-null object
hrName        466783 non-null object
jkName        466783 non-null object
owName        466783 non-null object
plc           461067 non-null float64
prdCtyNm      466783 non-null object
sex           466783 non-null object
trName        466783 non-null object
wgBudam       466783 non-null float64
wgHr          466064 non-null float64
win           461091 non-null float64
dtypes: float64(5), int64(3), object(11)
memory usage: 67.7+ MB


## Feature
- rcDate : 경기가 열린 날 / 2000-01-30 부터 2017-01-06 까지
- meet : 경기가 열린 장소 / 서울, 제주, 부경
- rcNo : 경주시행일의 경주번호 / 1 부터 15 까지
- ord : 출발검사 도착순위
- rcTime : 경주시간(?)
- age : 말의 나이 / 4 부터 27 까지
- chulNo : 경주 출발게이트 번호 / 1 부터 16 까지
- corner_txt : 통과지점별통과순위
- differ : 선순위 도착마와의 도착 마신 차이 / 머리, 목, 주행중지, 출전제외 등
- hrName : 말의이름 / 웨스턴타이틀, 두루미 등
- jkName : 경주마 기승하는 기수명
- owName : 경주마 소유한 마주명
- plc : 해당마의 연승식 적중시 환급 배당률
- prdCtyNm : 경주마 생산국가명 / 호, 한, 뉴, 미 등
- sex : 말의성별 / 암, 거, 수
- trName : 조교사명
- wgBudam : 경주마의 부담중량(KG)
- wgHr : 경주마 해당일 체중(KG)
- win : 해당마의 단승식 적중시 환급 배당률

# Make dataset
- Training set: 80%
- Test set: 20%
- isWin: boolean

In [4]:
from datetime import datetime
from datetime import timedelta
import re

In [5]:
df['rcDate'] = pd.to_datetime(df['rcDate'])
df['year'] = pd.to_datetime(df.rcDate).dt.year
df['rcTime'] = pd.to_datetime(df['rcTime']).apply(lambda d: datetime.time(d))

In [6]:
df['ord'] = np.nan_to_num(df['ord']).astype(int)
df['isWin'] = df['ord']==(1 or 2)
df = df.drop(df['ord']==0)

In [7]:
train = df[df['year'] < 2014]
test = df[df['year'] >= 2014]

# Preprocessing
- 경기전까지 알 수 있는 신뢰할 수 있는 정보를 사용
- 경주마 정보, 기수 정보, 조교사 정보 등

## 1. One-Hot Encoding
- prdCtyNm : 경주마 생산국가명 / 호, 한 등 17개국
- sex : 말의성별 / 암, 거, 수

In [8]:
cols_to_transform = ['prdCtyNm', 'sex']
train = pd.get_dummies(train, columns=cols_to_transform)

In [9]:
train.head(5)

Unnamed: 0,rcDate,meet,rcNo,ord,rcTime,age,chulNo,corner_txt,differ,hrName,...,prdCtyNm_일,prdCtyNm_중,prdCtyNm_캐,prdCtyNm_프,prdCtyNm_한,prdCtyNm_헨,prdCtyNm_호,sex_거,sex_수,sex_암
2,2000-01-30,서울,1,12,01:22:01,5,3,- - - 12 - 11 -,2½,서천,...,0,0,0,0,0,0,0,0,0,1
3,2000-01-30,서울,1,8,01:21:01,4,4,- - - 8 - 8 -,½,런패스터,...,0,0,0,0,0,0,0,0,0,1
4,2000-01-30,서울,1,10,01:22:00,4,5,- - - 11 - 12 -,¾,한가람,...,0,0,0,0,0,0,0,0,0,1
5,2000-01-30,서울,1,7,01:21:01,6,6,- - - 6 - 6 -,2,캐터리나블루,...,0,0,0,0,0,0,0,0,0,1
6,2000-01-30,서울,1,13,01:24:00,3,7,- - - 13 - 13 -,6,데이빗슨,...,0,0,0,0,0,0,0,1,0,0


## 2. Vectorization
- hrName : 말의이름 / 웨스턴타이틀, 두루미 등
- jkName : 경주마 기승하는 기수명
- owName : 경주마 소유한 마주명
- trName : 조교사명

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
vectorizer = TfidfVectorizer(lowercase=False)

In [None]:
train['jkVec'] = vectorizer.fit_transform(train['jkName'])
train['owName'] = vectorizer.fit_transform(train['owName'])
train['trName'] = vectorizer.fit_transform(train['trName'])

In [28]:
train['hrVec'] = vectorizer.fit_transform(train['hrName'])

## Make dataset
- Training set : 80%
- Test set : 20%

In [22]:
train.jkVec.shape

(351177,)

# Modeling

In [None]:
predictors = ["Sex", "Age", "Embarked"]
x = titanic[predictors]
y = titanic["Survived"]

## DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
decision = DecisionTreeClassifier().fit(x,y)
print(decision)

## RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
forst = RandomForestClassifier(n_jobs=-1).fit(x,y)
print(forst)

## Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
decision_score = cross_val_score(decision, x, y, cv=5).mean()
forst_score = cross_val_score(forst, x, y, cv=5).mean()

print("DecisionTree = {0:.6f}".format(decision_score))
print("RandomForest = {0:.6f}".format(forst_score))