In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

%matplotlib inline

In [2]:
# Load the preprocessed dataset from disk into the working frame `df`.
DATA_PATH = 'data/preprocessed_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Winner,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,...,weight_class_Women's Strawweight,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,Red,True,5,0.0,4.0,0.0,9.2,6.0,0.2,0.0,...,0,0,1,0,0,0,0,1,0,0
1,Red,True,5,0.0,3.0,0.0,14.6,9.1,11.8,7.3,...,0,0,1,0,0,0,0,0,1,0
2,Red,False,3,0.0,3.0,0.0,15.354839,11.322581,6.741935,4.387097,...,0,0,1,0,0,0,0,1,0,0
3,Blue,False,3,0.0,4.0,0.0,17.0,14.0,13.75,11.0,...,0,0,0,0,0,1,0,1,0,0
4,Blue,False,3,0.0,1.0,0.0,17.0,14.5,2.5,2.0,...,0,0,0,0,1,0,0,0,1,0


### Standardizing numeric features

In [4]:
# Select every numeric (integer and float) column; these are the columns that
# get scaled in the next cell.
# The `np.float` / `np.int` aliases were deprecated in NumPy 1.20 and removed
# in NumPy 1.24, where referencing them raises AttributeError. The 'number'
# selector covers all numeric dtypes and still leaves boolean and object
# columns out of the selection.
df_num = df.select_dtypes(include='number')

In [5]:
# Standardise the numeric columns of `df` in place, overwriting the raw values
# with their scaled counterparts.
numeric_cols = df_num.columns.tolist()

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split below, so test-set statistics leak into the scaling. Consider fitting
# on the training rows only.

### Splitting into train and test sets

In [6]:
# Separate the label column from the features: `y` is the 'Winner' column and
# `X` is every remaining column of `df`.
target_col = 'Winner'
X = df.drop(target_col, axis=1)
y = df[target_col]

# Hold out 10% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=43,
)

### Fitting baseline Random Forest

In [7]:
# Baseline random forest: 100 trees, out-of-bag scoring enabled, fixed seed
# for reproducibility.
rf_params = dict(n_estimators=100, oob_score=True, random_state=43)
model = RandomForestClassifier(**rf_params)

In [8]:
# Train the forest on the training split. `fit` returns the estimator itself,
# so the cell displays the fitted model's parameters.
model.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=43, verbose=0, warm_start=False)

In [9]:
# Out-of-bag score of the fitted forest (available because the model was
# created with oob_score=True): an accuracy estimate computed on the training
# rows each tree did not see in its bootstrap sample.
model.oob_score_

0.66491336633663367

In [10]:
# Predict the winner label for every row of the held-out test split.
y_preds = model.predict(X=X_test)

In [11]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_preds)

0.69444444444444442

In [12]:
# Per-feature importance scores from the fitted forest, one value per column
# of the training features, in the same order as the columns of `X_train`.
model.feature_importances_

array([  4.41323301e-04,   5.55736851e-04,   2.25287106e-03,
         3.48939411e-03,   0.00000000e+00,   8.75121320e-03,
         9.94038763e-03,   7.97435049e-03,   8.06207602e-03,
         1.41007880e-02,   1.19687995e-02,   8.12569885e-03,
         8.11998811e-03,   1.22017619e-02,   1.04892706e-02,
         4.30512450e-03,   9.15336171e-03,   8.98870564e-03,
         6.52802436e-03,   3.48227675e-03,   1.34063918e-02,
         1.18060367e-02,   9.43271211e-03,   4.72990429e-03,
         9.09197135e-03,   6.90225291e-03,   7.12259915e-03,
         8.60985194e-03,   8.11749296e-03,   3.82403631e-03,
         3.39350095e-03,   8.33406315e-03,   7.22956248e-03,
         9.27035861e-03,   8.58298844e-03,   1.06650041e-02,
         1.04871666e-02,   8.51068910e-03,   7.41219834e-03,
         8.72286907e-03,   8.71147199e-03,   4.47247007e-03,
         8.54105512e-03,   8.65444529e-03,   6.62872786e-03,
         3.07170850e-03,   9.07439867e-03,   8.17475224e-03,
         8.91933710e-03,