In [32]:
from d3m_ta2_nyu.d3mds import D3MDS
import pandas
import sklearn.metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import Imputer
from sklearn.tree.tree import DecisionTreeClassifier

# Read data

In [2]:
ds = D3MDS('data/185_baseball/TRAIN/dataset_TRAIN', 'data/185_baseball/TRAIN/problem_TRAIN')

In [3]:
data = ds.get_train_data()
targets = ds.get_train_targets()
target_names = [t['colName'] for t in ds.problem.get_targets()]

In [4]:
data.head()

Unnamed: 0_level_0,Player,Number_seasons,Games_played,At_bats,Runs,Hits,Doubles,Triples,Home_runs,RBIs,Walks,Strikeouts,Batting_average,On_base_pct,Slugging_pct,Fielding_ave,Position
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,HANK_AARON,23,3298,12364,2174,3771,624,98,755,2297,1402,1383.0,0.305,0.377,0.555,0.98,Outfield
1,JERRY_ADAIR,13,1165,4019,378,1022,163,19,57,366,208,499.0,0.254,0.294,0.347,0.985,Second_base
3,BOBBY_ADAMS,14,1281,4019,591,1082,188,49,37,303,414,447.0,0.269,0.34,0.368,0.955,Third_base
4,JOE_ADCOCK,17,1959,6606,823,1832,295,35,336,1122,594,1059.0,0.277,0.339,0.485,0.994,First_base
5,TOMMIE_AGEE,12,1129,3912,558,999,170,27,130,433,342,918.0,0.255,0.321,0.412,0.975,Outfield


# Do split

In [5]:
FOLDS = 4
RANDOM = 65682867  # The most random of all numbers

In [6]:
splits = StratifiedKFold(n_splits=FOLDS, shuffle=True,
                         random_state=RANDOM).split(data, targets)

train_split, test_split = next(splits)

In [7]:
train_data_split = data.loc[data.index[train_split]]
test_data_split = data.loc[data.index[test_split]]

train_target_split = targets[train_split]
test_target_split = targets[test_split]

# Do encoding on categorical columns

In [8]:
encoder = LabelBinarizer(), LabelBinarizer()

In [9]:
train_data_split[['Player', 'Position']].head()

Unnamed: 0_level_0,Player,Position
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,HANK_AARON,Outfield
1,JERRY_ADAIR,Second_base
5,TOMMIE_AGEE,Outfield
6,LUIS_AGUAYO,Shortstop
7,EDDIE_AINSMITH,Catcher


In [10]:
player = encoder[0].fit_transform(train_data_split['Player'])
position = encoder[1].fit_transform(train_data_split['Position'])

In [11]:
player

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
categorical_train = pandas.concat([pandas.DataFrame(player, index=train_data_split.index),
                                   pandas.DataFrame(position, index=train_data_split.index)],
                                  axis=1)
categorical_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,800,801,802,0,1,2,3,4,5,6
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [13]:
player = encoder[0].transform(test_data_split['Player'])
position = encoder[1].transform(test_data_split['Position'])

In [15]:
categorical_test = pandas.concat([pandas.DataFrame(player, index=test_data_split.index),
                                  pandas.DataFrame(position, index=test_data_split.index)], axis=1)
categorical_test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,800,801,802,0,1,2,3,4,5,6
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
37,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


# Do imputation on numerical columns

In [16]:
imputer = Imputer()

In [17]:
numerical_train = train_data_split.drop(['Player', 'Position'], axis=1)
numerical_train.head()

Unnamed: 0_level_0,Number_seasons,Games_played,At_bats,Runs,Hits,Doubles,Triples,Home_runs,RBIs,Walks,Strikeouts,Batting_average,On_base_pct,Slugging_pct,Fielding_ave
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,23,3298,12364,2174,3771,624,98,755,2297,1402,1383.0,0.305,0.377,0.555,0.98
1,13,1165,4019,378,1022,163,19,57,366,208,499.0,0.254,0.294,0.347,0.985
5,12,1129,3912,558,999,170,27,130,433,342,918.0,0.255,0.321,0.412,0.975
6,10,568,1104,142,260,43,10,37,109,94,220.0,0.236,0.307,0.393,0.96
7,15,1078,3048,299,707,108,54,22,317,263,315.0,0.232,0.296,0.324,0.966


In [18]:
numerical_train = imputer.fit_transform(numerical_train)
numerical_train

array([[  2.30000000e+01,   3.29800000e+03,   1.23640000e+04, ...,
          3.77000000e-01,   5.55000000e-01,   9.80000000e-01],
       [  1.30000000e+01,   1.16500000e+03,   4.01900000e+03, ...,
          2.94000000e-01,   3.47000000e-01,   9.85000000e-01],
       [  1.20000000e+01,   1.12900000e+03,   3.91200000e+03, ...,
          3.21000000e-01,   4.12000000e-01,   9.75000000e-01],
       ..., 
       [  1.00000000e+01,   1.12000000e+03,   3.53500000e+03, ...,
          3.57000000e-01,   4.05000000e-01,   9.74000000e-01],
       [  1.90000000e+01,   1.28000000e+03,   4.54600000e+03, ...,
          3.39000000e-01,   3.69000000e-01,   9.52000000e-01],
       [  1.20000000e+01,   1.09500000e+03,   3.28300000e+03, ...,
          2.91000000e-01,   3.72000000e-01,   9.41000000e-01]])

In [19]:
numerical_train = pandas.DataFrame(numerical_train, index=train_data_split.index)
numerical_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,23.0,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,1402.0,1383.0,0.305,0.377,0.555,0.98
1,13.0,1165.0,4019.0,378.0,1022.0,163.0,19.0,57.0,366.0,208.0,499.0,0.254,0.294,0.347,0.985
5,12.0,1129.0,3912.0,558.0,999.0,170.0,27.0,130.0,433.0,342.0,918.0,0.255,0.321,0.412,0.975
6,10.0,568.0,1104.0,142.0,260.0,43.0,10.0,37.0,109.0,94.0,220.0,0.236,0.307,0.393,0.96
7,15.0,1078.0,3048.0,299.0,707.0,108.0,54.0,22.0,317.0,263.0,315.0,0.232,0.296,0.324,0.966


In [20]:
numerical_test = test_data_split.drop(['Player', 'Position'], axis=1)
numerical_test.head()

Unnamed: 0_level_0,Number_seasons,Games_played,At_bats,Runs,Hits,Doubles,Triples,Home_runs,RBIs,Walks,Strikeouts,Batting_average,On_base_pct,Slugging_pct,Fielding_ave
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3,14,1281,4019,591,1082,188,49,37,303,414,447.0,0.269,0.34,0.368,0.955
4,17,1959,6606,823,1832,295,35,336,1122,594,1059.0,0.277,0.339,0.485,0.994
11,11,1195,3927,442,999,140,44,55,342,300,622.0,0.254,0.312,0.354,0.97
16,15,1380,4345,448,1216,170,26,32,377,138,267.0,0.28,0.307,0.353,0.968
37,14,1212,3581,432,915,128,15,155,540,545,577.0,0.256,0.358,0.429,0.986


In [21]:
numerical_test = imputer.transform(numerical_test)
numerical_test

array([[  1.40000000e+01,   1.28100000e+03,   4.01900000e+03, ...,
          3.40000000e-01,   3.68000000e-01,   9.55000000e-01],
       [  1.70000000e+01,   1.95900000e+03,   6.60600000e+03, ...,
          3.39000000e-01,   4.85000000e-01,   9.94000000e-01],
       [  1.10000000e+01,   1.19500000e+03,   3.92700000e+03, ...,
          3.12000000e-01,   3.54000000e-01,   9.70000000e-01],
       ..., 
       [  1.30000000e+01,   1.28000000e+03,   5.37100000e+03, ...,
          3.29000000e-01,   4.03000000e-01,   8.95000000e-01],
       [  1.80000000e+01,   2.10900000e+03,   7.34600000e+03, ...,
          3.95000000e-01,   3.71000000e-01,   9.57000000e-01],
       [  1.30000000e+01,   1.45300000e+03,   5.14400000e+03, ...,
          3.55000000e-01,   4.66000000e-01,   9.81000000e-01]])

In [22]:
numerical_test = pandas.DataFrame(numerical_test, index=test_data_split.index)
numerical_test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3,14.0,1281.0,4019.0,591.0,1082.0,188.0,49.0,37.0,303.0,414.0,447.0,0.269,0.34,0.368,0.955
4,17.0,1959.0,6606.0,823.0,1832.0,295.0,35.0,336.0,1122.0,594.0,1059.0,0.277,0.339,0.485,0.994
11,11.0,1195.0,3927.0,442.0,999.0,140.0,44.0,55.0,342.0,300.0,622.0,0.254,0.312,0.354,0.97
16,15.0,1380.0,4345.0,448.0,1216.0,170.0,26.0,32.0,377.0,138.0,267.0,0.28,0.307,0.353,0.968
37,14.0,1212.0,3581.0,432.0,915.0,128.0,15.0,155.0,540.0,545.0,577.0,0.256,0.358,0.429,0.986


# Merge categorical/numerical

In [23]:
train_data_split = pandas.concat([categorical_train, numerical_train], axis=1)
train_data_split.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,5,6,7,8,9,10,11,12,13,14
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,624.0,98.0,755.0,2297.0,1402.0,1383.0,0.305,0.377,0.555,0.98
1,0,0,0,0,0,0,0,0,0,0,...,163.0,19.0,57.0,366.0,208.0,499.0,0.254,0.294,0.347,0.985
5,0,0,0,0,0,0,0,0,0,0,...,170.0,27.0,130.0,433.0,342.0,918.0,0.255,0.321,0.412,0.975
6,0,0,0,0,0,0,0,0,0,0,...,43.0,10.0,37.0,109.0,94.0,220.0,0.236,0.307,0.393,0.96
7,0,0,0,0,0,0,0,0,0,0,...,108.0,54.0,22.0,317.0,263.0,315.0,0.232,0.296,0.324,0.966


In [26]:
test_data_split = pandas.concat([categorical_test, numerical_test], axis=1)
test_data_split.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,5,6,7,8,9,10,11,12,13,14
d3mIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0,0,0,0,0,0,0,0,0,0,...,188.0,49.0,37.0,303.0,414.0,447.0,0.269,0.34,0.368,0.955
4,0,0,0,0,0,0,0,0,0,0,...,295.0,35.0,336.0,1122.0,594.0,1059.0,0.277,0.339,0.485,0.994
11,0,0,0,0,0,0,0,0,0,0,...,140.0,44.0,55.0,342.0,300.0,622.0,0.254,0.312,0.354,0.97
16,0,0,0,0,0,0,0,0,0,0,...,170.0,26.0,32.0,377.0,138.0,267.0,0.28,0.307,0.353,0.968
37,0,0,0,0,0,0,0,0,0,0,...,128.0,15.0,155.0,540.0,545.0,577.0,0.256,0.358,0.429,0.986


# Run classifier

In [28]:
classifier = DecisionTreeClassifier()

In [30]:
classifier.fit(train_data_split, train_target_split)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [31]:
predictions = classifier.predict(test_data_split)
predictions

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 2,
       0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [33]:
sklearn.metrics.f1_score(test_target_split, predictions, average='macro')

0.58827143805146609