# MNIST digit recognizer trained with XGBoost

In [37]:
import os
import pandas as pd
from pathlib import Path
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [19]:
# Show which files the dataset directory contains before loading anything.
dataset_files = os.listdir('../Datasets/digit-recognizer')
print(dataset_files)

['test.csv', 'sample_submission.csv', 'train.csv']


In [20]:
# Input directory holding the competition CSVs, and the directory results are written to.
in_path = Path('../Datasets/digit-recognizer')
out_path = Path('./digit-recognizer-output')

In [22]:
# Read the labelled training set and preview its first two rows.
train_csv = in_path / 'train.csv'
df = pd.read_csv(train_csv)
df.head(2)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The first column, `label`, contains the digit shown in the image (an integer between 0 and 9). The remaining 784 columns (`pixel0` to `pixel783`) hold one pixel each; together they give the greyscale brightness (values 0-255) of a flattened 28x28 image.

In [24]:
# Target vector: the 'label' column of the training frame.
Y = df.loc[:, 'label']

In [30]:
# Feature matrix: every column of the training frame except the target.
X = df.drop(columns=['label'])

In [38]:
# Hold out a fraction of the labelled rows for evaluation.
# The fixed seed makes the shuffle, and therefore the split, repeatable.
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=seed,
    test_size=test_size,
)

In [39]:
# Build the gradient-boosted classifier with the library's default hyperparameters.
# NOTE(review): the repr printed after fitting reports n_jobs=1 and the fit took
# roughly 5m49s of wall time; consider allowing more threads if that is too slow.
model = XGBClassifier()

In [41]:
%%time
# Train on the training split only; the %%time cell magic above reports how long this takes.
model.fit(X_train, y_train)

CPU times: user 5min 47s, sys: 1.46 s, total: 5min 48s
Wall time: 5min 49s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [42]:
# Predict a class for every held-out row, then round each prediction to a whole number.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

In [43]:
# Compare the held-out predictions with the true labels and report accuracy as a percentage.
accuracy = accuracy_score(y_test, predictions)
accuracy_message = "Accuracy: %.2f%%" % (accuracy * 100.0)
print(accuracy_message)

Accuracy: 93.33%


In [None]:
# Write predictions for the competition's unlabeled test.csv to a submission file.
#
# The hold-out `predictions` computed earlier come from a split of train.csv and
# are only useful for estimating accuracy. A submission needs one row per image
# in test.csv, so that file is loaded and scored here.
competition_df = pd.read_csv(in_path / 'test.csv')
competition_pred = model.predict(competition_df)

submission_df = pd.DataFrame({
    # ImageId is the 1-based row position within test.csv.
    'ImageId': range(1, len(competition_pred) + 1),
    'Label': competition_pred.astype(int),
})

# to_csv does not create missing directories, so make sure the target exists first.
out_path.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(out_path / 'submission.csv', index=False)