# Training Model Examples

## K-Nearest Neighbors

In [25]:
import numpy as np
from sklearn import neighbors
from sklearn import metrics
from sklearn.utils import shuffle


X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)

X,Y = shuffle(X, Y)

Xtr = X[:10000,:]
Ytr = Y[:10000]

Xval = X[10000:20000]
Yval = Y[10000:20000]

knnN = [1, 2, 3, 5, 10, 15, 20]
for n in knnN:
    knnClassifier = neighbors.KNeighborsClassifier(n_neighbors=n, weights="distance", n_jobs=-1)
    knnClassifier.fit(Xtr, Ytr)

    Yhat = knnClassifier.predict_proba(Xval)[:,1]

    print("ROC :", metrics.roc_auc_score(Yval, Yhat), "Training error: ", 1 - knnClassifier.score(Xtr, Ytr), "Validation error: ", 1 - knnClassifier.score(Xval, Yval))

ROC : 0.585716254583 Training error:  0.0062 Validation error:  0.373
ROC : 0.607340444885 Training error:  0.0062 Validation error:  0.3725
ROC : 0.614868193721 Training error:  0.0059 Validation error:  0.3593
ROC : 0.623432685576 Training error:  0.0059 Validation error:  0.3492
ROC : 0.630496518647 Training error:  0.0059 Validation error:  0.3411
ROC : 0.631805066292 Training error:  0.0059 Validation error:  0.336
ROC : 0.63483055211 Training error:  0.0059 Validation error:  0.3324


## Random Forest

In [23]:
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier(
    n_estimators=500, min_samples_leaf=5, n_jobs=-1, oob_score=True)

rfc.fit(Xtr, Ytr)

rfcRoc = metrics.roc_auc_score(Yval, rfc.predict_proba(Xval)[:,1])

print("ROC :", rfcRoc)
print("Training Error: ", 1 - rfc.score(Xtr, Ytr))
print("Validation Error: ", 1 - rfc.score(Xval, Yval))

ROC : 0.696520420545
Training Error:  0.1295
Validation Error:  0.3047


## Neural network

In [21]:
from sklearn import neural_network

mlpc = neural_network.MLPClassifier(hidden_layer_sizes=(100,))
mlpc.fit(Xtr, Ytr)
mlpRoc = metrics.roc_auc_score(Yval, mlpc.predict_proba(Xval)[:,1])

print("ROC :", mlpRoc)
print("Training error: ", 1 - mlpc.score(Xtr, Ytr))
print("Validation error: ", 1 - mlpc.score(Xval, Yval))

ROC : 0.609586047413
Training error:  0.3834
Validation error:  0.3955


## Naive Bayes

In [22]:
from sklearn import naive_bayes

nbgc = naive_bayes.GaussianNB()
nbgc.fit(X, Y)
nbgRoc = metrics.roc_auc_score(Yval, nbgc.predict_proba(Xval)[:,1])

print("ROC :", rfcRoc)
print("Training Error: ", 1 - nbgc.score(Xtr, Ytr))
print("Validation Error: ", 1 - nbgc.score(Xval, Yval))

ROC : 0.695675438847
Training Error:  0.3885
Validation Error:  0.391
