In [16]:
import numpy as np
import pandas as pd

In [17]:
df = pd.read_csv('hockey_players.csv', sep=',')
df = df[df.position != 'G'].reset_index()
df.position.replace('D', 0, inplace=True)
df.position.replace('F', 1, inplace=True)
df.head()

Unnamed: 0,index,year,country,no,name,position,side,height,weight,birth,club,age,cohort,bmi
0,0,2001,RUS,10,tverdovsky oleg,0,L,185,84,1976-05-18,anaheim mighty ducks,24.952772,1976,24.543462
1,1,2001,RUS,2,vichnevsky vitali,0,L,188,86,1980-03-18,anaheim mighty ducks,21.119781,1980,24.332277
2,2,2001,RUS,26,petrochinin evgeni,0,L,182,95,1976-02-07,severstal cherepovetal,25.229295,1976,28.680111
3,3,2001,RUS,28,zhdan alexander,0,R,178,85,1971-08-28,ak bars kazan,29.675565,1971,26.827421
4,4,2001,RUS,32,orekhovsky oleg,0,R,175,88,1977-11-03,dynamo moscow,23.49076,1977,28.734694


In [19]:
df = df[['name', 'height', 'weight', 'bmi', 'position']]
df.head()

Unnamed: 0,name,height,weight,bmi,position
0,tverdovsky oleg,185,84,24.543462,0
1,vichnevsky vitali,188,86,24.332277,0
2,petrochinin evgeni,182,95,28.680111,0
3,zhdan alexander,178,85,26.827421,0
4,orekhovsky oleg,175,88,28.734694,0


In [20]:
from sklearn import preprocessing
df['bmisq'] = df.bmi * df.bmi
data = df[['height', 'weight', 'bmi', 'bmisq']]
target = df.position
data_scaled = preprocessing.scale(data)
data = pd.DataFrame(data_scaled)

In [21]:
from sklearn.linear_model import LogisticRegression
train_data = data_scaled[:-1]
train_target = df.position[:-1]
train_data

array([[ 0.21789138, -0.59310387, -1.06393386, -1.05775987],
       [ 0.77659587, -0.3024195 , -1.21346828, -1.19771797],
       [-0.3408131 ,  1.00566015,  1.86511116,  1.92757872],
       ..., 
       [-1.08575242, -0.88378823, -0.11716498, -0.14355176],
       [-0.71328276, -0.15707732,  0.57057203,  0.55093446],
       [-0.15457828,  1.00566015,  1.64377676,  1.68578218]])

In [22]:
from sklearn.cross_validation import KFold
kf = KFold(len(data.index), n_folds=5, shuffle=True, random_state=1)
kf

sklearn.cross_validation.KFold(n=5530, n_folds=5, shuffle=True, random_state=1)

In [23]:
c = 0.001

coefs = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
res = []
for c in coefs:
    scores = []
    for train_index, test_index in kf:
        clf = LogisticRegression(penalty='l2', C=c, random_state=1)
        x_train, x_test = data.ix[train_index], data.ix[test_index]
        y_train, y_test = target.ix[train_index], target.ix[test_index]
        clf.fit(x_train, y_train)    
        scores.append(clf.score(x_test, y_test))
    print(c, np.mean(scores))

0.0001 0.645388788427
0.0005 0.650090415913
0.001 0.651717902351
0.005 0.650452079566
0.01 0.649909584087
0.1 0.649005424955
1.0 0.648643761302
10.0 0.649005424955
100.0 0.649186256781
1000.0 0.649186256781
10000.0 0.649186256781


In [8]:
from sklearn import cross_validation
x_train, x_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=0.2, random_state=0)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(4424, 3) (4424,)
(1106, 3) (1106,)


In [None]:
from sklearn import svm
c = 0.001

coefs = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
res = []
for c in coefs:
    scores = []
    for train_index, test_index in kf:
        clf = svm.SVC(kernel='linear', C=c)
        x_train, x_test = data.ix[train_index], data.ix[test_index]
        y_train, y_test = target.ix[train_index], target.ix[test_index]
        clf.fit(x_train, y_train)    
        scores.append(clf.score(x_test, y_test))
    print(c, np.mean(scores))

0.0001 0.632007233273
0.001 0.632007233273
0.01 0.632007233273
0.1 0.632007233273
1.0 0.632368896926
10.0

In [24]:
from sklearn import neighbors
c = 0.001

n_neighbors = [10, 100, 500, 1000, 2000]
res = []
for n in n_neighbors:
    scores = []
    weights = 'distance'    
    for train_index, test_index in kf:        
        clf = neighbors.KNeighborsClassifier(n, weights, algorithm='auto')
        x_train, x_test = data.ix[train_index], data.ix[test_index]
        y_train, y_test = target.ix[train_index], target.ix[test_index]
        clf.fit(x_train, y_train)    
        scores.append(clf.score(x_test, y_test))
    print(n, np.mean(scores))

10 0.641952983725
100 0.652622061483
500 0.652079566004
1000 0.651717902351
2000 0.64972875226
