In [1]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix
from scipy.stats import sem
import math
import numpy as np

In [2]:
def eval_clf(clf, X, y, K=3):
    ret = {}
    scores = cross_val_score(clf, X, y, cv = 3)
    print scores
    print "CV score: %.4f +/- %.4f" % (np.mean(scores), sem(scores))

# 1. Read files

In [3]:
X = pd.read_csv('train.csv', names=['id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'cls'])
del X['id']
y = X.cls
del X['cls']

In [4]:
X.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7
0,27895540.0,100907200.0,88290160.0,27747959.522896,13544895.520226,6032274.041509,136620300.0
1,104359900.0,128225600.0,144744800.0,47212288.258147,14638399.414053,4656835.145223,49239790.0
2,45058650.0,102377100.0,39706190.0,16765604.345182,9393101.093684,3112345.906015,89685460.0
3,63625080.0,87409530.0,30324500.0,31085802.835866,11920472.753832,6181495.023623,83055170.0
4,68615470.0,55630330.0,34400380.0,16321977.344475,8851569.270636,3760765.552838,78290240.0


In [5]:
Xtest = pd.read_csv('validate_and_test.csv', names=['id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7'])
del Xtest['id']
Xtest.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7
0,30762576.24575,145752900.0,37509273.423766,25363635.893043,10280755.654628,2677420.957194,57314260.0
1,68215444.555826,90324800.0,70397677.163411,35960556.882508,13622016.510508,2802449.029894,38072710.0
2,75473121.133032,87174980.0,76896678.903648,29592820.29618,22971007.653165,3679701.722529,51113740.0
3,38492881.276742,50639980.0,94466838.827649,24966820.937166,13873010.329408,7194749.98697,532093600.0
4,47811038.268233,38558000.0,22682356.302156,13186915.227945,13810196.684932,3840556.399298,50815870.0


In [6]:
X.shape, y.shape, Xtest.shape

((2025, 7), (2025,), (675, 7))

In [7]:
sub = pd.read_csv('example_submission.csv')
sub.head()

Unnamed: 0,Id,Label
0,2026,1
1,2027,1
2,2028,0
3,2029,1
4,2030,0


# 2. Normalization

In [9]:
from sklearn.preprocessing import normalize

In [10]:
XX = X.append(Xtest)
XX.shape

(2700, 7)

In [11]:
XX = normalize(XX)

In [12]:
Xn = XX[:X.shape[0],:]
Xtn = XX[X.shape[0]:,:]

In [13]:
Xn.shape, Xtn.shape

((2025, 7), (675, 7))

In [14]:
pd.DataFrame(Xn).head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.142334,0.514869,0.450492,0.141581,0.069112,0.030779,0.697092
1,0.452571,0.556067,0.627706,0.204742,0.063481,0.020195,0.213535
2,0.300322,0.682356,0.264647,0.111745,0.062606,0.020744,0.597765
3,0.442728,0.60823,0.21101,0.216307,0.082947,0.043013,0.577931
4,0.551587,0.447202,0.276538,0.131209,0.071156,0.030232,0.629361


In [15]:
pd.DataFrame(Xtn).head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.185041,0.876721,0.225623,0.152565,0.06184,0.016105,0.344752
1,0.474086,0.627742,0.489252,0.24992,0.094671,0.019477,0.264599
2,0.495097,0.57186,0.504435,0.194126,0.150688,0.024138,0.335301
3,0.070637,0.092927,0.173352,0.045816,0.025458,0.013203,0.976423
4,0.561562,0.452881,0.266414,0.154886,0.162207,0.045109,0.596855


# 3. Train clf

In [16]:
gb = AdaBoostClassifier()
eval_clf(gb,Xn,y)

[ 0.86538462  0.87111111  0.87537092]
CV score: 0.8706 +/- 0.0029


In [17]:
from sklearn.preprocessing import PolynomialFeatures

In [19]:
poly = PolynomialFeatures(2)

In [23]:
Xn.shape

(2025, 7)

In [26]:
pd.DataFrame(Xn).head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.142334,0.514869,0.450492,0.141581,0.069112,0.030779,0.697092
1,0.452571,0.556067,0.627706,0.204742,0.063481,0.020195,0.213535
2,0.300322,0.682356,0.264647,0.111745,0.062606,0.020744,0.597765
3,0.442728,0.60823,0.21101,0.216307,0.082947,0.043013,0.577931
4,0.551587,0.447202,0.276538,0.131209,0.071156,0.030232,0.629361


In [18]:
svm = SVC(kernel = 'rbf', gamma = 1, C=10)
eval_clf(svm,Xn,y)

[ 0.89201183  0.89925926  0.91097923]
CV score: 0.9008 +/- 0.0055


In [20]:
Xn2 = poly.fit_transform(Xn)

In [21]:
Xn2.shape

(2025, 36)

In [27]:
pd.DataFrame(Xn2).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1,0.142334,0.514869,0.450492,0.141581,0.069112,0.030779,0.697092,0.020259,0.073284,...,0.020045,0.009785,0.004358,0.098695,0.004776,0.002127,0.048177,0.000947,0.021456,0.485938
1,1,0.452571,0.556067,0.627706,0.204742,0.063481,0.020195,0.213535,0.20482,0.25166,...,0.041919,0.012997,0.004135,0.04372,0.00403,0.001282,0.013556,0.000408,0.004312,0.045597
2,1,0.300322,0.682356,0.264647,0.111745,0.062606,0.020744,0.597765,0.090193,0.204926,...,0.012487,0.006996,0.002318,0.066797,0.00392,0.001299,0.037424,0.00043,0.0124,0.357323
3,1,0.442728,0.60823,0.21101,0.216307,0.082947,0.043013,0.577931,0.196008,0.269281,...,0.046789,0.017942,0.009304,0.125011,0.00688,0.003568,0.047938,0.00185,0.024859,0.334004
4,1,0.551587,0.447202,0.276538,0.131209,0.071156,0.030232,0.629361,0.304249,0.246671,...,0.017216,0.009336,0.003967,0.082578,0.005063,0.002151,0.044783,0.000914,0.019027,0.396095


In [38]:
svm = SVC(kernel = 'rbf', gamma = 1, C=10)
eval_clf(svm,Xn2,y)

[ 0.89349112  0.90074074  0.90949555]
CV score: 0.9012 +/- 0.0046


In [43]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [44]:
eval_clf(rf,Xn2,y)

[ 0.87869822  0.89777778  0.91543027]
CV score: 0.8973 +/- 0.0106


In [52]:
from sklearn.lda import LDA
from sklearn.qda import QDA
qda = QDA()
lda = LDA()

In [46]:
Xlda = lda.fit_transform(Xn,y)

In [56]:
Xlda.shape

(2025, 2)

In [59]:
Xlad2 = np.concatenate((Xn, Xlda),axis=1)

In [60]:
Xlad2.shape

(2025, 9)

In [62]:
eval_clf(rf, Xlad2, y)

[ 0.88905325  0.90962963  0.90207715]
CV score: 0.9003 +/- 0.0060


In [74]:
boostRF = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=100),n_estimators=10,learning_rate=0.3)
eval_clf(boostRF,Xlad2,y)

[ 0.89349112  0.90814815  0.91691395]
CV score: 0.9062 +/- 0.0068


In [75]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
Xpca = pca.fit_transform(Xn)

In [76]:
Xpca.shape

(2025, 4)

In [80]:
eval_clf(boostRF,Xpca,y)

[ 0.88905325  0.88592593  0.89614243]
CV score: 0.8904 +/- 0.0030


In [55]:
eval_clf(qda,Xn2,y)

[ 0.88609467  0.88888889  0.90356083]
CV score: 0.8928 +/- 0.0054


In [81]:
eval_clf(lda, Xlda, y)

[ 0.88609467  0.89777778  0.91097923]
CV score: 0.8983 +/- 0.0072


In [91]:
eval_clf(svm, np.concatenate((Xpca, Xlda),axis=1), y)

[ 0.8964497   0.90074074  0.90801187]
CV score: 0.9017 +/- 0.0034


Using custom kernel: the Laplace kernel.

In [81]:
def laplace_ker(X, Y):
    # X.shape = n1*d, Y.shape = n2*d
    # return shape = n1*n2
    gamma = 1.0
    norm = [ [np.linalg.norm(X[i,:]-Y[j,:]) for j in xrange(Y.shape[0])] for i in xrange(X.shape[0]) ]
    norm = np.array(norm)
    return np.exp( -gamma*norm )

In [86]:
ll = laplace_ker(Xn[:80],Xn[:80,:])

In [87]:
ll.shape

(80, 80)

In [119]:
svm = SVC(kernel = laplace_ker, C=10)
eval_clf(svm,Xn,y)

ValueError: X should be a square kernel matrix

In [120]:
svm.fit(Xn,y)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel=<function laplace_ker at 0x7f92b65e21b8>, max_iter=-1,
  probability=False, random_state=None, shrinking=True, tol=0.001,
  verbose=False)

# 4. Predict on test data

In [121]:
yp = svm.predict(Xtn)

In [122]:
yp.shape, Xtest.shape

((675,), (675, 7))

In [123]:
sub.Label = map(int, yp)
sub.head()

Unnamed: 0,Id,Label
0,2026,2
1,2027,1
2,2028,1
3,2029,0
4,2030,0


In [124]:
sub.to_csv('sub-laplaceKer2.csv', index = False)

In [59]:
import tensorflow as tf

ImportError: cannot import name symbol_database