In [16]:
import pandas as pd
from sklearn.cross_validation import cross_val_score, KFold, StratifiedKFold, cross_val_predict
from sklearn.metrics import confusion_matrix
from scipy.stats import sem
import math
import numpy as np

In [22]:
def eval_clf(clf, X, y, K=3, return_res=False):
    ret = {}
    #cvK = KFold( X.shape[0], K, shuffle=True, random_state = 1 )
    cvK = StratifiedKFold( y, n_folds = K, shuffle=True, random_state = 1 )
    scores = cross_val_score(clf, X, y, cv = cvK)
    print scores
    print "CV score: %.4f +/- %.4f" % (np.mean(scores), sem(scores))
    print 'Confusion matrix:'
    y_pred = cross_val_predict(clf, X, y, cv=cvK)
    labels = np.unique( np.append(y,y_pred) )
    confmat = confusion_matrix(y, y_pred, labels=labels)
    #print confmat 
    ret['confmat'] = confmat;
    avg_precision = confmat.diagonal().sum()*1./confmat.sum()
    precision = {}
    recall = {}
    print '%-10s%s%10s' %( 'predicted→' ,''.join('%10s'%str(l) for l in labels), 'recall')
    print 'actual↓'
    for i, label in enumerate(labels):
        precision[label] = confmat[i,i]*1./confmat[:,i].sum()
        recall[label] = confmat[i,i]*1./confmat[i,:].sum()
        print '%-10s%s%10s' %(label, ''.join( '%10s'%str(cij) for cij in confmat[i,:] ) ,str(round(recall[label],4)))
    print '%-10s%s%10s'% ( 'precision',''.join('%10s'%str(round(precision[l],4)) for l in labels),  '[%.4f]'%avg_precision)
    ret['recall'] = recall;
    ret['precision'] = precision
    ret['avg_precision'] = avg_precision
    ret['label'] = label
    if len(labels)==2:
        tp,tn = recall[labels[0]], recall[labels[1]]
        fp,fn = 1-precision[labels[0]], 1-precision[labels[1]]
        mcc = (tp*tn-fp*fn)/math.sqrt( (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn) )
        print 'MCC: %.4f' % mcc
        ret['mcc'] = mcc
    if return_res: return ret

# Reading data

In [2]:
colnames = ['id'] + map(lambda x:'PHOG%d'%x, range(1,681) ) + map(lambda x:'1Dsig%d'%x, range(1,17) ) 
X = pd.read_csv("train.csv", names = colnames + ['cls'])
Xtest = pd.read_csv("test_validate.csv", names = colnames)
del X['id']
test_ids = Xtest['id']
del Xtest['id']
y = X['cls']
del X['cls']

In [3]:
X.head()

Unnamed: 0,PHOG1,PHOG2,PHOG3,PHOG4,PHOG5,PHOG6,PHOG7,PHOG8,PHOG9,PHOG10,...,1Dsig7,1Dsig8,1Dsig9,1Dsig10,1Dsig11,1Dsig12,1Dsig13,1Dsig14,1Dsig15,1Dsig16
0,0.022262,0.022726,0.03636,0.04441,0.02383,0.024371,0.047929,0.03842,0.009257,0.005418,...,0.10078,0.069767,0.046512,0.054264,0.054264,0.054264,0.077519,0.069767,0.069767,0.054264
1,0.01261,0.01245,0.071335,0.037531,0.007311,0.009596,0.066724,0.043898,0.005685,0.003667,...,0.065041,0.065041,0.065041,0.056911,0.073171,0.065041,0.065041,0.04878,0.065041,0.056911
2,0.025992,0.039643,0.038228,0.031641,0.021685,0.028254,0.039459,0.03625,0.008339,0.009046,...,0.06422,0.055046,0.06422,0.06422,0.073394,0.055046,0.045872,0.055046,0.073394,0.06422
3,0.032523,0.032168,0.035243,0.030806,0.031936,0.031387,0.037663,0.024821,0.009769,0.011665,...,0.076923,0.042735,0.051282,0.076923,0.068376,0.059829,0.059829,0.076923,0.042735,0.059829
4,0.028967,0.026514,0.027574,0.041115,0.033244,0.031683,0.029524,0.039006,0.010779,0.004814,...,0.055556,0.047619,0.071429,0.087302,0.031746,0.071429,0.063492,0.079365,0.063492,0.047619


In [4]:
X.shape, Xtest.shape

((890, 696), (382, 696))

In [5]:
y.value_counts()

2    623
1    267
dtype: int64

# Feature engineering

In [6]:
PHOGcols = range(680)
Sig1Dcols = range(680, 696)

Information regarding the meaning of 1Dsig and PHOG can be found at: https://ml2.inf.ethz.ch/courses/ml/tutorials/iml_tutorial_10.pdf

In [112]:
Xphog = X.ix[:, PHOGcols]
Xphog_test = Xtest.ix[:, PHOGcols]
Xsig1D = X.ix[:, Sig1Dcols]
Xsig1D_test = Xtest.ix[:, Sig1Dcols]

In [8]:
Xphog.head()

Unnamed: 0,PHOG1,PHOG2,PHOG3,PHOG4,PHOG5,PHOG6,PHOG7,PHOG8,PHOG9,PHOG10,...,PHOG671,PHOG672,PHOG673,PHOG674,PHOG675,PHOG676,PHOG677,PHOG678,PHOG679,PHOG680
0,0.022262,0.022726,0.03636,0.04441,0.02383,0.024371,0.047929,0.03842,0.009257,0.005418,...,0.0,0.0,0.0,0.000671,0.000987,0.000728,0.000174,0.000173,0.000268,0.0
1,0.01261,0.01245,0.071335,0.037531,0.007311,0.009596,0.066724,0.043898,0.005685,0.003667,...,0.0,0.0,0.000566,0.0,0.0,0.000203,0.0,0.000567,0.001306,0.000202
2,0.025992,0.039643,0.038228,0.031641,0.021685,0.028254,0.039459,0.03625,0.008339,0.009046,...,0.000581,0.000408,0.000163,0.0,0.000172,0.001328,0.0,0.0,0.0,0.000146
3,0.032523,0.032168,0.035243,0.030806,0.031936,0.031387,0.037663,0.024821,0.009769,0.011665,...,0.000143,0.000436,0.001412,0.002262,0.001477,0.000479,0.0,0.0,0.0,0.0
4,0.028967,0.026514,0.027574,0.041115,0.033244,0.031683,0.029524,0.039006,0.010779,0.004814,...,0.0,0.0,0.000202,0.000212,0.001074,5.5e-05,0.0,0.0,0.0,0.0


In [9]:
Xsig1D.head()

Unnamed: 0,1Dsig1,1Dsig2,1Dsig3,1Dsig4,1Dsig5,1Dsig6,1Dsig7,1Dsig8,1Dsig9,1Dsig10,1Dsig11,1Dsig12,1Dsig13,1Dsig14,1Dsig15,1Dsig16
0,0.062016,0.054264,0.046512,0.077519,0.054264,0.054264,0.10078,0.069767,0.046512,0.054264,0.054264,0.054264,0.077519,0.069767,0.069767,0.054264
1,0.073171,0.056911,0.065041,0.056911,0.065041,0.056911,0.065041,0.065041,0.065041,0.056911,0.073171,0.065041,0.065041,0.04878,0.065041,0.056911
2,0.082569,0.055046,0.055046,0.055046,0.06422,0.073394,0.06422,0.055046,0.06422,0.06422,0.073394,0.055046,0.045872,0.055046,0.073394,0.06422
3,0.076923,0.068376,0.068376,0.059829,0.059829,0.051282,0.076923,0.042735,0.051282,0.076923,0.068376,0.059829,0.059829,0.076923,0.042735,0.059829
4,0.071429,0.071429,0.071429,0.055556,0.063492,0.047619,0.055556,0.047619,0.071429,0.087302,0.031746,0.071429,0.063492,0.079365,0.063492,0.047619


### 1D signature

In [113]:
X['1Dsig-var'] = Xsig1D.var(axis = 1)
X['1Dsig-mean'] = Xsig1D.mean(axis = 1)
Xtest['1Dsig-var'] = Xsig1D_test.var(axis = 1)
Xtest['1Dsig-mean'] = Xsig1D_test.mean(axis = 1)

In [11]:
X.head()

Unnamed: 0,PHOG1,PHOG2,PHOG3,PHOG4,PHOG5,PHOG6,PHOG7,PHOG8,PHOG9,PHOG10,...,1Dsig9,1Dsig10,1Dsig11,1Dsig12,1Dsig13,1Dsig14,1Dsig15,1Dsig16,1Dsig-var,1Dsig-mean
0,0.022262,0.022726,0.03636,0.04441,0.02383,0.024371,0.047929,0.03842,0.009257,0.005418,...,0.046512,0.054264,0.054264,0.054264,0.077519,0.069767,0.069767,0.054264,0.000204,0.0625
1,0.01261,0.01245,0.071335,0.037531,0.007311,0.009596,0.066724,0.043898,0.005685,0.003667,...,0.065041,0.056911,0.073171,0.065041,0.065041,0.04878,0.065041,0.056911,4.2e-05,0.0625
2,0.025992,0.039643,0.038228,0.031641,0.021685,0.028254,0.039459,0.03625,0.008339,0.009046,...,0.06422,0.06422,0.073394,0.055046,0.045872,0.055046,0.073394,0.06422,9.2e-05,0.0625
3,0.032523,0.032168,0.035243,0.030806,0.031936,0.031387,0.037663,0.024821,0.009769,0.011665,...,0.051282,0.076923,0.068376,0.059829,0.059829,0.076923,0.042735,0.059829,0.000134,0.0625
4,0.028967,0.026514,0.027574,0.041115,0.033244,0.031683,0.029524,0.039006,0.010779,0.004814,...,0.071429,0.087302,0.031746,0.071429,0.063492,0.079365,0.063492,0.047619,0.000201,0.0625


### PHOG

In the description, they said they used "a level 3 pyramid", and the PHOG have 860 columns (bins?). So...? wtf..

http://blog.csdn.net/smartempire/article/details/24038355

# Normalization?

Not sure whether normalization will help or the contrary...

In [56]:
from sklearn.preprocessing import normalize

In [57]:
XX = X.append(Xtest)
XX.shape

(1272, 696)

In [58]:
XX = normalize(XX)

In [59]:
Xn = XX[:X.shape[0],:]
Xtn = XX[X.shape[0]:,:]

In [60]:
Xn.shape, Xtn.shape

((890, 696), (382, 696))

# Dimension reduction

In [35]:
from sklearn.decomposition import PCA

In [69]:
Xpca50 = PCA(n_components=50).fit_transform(X)

In [70]:
Xpca100 = PCA(n_components=100).fit_transform(X)
Xpca200 = PCA(n_components=200).fit_transform(X)

# Train clf and cross validate

Note: the classes are not balanced, we need to re-weight in the classfiers (for example, use `class_weight='balanced'`), and as we have more features than data, it's better to use L1 regularization. 

In [17]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model.logistic import LogisticRegression

In [104]:
eval_clf(LinearSVC(C=100, penalty='l1', dual=False ,class_weight='balanced', random_state=0),X,y, )

[ 0.72390572  0.73063973  0.73310811]
CV score: 0.7292 +/- 0.0027

Confusion matrix:
predicted→         1         2    recall
actual↓
1                163       104    0.6105
2                137       486    0.7801
precision     0.5433    0.8237  [0.7292]
MCC: 0.3971


In [89]:
eval_clf(LinearSVC(C=500, class_weight='balanced', random_state=0),Xpca50,y)

[ 0.73063973  0.74074074  0.77027027]
CV score: 0.7472 +/- 0.0119

Confusion matrix:
predicted→         1         2    recall
actual↓
1                 94       173    0.3521
2                 52       571    0.9165
precision     0.6438    0.7675  [0.7472]
MCC: 0.3083


In [82]:
eval_clf(LinearSVC(C=600, class_weight='balanced', random_state=0),Xpca100,y, )

[ 0.75757576  0.76094276  0.79391892]
CV score: 0.7708 +/- 0.0116

Confusion matrix:
predicted→         1         2    recall
actual↓
1                 98       169     0.367
2                 35       588    0.9438
precision     0.7368    0.7768  [0.7708]
MCC: 0.3974


In [102]:
eval_clf(LinearSVC(C=1000, penalty='l1', dual=False, class_weight='balanced', random_state=0),Xpca100,y, )

[ 0.72727273  0.71043771  0.76689189]
CV score: 0.7349 +/- 0.0167

Confusion matrix:
predicted→         1         2    recall
actual↓
1                181        86    0.6779
2                150       473    0.7592
precision     0.5468    0.8462  [0.7348]
MCC: 0.4360


In [111]:
eval_clf(LinearSVC(C=30, class_weight='balanced', penalty='l1', dual=False, random_state=0),Xpca200,y, )

[ 0.7037037   0.71043771  0.78716216]
CV score: 0.7338 +/- 0.0268

Confusion matrix:
predicted→         1         2    recall
actual↓
1                177        90    0.6629
2                147       476     0.764
precision     0.5463     0.841  [0.7337]
MCC: 0.4276


In [88]:
eval_clf(LogisticRegression(penalty='l1', C=1000, class_weight='balanced', random_state=0), X, y)

[ 0.6969697   0.73737374  0.72972973]
CV score: 0.7214 +/- 0.0124

Confusion matrix:
predicted→         1         2    recall
actual↓
1                160       107    0.5993
2                141       482    0.7737
precision     0.5316    0.8183  [0.7213]
MCC: 0.3806


In [87]:
eval_clf(SVC(class_weight='balanced', C=1, random_state=0), Xpca50, y)

[ 0.2996633   0.2996633   0.69932432]
CV score: 0.4329 +/- 0.1332

Confusion matrix:
predicted→         1         2    recall
actual↓
1                178        89    0.6667
2                416       207    0.3323
precision     0.2997    0.6993  [0.4326]
MCC: 0.0118


# Make predictions

For the moment the best result is linearSVC applied on Xpca100, we use it to make a prediciton. 

In [96]:
clf = LinearSVC(C=600, class_weight='balanced', random_state=0)
clf.fit(Xpca100, y)

LinearSVC(C=600, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [115]:
pca = PCA(n_components=100)
pca.fit(X)
Xtest_pca100 = pca.transform(Xtest)

In [116]:
y_pred = clf.predict(Xtest_pca100)

In [118]:
sub = pd.read_csv("examle_solution_handin.csv")

In [119]:
sub['Label'] = y_pred

In [120]:
sub.head()

Unnamed: 0,Id,Label
0,665,2
1,12,2
2,1074,2
3,323,1
4,892,1


In [122]:
sub.to_csv("submissions/1205_linSVC.csv", index=False)