In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from pandas import Series, DataFrame
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt

In [2]:
train_fn = './Downloads/letter-recognition.data'
data_col = ['letter','x-box','y-box','width','high','onpix','x-bar','y-bar',
            'x2bar','y2bar','xybar','x2ybr','xy2br','x-ege','xegvy','y-ege','yegvx']
X = pd.read_csv(train_fn, sep=',', header=None, names=data_col,
                     skiprows=None, na_values='?', keep_default_na=False, engine='python')

In [3]:
y = X.ix[:,'letter']
y

0        T
1        I
2        D
3        N
4        G
5        S
6        B
7        A
8        J
9        M
10       X
11       O
12       G
13       M
14       R
15       F
16       O
17       C
18       T
19       J
20       J
21       H
22       S
23       O
24       J
25       C
26       M
27       W
28       H
29       G
        ..
19970    F
19971    C
19972    V
19973    T
19974    N
19975    E
19976    L
19977    A
19978    K
19979    M
19980    R
19981    S
19982    Y
19983    V
19984    S
19985    M
19986    O
19987    L
19988    D
19989    P
19990    W
19991    O
19992    E
19993    J
19994    T
19995    D
19996    C
19997    T
19998    S
19999    A
Name: letter, dtype: object

In [4]:
cols = list(X)

cols.pop(cols.index('letter'))
cols

['x-box',
 'y-box',
 'width',
 'high',
 'onpix',
 'x-bar',
 'y-bar',
 'x2bar',
 'y2bar',
 'xybar',
 'x2ybr',
 'xy2br',
 'x-ege',
 'xegvy',
 'y-ege',
 'yegvx']

In [5]:
X = X.ix[:, cols]
X

Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
5,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7
6,4,2,5,4,4,8,7,6,6,7,6,6,2,8,7,10
7,1,1,3,2,1,8,2,2,2,8,2,8,1,6,2,7
8,2,2,4,4,2,10,6,2,6,12,4,8,1,6,1,7
9,11,15,13,9,7,13,2,6,2,12,1,9,8,1,1,8


In [6]:
X_train = X[:16000]
X_test = X[16000:20000]
y_train = y[:16000]
y_test = y[16000:20000]
y_train

0        T
1        I
2        D
3        N
4        G
5        S
6        B
7        A
8        J
9        M
10       X
11       O
12       G
13       M
14       R
15       F
16       O
17       C
18       T
19       J
20       J
21       H
22       S
23       O
24       J
25       C
26       M
27       W
28       H
29       G
        ..
15970    A
15971    J
15972    Z
15973    V
15974    S
15975    F
15976    D
15977    T
15978    K
15979    Y
15980    V
15981    X
15982    I
15983    A
15984    Z
15985    J
15986    Q
15987    E
15988    F
15989    Q
15990    X
15991    F
15992    J
15993    F
15994    P
15995    L
15996    R
15997    G
15998    E
15999    C
Name: letter, dtype: object

In [7]:
def models_N_weights(X, y, M, k):
    model = []
    model_weights = []
    training_errors = []
   
    N = X.shape[0]
    w = np.ones(N) / N

    for m in range(M):
        h = DecisionTreeClassifier(max_depth=1)
        h.fit(X, y, sample_weight=w)
        pred = h.predict(X)
        
        eps = w.dot(pred != y)
        alpha = (np.log((1 - eps)*(k-1)) - np.log(eps)) / 2
        for i in range(N):
            if(y[i] == pred[i]):
                w[i] = w[i]*np.exp(-alpha)
            else:
                w[i] = w[i]*np.exp(alpha)
        w = w / w.sum()
        
        model.append(h)
        model_weights.append(alpha)
    
    return [model, model_weights]


In [8]:
def predict_joined_models(X, model, model_weights, frame, k):
    
    #for j in range(len(model)):
    pred = model[k].predict(X)
    for i in range(X.shape[0]):
        t = frame.get_value(i, pred[i])
        frame.set_value(i, pred[i], t + model_weights[k])
            
    #获取每行最大数据的列名
    frame['max'] = frame.max(axis=1)
    frame['max_column'] = frame.T.idxmax()
    joined_model = frame.ix[:,'max_column']
 
    return joined_model

In [9]:
def error_func(y, y_hat):
    #correct_pred = []
    correct_pred = map(lambda t1, t2: t1 == t2, y, y_hat)
    correct_pred = list(correct_pred)
    '''
    for i in range(len(y)):
        if(y[i] == y_hat[i]):
            correct_pred.append(1)
        else:
            correct_pred.append(0) 
    '''
    Err = 1 - float(sum(correct_pred)/len(correct_pred))
    return Err

In [10]:
M = 10
k = 26
M_list = []
train_err_list = []
test_err_list = []
N1= X_train.shape[0]
frame1 = DataFrame(np.zeros([N1,26]),columns=['A','B','C','D','E','F','G','H','I','J','k','L','M',
                                           'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'])
N2= X_test.shape[0]
frame2 = DataFrame(np.zeros([N2,26]),columns=['A','B','C','D','E','F','G','H','I','J','k','L','M',
                                           'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'])
model_fit = models_N_weights(X_train, y_train, M+1, k)
for m in range(M):
    y_hat = predict_joined_models(X_train, model_fit[0], model_fit[1], frame1, m)
    err = error_func(y_train, y_hat)
    train_err_list.append(err)

    y_hat = predict_joined_models(X_test, model_fit[0], model_fit[1], frame2, m)
    err = error_func(y_test, y_hat)
    test_err_list.append(err)
    M_list.append(m)
y_hat

ValueError: could not convert string to float: T

In [None]:
plt.plot(M_list, train_err_list, c= 'red', linestyle='-')
plt.plot(M_list, test_err_list, c= 'green', linestyle='-')
plt.xlabel('number of weak learners')
plt.ylabel('Error')
plt.title('Error x Number of models')
plt.show()