In [1]:
import numpy as np 
import pandas as pd
from pandas import DataFrame, Series
from numpy.random import randn, uniform, binomial, choice, normal




def sq_error( Yhat, Y ):
    return np.sum( (Yhat - Y)**2 ,axis = 0 ) / (len(Yhat) + 0.0)

def percent_correct( Yhat,Y ):
    return np.sum( Yhat == Y) / ( len(Yhat) + 0.0 ) 

def run_perceptron( X, Weight ):
    '''
    :param X: DataFrame of input. The first column has to be all 1s.
    :param Weight: [P by 1] array specifying the weight of the perceptron parameter.  
    :return: [N by 1] array of 0 or 1. 
    '''
    if type(X) == DataFrame:
        X = X.as_matrix()
    if type(Weight) == DataFrame:
        Weight = Weight.as_matrix()

    # Perceptron Algorithm simply adds values in x_n = [x_{n,1},... x_{n,p}] weighted by parameter P = [p_1,...p_p]
    # and classifies the outcome by sign.
    Y = X.dot(Weight)
    Y = np.sign(Y)

    return Y

def get_perceptron_parameter( X, Y, m):
    '''
    :param X: DataFrame of Input  The first column has to be all 1s.
    :param Y: DataFrame of Output
    :param m: number of maximum iteration.
    :return: best weight, least error, and the dataframe of the whole process
    '''
    if type(X) == DataFrame:
        X = X.as_matrix()
    if type(Y) == DataFrame:
        Y = Y.as_matrix()
    # The weights will be initialized to the parameters of the OLS.
    # That is to say, weight W = (XTX)^-1 X^T Y
    # So use it if the X has a full rank (meaning fewer columns than rows)
    # if not just initialize it to random numbers.
#   P = get_OLS_parameter(X,Y)

    # But for now I will just initialize to random numbers.
    W = randn( len(X[0])).reshape( len(X[0]),1 )             # dim(X) by 1 array.
    # or use W = randn( len(X[0]),1)  : it is the same code. 
    
    # implements the following
    # pick a misclassified point y_n: sign(y_n) /= y_n
    # update the weight vector: W = W + y_n * x_n
    i = 0
    error = pd.DataFrame( columns = ['error' , 'Weight' ])
    error.index.name = ['trial']

    while i < m:
        # while the iteration runs, we will keep adjusting the weights (W),
        # keep the error to see how the error changes.
        # here the error measure we choose to use squred error.
        # We can use the weight that gave out the least error.
        Yhat = run_perceptron(X, W)
        error.loc[i] = [sq_error( Yhat, Y ) , W]
        pick = get_misclassified(Yhat, Y)
        if pick == -1:
            break
        W =  W + (Y[pick]*X[pick]).reshape(len(X[pick]),1) 
        i = i + 1

    best_weight_index = error['error'].idxmin()
    best_weight = error.ix[best_weight_index]['Weight']
    least_error = error.ix[best_weight_index]['error']
    return best_weight, least_error, error

def get_misclassified(Yhat, Y):
    # if everything is classified correctly, return -1, else return random index.
    if np.sum( Yhat == Y , axis = 0) == len(Yhat) :
        return -1
    mis_index = []

    for index, item in enumerate( (Yhat == Y) ):
        if item == [False]:
            mis_index.append(index)

    pick = np.random.choice(mis_index)
    return pick




In [50]:
fws_distance = pd.read_table( '/Users/MK/GitHub/the_answer_is/data/answer/fetch_doc_ws_train_minimum_distance.csv',sep = ',')
w2v_distance = pd.read_table( '/Users/MK/GitHub/the_answer_is/data/answer/pure_ck12_word2vec_distance.csv',sep = ',')
pws_distance = pd.read_table( '/Users/MK/GitHub/the_answer_is/data/answer/pure_wsregression_distance.csv',sep = ',')

fws_distance = fws_distance[['A','B','C','D']]
fws_distance.columns = ['A_fws', 'B_fws', 'C_fws', 'D_fws']
w2v_distance = w2v_distance[['A','B','C','D']]
w2v_distance.columns = ['A_w2v', 'B_w2v', 'C_w2v', 'D_w2v']
pws_distance = pws_distance[['A','B','C','D']]
pws_distance.columns = ['A_pws', 'B_pws', 'C_pws', 'D_pws']

all_distance = pd.concat([fws_distance,w2v_distance,pws_distance], axis = 1)
all_distance['one_fws'] = Series( choice(np.array([1]), len(fws_distance),replace= True) )
all_distance['one_w2v'] = Series( choice(np.array([1]), len(fws_distance),replace= True) )
all_distance['one_pws'] = Series( choice(np.array([1]), len(fws_distance),replace= True) )

sort_by_method = ['one_fws','A_fws', 'B_fws', 'C_fws', 'D_fws',
                  'one_w2v','A_w2v', 'B_w2v', 'C_w2v', 'D_w2v',
                  'one_pws','A_pws', 'B_pws', 'C_pws', 'D_pws']
all_distance[sort_by_method]
all_distance


Unnamed: 0,A_fws,B_fws,C_fws,D_fws,A_w2v,B_w2v,C_w2v,D_w2v,A_pws,B_pws,C_pws,D_pws,one_fws,one_w2v,one_pws
0,0.279508,0.279508,0.279508,0.279508,0.564686,0.616991,0.520165,0.564374,0.976841,1.045751,1.092502,1.087918,1,1,1
1,0.408248,0.408248,0.408248,0.408248,-0.132967,0.233225,-0.022619,-0.127433,1.020601,1.365254,1.087889,1.166615,1,1,1
2,0.258199,0.258199,0.258199,0.258199,0.346278,0.279330,0.385951,0.367701,1.181938,1.135487,1.174840,1.070520,1,1,1
3,0.447214,0.447214,0.408248,0.408248,0.386480,0.223195,0.330731,0.341064,1.112654,1.214635,1.227911,1.156946,1,1,1
4,0.372678,0.390023,0.342727,0.382048,0.897886,0.961387,0.529724,0.661194,0.500014,0.491408,0.841121,0.868888,1,1,1
5,0.264575,0.256436,0.256436,0.256436,-0.053494,0.198497,-0.021857,0.092730,0.925487,0.957756,0.998260,1.029628,1,1,1
6,0.345796,0.349541,0.349541,0.366682,0.114603,0.091598,0.186565,0.144794,1.209461,1.092344,0.978564,1.152755,1,1,1
7,0.374351,0.357118,0.374351,0.374351,0.958979,0.860180,0.498439,0.784219,1.194246,1.217161,1.237481,1.023343,1,1,1
8,0.368567,0.352324,0.352324,0.352324,0.269973,0.143851,0.129806,0.213452,1.177182,1.265059,1.315379,1.123742,1,1,1
9,0.290081,0.282206,0.284858,0.277120,0.418099,0.401858,0.429403,0.412552,0.958489,0.967308,0.845631,0.841209,1,1,1


In [38]:
all_distance.sort_index(axis=1)

Unnamed: 0,A_fws,A_pws,A_w2v,B_fws,B_pws,B_w2v,C_fws,C_pws,C_w2v,D_fws,D_pws,D_w2v
0,0.279508,0.976841,0.564686,0.279508,1.045751,0.616991,0.279508,1.092502,0.520165,0.279508,1.087918,0.564374
1,0.408248,1.020601,-0.132967,0.408248,1.365254,0.233225,0.408248,1.087889,-0.022619,0.408248,1.166615,-0.127433
2,0.258199,1.181938,0.346278,0.258199,1.135487,0.279330,0.258199,1.174840,0.385951,0.258199,1.070520,0.367701
3,0.447214,1.112654,0.386480,0.447214,1.214635,0.223195,0.408248,1.227911,0.330731,0.408248,1.156946,0.341064
4,0.372678,0.500014,0.897886,0.390023,0.491408,0.961387,0.342727,0.841121,0.529724,0.382048,0.868888,0.661194
5,0.264575,0.925487,-0.053494,0.256436,0.957756,0.198497,0.256436,0.998260,-0.021857,0.256436,1.029628,0.092730
6,0.345796,1.209461,0.114603,0.349541,1.092344,0.091598,0.349541,0.978564,0.186565,0.366682,1.152755,0.144794
7,0.374351,1.194246,0.958979,0.357118,1.217161,0.860180,0.374351,1.237481,0.498439,0.374351,1.023343,0.784219
8,0.368567,1.177182,0.269973,0.352324,1.265059,0.143851,0.352324,1.315379,0.129806,0.352324,1.123742,0.213452
9,0.290081,0.958489,0.418099,0.282206,0.967308,0.401858,0.284858,0.845631,0.429403,0.277120,0.841209,0.412552


In [45]:
Series( choice(np.array([1]), len(fws_distance),replace= True) )

0       1
1       1
2       1
3       1
4       1
5       1
6       1
7       1
8       1
9       1
10      1
11      1
12      1
13      1
14      1
15      1
16      1
17      1
18      1
19      1
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      1
       ..
2470    1
2471    1
2472    1
2473    1
2474    1
2475    1
2476    1
2477    1
2478    1
2479    1
2480    1
2481    1
2482    1
2483    1
2484    1
2485    1
2486    1
2487    1
2488    1
2489    1
2490    1
2491    1
2492    1
2493    1
2494    1
2495    1
2496    1
2497    1
2498    1
2499    1
dtype: int64