## Reading the Data

In [189]:
import pandas as pd
import numpy as np
import math

column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "class"]
train_data = pd.read_csv("train.txt", sep=",", header=None, names=column_names)
test_data = pd.read_csv("test.txt", sep=",", header=None, names=column_names[:-1])

In [163]:
print train_data.head()

   age          workclass  fnlwgt   education  education-num  \
0   34            Private  287315     HS-grad              9   
1   43        Federal-gov  145175   Bachelors             13   
2   45          Local-gov   33798     Masters             14   
3   23            Private  180497   Bachelors             13   
4   65   Self-emp-not-inc  145628        10th              6   

        marital-status          occupation    relationship    race      sex  \
0             Divorced   Machine-op-inspct   Not-in-family   White     Male   
1   Married-civ-spouse        Adm-clerical         Husband   White     Male   
2        Never-married      Prof-specialty   Not-in-family   White   Female   
3        Never-married        Tech-support       Own-child   Black   Female   
4   Married-civ-spouse        Craft-repair         Husband   White     Male   

   capital-gain  capital-loss  hours-per-week  native-country   class  
0             0             0              40   United-States   <=50

In [164]:
print test_data.head()

   age          workclass  fnlwgt      education  education-num  \
0   36          Local-gov  126569   Some-college             10   
1   26          State-gov   68346        Masters             14   
2   58            Private  225394        HS-grad              9   
3   60   Self-emp-not-inc   78913   Some-college             10   
4   20            Private  218215   Some-college             10   

        marital-status        occupation    relationship    race      sex  \
0   Married-civ-spouse   Protective-serv         Husband   White     Male   
1        Never-married    Prof-specialty   Not-in-family   White     Male   
2   Married-civ-spouse      Craft-repair         Husband   White     Male   
3   Married-civ-spouse   Exec-managerial         Husband   White     Male   
4        Never-married             Sales       Own-child   White   Female   

   capital-gain  capital-loss  hours-per-week  native-country  
0             0             0              40   United-States  
1     

## Preprocessing the data

In [190]:
# Compute mean of elements
def mean(elements):
    return float(sum(elements)) / float(len(elements))

# Standard deviation mean of elements
def standard_deviation(elements):
    m = mean(elements)
    var = sum([math.pow(e - m, 2) for e in elements]) / float(len(elements) - 1)
    return math.sqrt(var)
    
def z_score_normalization(data):
    for c in data.columns:
        data[c] = (data[c] - data[c].mean()) / data[c].std(ddof=0)
    return data

def cleaning(train_data, test_data):
    drop_column = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
    train_data = train_data.drop(columns = drop_column)
    test_data = test_data.drop(columns = drop_column)
    train_data["label"] = train_data["class"].apply(lambda x: -1 if x == " <=50K" else 1)
    train_data = train_data.drop(columns = "class")
    print train_data.head()
    return train_data, test_data

train_data, X_test = cleaning(train_data, test_data)
train_data.loc[:, train_data.columns != "label"] = z_score_normalization(train_data.loc[:, train_data.columns != "label"])
X_test = z_score_normalization(X_test)
print train_data.head()

   age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   34  287315              9             0             0              40   
1   43  145175             13             0             0              42   
2   45   33798             14             0             0              40   
3   23  180497             13             0             0              32   
4   65  145628              6             0             0              40   

   label  
0     -1  
1      1  
2     -1  
3     -1  
4     -1  
        age    fnlwgt  education-num  capital-gain  capital-loss  \
0 -0.338110  0.920799      -0.420255     -0.144172      -0.21644   
1  0.318397 -0.420698       1.138364     -0.144172      -0.21644   
2  0.464288 -1.471858       1.528019     -0.144172      -0.21644   
3 -1.140508 -0.087334       1.138364     -0.144172      -0.21644   
4  1.923194 -0.416422      -1.589219     -0.144172      -0.21644   

   hours-per-week  label  
0       -0.033567     -1  
1       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


## SVM 

In [222]:
def train_val_split(data):
    train_data, val_data = np.split(data.sample(frac=1), [int(.9*len(data))])
    X_train, y_train = np.split(train_data, [-1], axis=1)
    X_val, y_val = np.split(test_data, [-1], axis=1)
    y_train = np.squeeze(y_train) # for n x 1 vector
    y_val = np.squeeze(y_val) # for n x 1 vector
    return X_train, y_train, X_val, y_val


#def gradient_descent(l, step_size, x_k, y_k, n_steps): ################### Verify!!!!!
#    a = pd.DataFrame(np.random.randint(1, len(x_k)))
#    b = pd.random.randint(1)
#    a_prev, b_prev = a, b
#    a_next, b_next = [0] * len(x), 0
 #   n_steps = 300
  #  for i in range(n_steps):
   #     if y_k * (a.T.dot(x_k) + b) >= 1:
    #        a_next = a_prev -  step_size * a.apply(lambda x: x * l)
     #       b_next = b_prev
      #  else:
       #     a_next = a_prev -  step_size * (a.apply(lambda x: x * l) - y_k * x_k)
        #    b_next = b_prev + step_size * y_k
 #   return a_next, b_next

def train_SVM(X_train, y_train, l):
    step_size = 1
    a = pd.DataFrame(np.random.rand(1, len(X_train.iloc[0])))
    b = np.random.randint(1)
    a_prev, b_prev = a, b
    a_next, b_next = pd.DataFrame([0] * len(X_train.iloc[0])), 0
    k = np.random.randint(len(y_train))
    x_k = X_train.iloc[k]
    y_k = y_train.iloc[k]
    print type(x_k)
    print type(a)
    for i in range(300):
        if y_k * (a.T.dot(x_k) + b) >= 1:
            a_next = a_prev -  step_size * a.apply(lambda x: x * l)
            b_next = b_prev
        else:
            a_next = a_prev -  step_size * (a.apply(lambda x: x * l) - y_k * x_k)
            b_next = b_prev + step_size * y_k
    return a_next, b_next

def cross_validation(train_data):
    # compute eta first
    
    for l in [1e-3, 1e-2, 1e-1, 1]: # find bestlamdba hyperparameter
        X_train_lamdba, y_train_lamdba, X_val_lamdba, y_val_lamdba = train_val_split(train_data)
        for epoch in range(50): # Number of epochs => TBD
            train_val_data = pd.concat([X_train_lamdba, y_train_lamdba], axis=1)
            X_val_stepsize, y_val_stepsize = np.split(train_val_data.sample(n=50), [-1], axis=1)
            a, b = train_SVM(X_train, y_train, l)
            #split data
            #do cross_validation to find the best step size
cross_validation(train_data)

AttributeError: 'numpy.int64' object has no attribute 'to_frame'