## Reading the Data

In [33]:
import pandas as pd
import numpy as np
import math

column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "class"]
train_data = pd.read_csv("train.txt", sep=",", header=None, names=column_names)
test_data = pd.read_csv("test.txt", sep=",", header=None, names=column_names[:-1])

In [34]:
print train_data.head()

   age          workclass  fnlwgt   education  education-num  \
0   34            Private  287315     HS-grad              9   
1   43        Federal-gov  145175   Bachelors             13   
2   45          Local-gov   33798     Masters             14   
3   23            Private  180497   Bachelors             13   
4   65   Self-emp-not-inc  145628        10th              6   

        marital-status          occupation    relationship    race      sex  \
0             Divorced   Machine-op-inspct   Not-in-family   White     Male   
1   Married-civ-spouse        Adm-clerical         Husband   White     Male   
2        Never-married      Prof-specialty   Not-in-family   White   Female   
3        Never-married        Tech-support       Own-child   Black   Female   
4   Married-civ-spouse        Craft-repair         Husband   White     Male   

   capital-gain  capital-loss  hours-per-week  native-country   class  
0             0             0              40   United-States   <=50

In [35]:
print test_data.head()

   age          workclass  fnlwgt      education  education-num  \
0   36          Local-gov  126569   Some-college             10   
1   26          State-gov   68346        Masters             14   
2   58            Private  225394        HS-grad              9   
3   60   Self-emp-not-inc   78913   Some-college             10   
4   20            Private  218215   Some-college             10   

        marital-status        occupation    relationship    race      sex  \
0   Married-civ-spouse   Protective-serv         Husband   White     Male   
1        Never-married    Prof-specialty   Not-in-family   White     Male   
2   Married-civ-spouse      Craft-repair         Husband   White     Male   
3   Married-civ-spouse   Exec-managerial         Husband   White     Male   
4        Never-married             Sales       Own-child   White   Female   

   capital-gain  capital-loss  hours-per-week  native-country  
0             0             0              40   United-States  
1     

## Preprocessing the data

In [36]:
# Compute mean of elements
def mean(elements):
    return float(sum(elements)) / float(len(elements))

# Standard deviation mean of elements
def standard_deviation(elements):
    m = mean(elements)
    var = sum([math.pow(e - m, 2) for e in elements]) / float(len(elements) - 1)
    return math.sqrt(var)
    
def z_score_normalization(data):
    for c in data.columns:
        data[c] = (data[c] - data[c].mean()) / data[c].std(ddof=0)
    return pd.DataFrame(data)

def cleaning(train_data, test_data):
    drop_column = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
    train_data = train_data.drop(columns = drop_column)
    test_data = test_data.drop(columns = drop_column)
    train_data["label"] = train_data["class"].apply(lambda x: -1 if x == " <=50K" else 1)
    train_data = train_data.drop(columns = "class")
    print train_data.head()
    return train_data, test_data

train_data, X_test = cleaning(train_data, test_data)
train_data.loc[:, train_data.columns != "label"] = z_score_normalization(train_data.loc[:, train_data.columns != "label"])
X_test = z_score_normalization(X_test)
print train_data.head()

   age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   34  287315              9             0             0              40   
1   43  145175             13             0             0              42   
2   45   33798             14             0             0              40   
3   23  180497             13             0             0              32   
4   65  145628              6             0             0              40   

   label  
0     -1  
1      1  
2     -1  
3     -1  
4     -1  
        age    fnlwgt  education-num  capital-gain  capital-loss  \
0 -0.338110  0.920799      -0.420255     -0.144172      -0.21644   
1  0.318397 -0.420698       1.138364     -0.144172      -0.21644   
2  0.464288 -1.471858       1.528019     -0.144172      -0.21644   
3 -1.140508 -0.087334       1.138364     -0.144172      -0.21644   
4  1.923194 -0.416422      -1.589219     -0.144172      -0.21644   

   hours-per-week  label  
0       -0.033567     -1  
1       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


## SVM 

In [80]:
class SVM:
    def __init__(self, v_len):
        self.a = pd.DataFrame(np.random.uniform(size=v_len), 
                              index=["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"])
        self.b = np.random.uniform()
        self.l = 1e-3
        
    def fit(self, X_train, y_train, eta):
        #a_n, b_n = self.a, self.b
        a_n = pd.DataFrame(np.random.uniform(size=6), 
                              index=["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"])
        b_n = np.random.uniform()
        k = np.random.randint(len(y_train))
        x_k, y_k = X_train.iloc[k], y_train.iloc[k]
        for i in range(30):
            if (y_k * (np.dot(a_n.T, x_k) + b_n)).item() >= 1:
                a_n[0] = a_n[0] - eta * self.l * a_n[0]
                b_n = b_n
            else:
                a_n[0] = a_n[0] - eta * (self.l * a_n[0] - y_k * x_k)
                b_n = b_n + eta * y_k
        self.a = a_n
        self.b = b_n
    
    def predict(self, x):
        return 1 if (np.dot(self.a.T, x) + self.b).item() >= 0 else -1

In [None]:
# + reshuflle + without replacement 
def train_val_split(data, size):
    train_data, val_data = np.split(data.sample(frac=1, replace=False), [size])
    X_train, y_train = np.split(train_data, [-1], axis=1)
    X_val, y_val = np.split(val_data, [-1], axis=1)
    y_train = np.squeeze(y_train) # for n x 1 vector
    y_val = np.squeeze(y_val) # for n x 1 vector
    return X_train, y_train, X_val, y_val

def compute_accuracy(svm, X_val, y_val, l):
    accuracy = 0
    for i, x in X_val.iterrows():
        if svm.predict(x) == y_val[i]:
            accuracy += 1
    accuracy /= float(len(X_val))
    #print "Accuracy is " + str(accuracy) + " with lamdba = " + str(l)
    return accuracy

def output_for_autograder(svm, X_test):
    with open("submission.txt", "w") as test_file:
        for i, x in X_test.iterrows():
            if svm.predict(x) == -1:
                test_file.write("<=50K\n")
            else:
                test_file.write(">50K\n")

def cross_validation(svm, train_data):
    max_accuracy_lambda = 0
    best_parameters = (0, 0)
    for l in [1e-3, 1e-2, 1e-1, 1]: # find best lambda hyperparameter
        X_train_lamdba, y_train_lamdba, X_val_lamdba, y_val_lamdba = train_val_split(train_data, int(.9*len(train_data)))
        svm.l = l
        max_accuracy_eta, best_eta = 0, 0
        n_season = 100 # Number of season at least 50
        for season in range(n_season):
            eta = 10 / float(season + 50) #  acc(20)=0.77
            #eta = 10 / float(season + 200)
            train_val_eta = pd.concat([X_train_lamdba, y_train_lamdba], axis=1)
            season_size = 300 # Size of season at least 300 try with 600
            while season_size >= 30:
                X_train_eta, y_train_eta, X_val_eta, y_val_eta = train_val_split(train_val_eta.sample(n=season_size), season_size-50)
                svm.fit(X_train_eta, y_train_eta, eta) 
                accuracy_eta = compute_accuracy(svm, X_val_eta, y_val_eta, l)
                if accuracy_eta > max_accuracy_eta:
                    max_accuracy_eta = accuracy_eta
                    best_eta = eta
                season_size -= 30
                # plot magnitude of the coefficient vector every 30 steps, for each value of the regularization constant.
                # plot held out accuracy every 30 steps, for each value of the regularization constant.
        svm.fit(X_train_lamdba, y_train_lamdba, best_eta)
        accuracy_lambda = compute_accuracy(svm, X_val_lamdba, y_val_lamdba, l)
        if accuracy_lambda > max_accuracy_lambda:
                max_accuracy_lambda = accuracy_lambda
                best_parameters = (l, best_eta)
    print "Max accuracy is " + str(max_accuracy_lambda)
    print "Best parameters is " + str(best_parameters)
    return best_parameters

x_len = 6
svm = SVM(x_len)
svm.l, eta = cross_validation(svm, train_data)

In [None]:
X_train, y_train = np.split(train_data, [-1], axis=1)
svm.fit(X_train, y_train, eta)
output_for_autograder(svm, X_test)