In [166]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter

def remapOnFeatureMap(data):
    """Encode rows as a dense matrix using the module-level ``featureMap``.

    Source columns 0 and 7 are treated as numeric: the value is divided by 50
    and stored in output column 0 / 7.  Every other column is categorical: when
    ``(column, value)`` is a key of ``featureMap`` the output column
    ``featureMap[(column, value)]`` is set to 1; unseen pairs are ignored.

    Args:
        data: sequence of rows, each a sequence of string fields.

    Returns:
        ``np.ndarray`` of shape ``(len(data), len(featureMap))``.
    """
    binRemapped = np.zeros((len(data), len(featureMap)))
    for i, row in enumerate(data):
        for j, d in enumerate(row):
            if j in (0, 7):
                # Keyed on the source column, not on the position inside a
                # filtered list: the earlier two-pass version shifted positions
                # whenever an unseen categorical value was skipped, and the
                # hours value then ended up being used as a column index.
                binRemapped[i][j] = float(d)/50
            elif (j, d) in featureMap:
                binRemapped[i][featureMap[(j, d)]] = 1

    return binRemapped

def dist(trainData, testData, ord):
    """Rank the training rows by distance to each test row.

    Args:
        trainData: 2-D array of training rows.
        testData: iterable of rows that can be subtracted from ``trainData``.
        ord: norm order forwarded to ``np.linalg.norm``.

    Returns:
        List holding, for every test row, the array of training-row indices
        sorted by ascending distance.
    """
    def rankNeighbours(query):
        offsets = trainData - query
        return np.argsort(np.linalg.norm(offsets, ord, axis = 1))

    return [rankNeighbours(query) for query in testData]

def knn(distances, target, k):
    """Predict a label for each query by majority vote of its k nearest rows.

    Args:
        distances: iterable of index sequences, nearest neighbour first.
        target: labels indexed by training-row index.
        k: number of leading neighbours that vote.

    Returns:
        List with one predicted label per entry of ``distances``.
    """
    preds = []
    for ranking in distances:
        votes = Counter()
        for neighbour in ranking[:k]:
            votes[target[neighbour]] += 1
        # most_common() lists (label, count) pairs by descending count.
        winner, _count = votes.most_common()[0]
        preds.append(winner)
    return preds
    
def err(preds, target):
    """Return the percentage of positions where ``preds`` and ``target`` differ."""
    mismatches = 0
    for idx in range(len(preds)):
        if preds[idx] != target[idx]:
            mismatches += 1
    return mismatches/len(preds)*100

def pos(preds):
    """Return the percentage of predictions that are equal to 1."""
    positives = sum(preds[idx] == 1 for idx in range(len(preds)))
    return (positives/len(preds))*100

if __name__ == '__main__':
    def _readLabeledRows(path):
        """Read ``path`` once; return (feature rows, labels) with label 1 for '>50K', else 0."""
        # The context manager closes the handle. Previously every file was
        # opened twice (features, then labels) and neither handle was closed.
        with open(path) as handle:
            records = [s.strip().split(',') for s in handle.readlines()]
        rows = [fields[:-1] for fields in records]
        labels = [1 if fields[-1].strip()=='>50K' else 0 for fields in records]
        return rows, labels

    train, trainY = _readLabeledRows('income.train.txt.5k')
    dev, devY = _readLabeledRows('income.dev.txt')
#     test = [s.strip().split(',')[:-1] for s in open('income.test.blind').readlines()]

    # featureMap is read as a global by remapOnFeatureMap.
    featureMap = defaultdict(int)
    for val in train:
        for j, d in enumerate(val):
            feature = (j,d)
            if j not in [0,7]:
                if feature not in featureMap :
                    featureMap[feature] = len(featureMap)
            else:
                # Placeholder keys for the numeric columns 0 and 7. They count
                # toward len(featureMap); the stored value 1 is never looked up
                # by remapOnFeatureMap, which writes numeric values by column.
                featureMap[(j,0)] = 1

    print('number of dimensions: {}'.format(len(featureMap)))
    binTrain = remapOnFeatureMap(train)
    binDev = remapOnFeatureMap(dev)
#     binTest = remapOnFeatureMap(test)

    # Neighbour rankings are computed once (L1 norm) and reused for every k.
    devDist = dist(binTrain, binDev,1)
    trainDist = dist(binTrain, binTrain,1)

    kVals = [1,3,5,7,9,99,999,1999,2999,3999,4999,9999]
    for k in kVals:
        devPreds = knn(devDist, trainY, k)
        trainPreds = knn(trainDist, trainY, k)
        print('k: {} train_err: {}% (+: {}) dev_err: {}% (+: {})'.format(k,err(trainPreds, trainY), pos(trainPreds), err(devPreds, devY), pos(devPreds)))

number of dimensions: 92
k: 10 train_err: 12.64% (+: 23.22) dev_err: 16.400000000000002% (+: 21.6)


In [12]:
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

def remapOnFeatureMap(data):
    """One-hot encode rows with the module-level ``featureMap``.

    For every ``(column, value)`` pair of a row that is a key of ``featureMap``
    the mapped output column is set to 1; pairs that are not in the map are
    ignored.  The last output column is always set to 1 (bias term).

    Returns:
        ``np.ndarray`` of shape ``(len(data), len(featureMap))``.
    """
    encoded = np.zeros((len(data), len(featureMap)))
    for rowIdx, record in enumerate(data):
        for colIdx, value in enumerate(record):
            key = (colIdx, value)
            if key in featureMap:
                encoded[rowIdx][featureMap[key]] = 1
        # Constant input for the bias weight stored in the last dimension.
        encoded[rowIdx][-1] = 1

    return encoded

def perceptronTrain(epochs, data, target):
    """Train a vanilla perceptron and record per-epoch statistics.

    Args:
        epochs: number of passes over ``data``.
        data: iterable of feature rows whose length is ``len(featureMap)``
            (the weight vector is sized from the module-level ``featureMap``).
        target: labels indexed like ``data``; the update rule
            ``w += target[i]*row`` is applied when ``target[i]*(w.row) <= 0``.

    Returns:
        ``defaultdict`` mapping the 0-based epoch index to
        ``[number of updates in that epoch, weights at the end of that epoch]``.
    """
    epochData = defaultdict(list)
    w = np.zeros((len(featureMap)))

    for iteration in range(epochs):
        u = 0
        for i, row in enumerate(data):
            # Same conversion as perceptronTrainWithFeatureMap, so plain lists
            # are not multiplied as Python sequences.
            row = np.asarray(row)
            a = np.dot(w, row)
            if target[i]*a <= 0:
                u += 1
                w += target[i]*row
        epochData[iteration].append(u)
        # Store a snapshot: ``w`` is updated in place, so appending the array
        # itself made every epoch entry alias the final weights.
        epochData[iteration].append(w.copy())
    return epochData

def perceptronPredict(w,data):
    """Return ``np.sign(w . row)`` for every row of ``data`` as a list.

    A prediction is 0 when the activation is exactly zero.
    """
    return [np.sign(np.dot(w, sample)) for sample in data]

def avgPerceptronTrain(epochs, data, target):
    """Train an averaged perceptron sized from the module-level ``featureMap``.

    Keeps the current weights, a step-weighted sum of the updates and a step
    counter that advances once per example.

    Returns:
        ``defaultdict`` mapping the 0-based epoch index to a one-element list
        holding ``steps*weights - weightedUpdates`` at the end of that epoch.
    """
    dimension = len(featureMap)
    weights = np.zeros((dimension))
    weightedUpdates = np.zeros((dimension))
    steps = 0
    history = defaultdict(list)

    for epochIdx in range(epochs):
        for idx, sample in enumerate(data):
            activation = np.sum(np.dot(weights, sample))
            if target[idx]*activation <= 0:
                weights += target[idx]*sample
                weightedUpdates += steps*target[idx]*sample
            steps += 1
        history[epochIdx].append(steps*weights - weightedUpdates)
    return history

def err(preds, target):
    """Return the error percentage; a prediction of 0 always counts as wrong."""
    flags = []
    for idx in range(len(preds)):
        guess = preds[idx]
        flags.append(1 if guess == 0 or guess != target[idx] else 0)
    return (np.sum(flags))/len(preds)*100

def pos(preds):
    """Return the percentage of predictions that are equal to 1."""
    hits = []
    for idx in range(len(preds)):
        hits.append(1 if preds[idx]==1 else 0)
    return np.sum(hits)/len(preds)*100

def _loadReorderedRows(path):
    """Read ``path`` once and return ``(rows, labels)`` with every '>50K' row first.

    Rows keep their feature fields exactly as split on ','; labels are +1 for
    '>50K' and -1 for '<=50K'.  Rows carrying any other label are dropped.
    """
    with open(path) as handle:
        records = [s.strip().split(',') for s in handle.readlines()]
    # The label is stripped before comparing (as the main script does) so the
    # match no longer depends on a leading space after the last comma.
    positives = [fields[:-1] for fields in records if fields[-1].strip() == '>50K']
    negatives = [fields[:-1] for fields in records if fields[-1].strip() == '<=50K']
    return positives + negatives, [1]*len(positives) + [-1]*len(negatives)

def reorderedData():
    """Re-run both perceptrons on data sorted so that all positive rows come first.

    Reads 'income.train.txt.5k' and 'income.dev.txt', encodes them with
    ``remapOnFeatureMap`` and prints, for 1..9 epochs, the vanilla perceptron
    update count / dev error and the averaged perceptron dev error.
    """
    print('-------------------------------Reordering training data----------------------------------------------')
    reorderedTrain, reorderedTrainY = _loadReorderedRows('income.train.txt.5k')
    reorderedDev, reorderedDevY = _loadReorderedRows('income.dev.txt')

    reorderedBinTrain = remapOnFeatureMap(reorderedTrain)
    reorderedBinDev = remapOnFeatureMap(reorderedDev)

    # Retrained for every epoch count on purpose: only the last epoch entry
    # returned by perceptronTrain is relied upon for its weight vector.
    for epoch in range(1,10):
        epochData = perceptronTrain(epoch, reorderedBinTrain, reorderedTrainY)
        updates, wTrain = epochData[epoch-1][0], epochData[epoch-1][1]
        devPreds = perceptronPredict(wTrain,reorderedBinDev)
        print('epoch: {} updates: {} ({:.2f}%) dev_err: {:.2f}% (+: {:.2f}%)'.format(epoch, updates, (updates/len(reorderedBinTrain))*100, err(devPreds, reorderedDevY), pos(devPreds)))

    print('-----------------------------Average Perceptron--------------------------------------------------')
    # avgPerceptronTrain appends a freshly computed array per epoch, so a single
    # 9-epoch run yields the same per-epoch weights as nine separate runs.
    epochData = avgPerceptronTrain(9, reorderedBinTrain, reorderedTrainY)
    for epoch in range(1,10):
        avgWTrain = epochData[epoch-1][0]
        # Score on the reordered dev matrix: the global binDev keeps the
        # original row order and does not line up with reorderedDevY.
        avgDevPreds = perceptronPredict(avgWTrain, reorderedBinDev)
        print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, err(avgDevPreds, reorderedDevY), pos(avgDevPreds)))

def featureEngineering1():
    """Build a feature map over the module-level ``train`` rows, skipping columns 0 and 7.

    Each new ``(column, value)`` pair gets the next free index; a final key
    ``(9, 1)`` is appended with the index equal to the map size at that point.

    Returns:
        ``defaultdict(int)`` from ``(column, value)`` to column index.
    """
    skippedColumns = (0, 7)
    mapping = defaultdict(int)
    for record in train:
        for col, value in enumerate(record):
            if col in skippedColumns:
                continue
            key = (col, value)
            if key not in mapping:
                mapping[key] = len(mapping)
    mapping[(9,1)] = len(mapping)

    return mapping
    
def remapOnFeatureMap2(data, featureMap2, zeroMean, unitVariance):
    """Encode rows with numeric columns 0 and 7 kept as scaled real values.

    Categorical columns (every column except 0 and 7) are one-hot encoded
    through ``featureMap2``; pairs missing from the map are ignored.  The
    numeric value of source column 0 / 7 is written to output column 0 / 7:

    * ``zeroMean=False, unitVariance=False``: ``value / 50``
    * ``zeroMean=True,  unitVariance=False``: ``value - mean``
    * ``zeroMean=True,  unitVariance=True``:  ``(value - mean) / std``
    * ``zeroMean=False, unitVariance=True``:  ``value / std`` (this mode
      previously wrote nothing for the numeric columns and corrupted the row)

    Mean and standard deviation are computed from ``data`` itself.
    NOTE(review): dev/test rows are therefore scaled with their own statistics,
    not the training ones -- confirm this is intended.

    Args:
        data: sequence of rows with at least 8 string fields each.
        featureMap2: mapping from ``(column, value)`` to output column index.
        zeroMean: subtract the column mean from the numeric columns.
        unitVariance: divide the numeric columns by their standard deviation.

    Returns:
        ``np.ndarray`` of shape ``(len(data), len(featureMap2))`` whose last
        column is 1 (bias term).
    """
    numericColumns = (0, 7)
    binRemapped2 = np.zeros((len(data), len(featureMap2)), dtype=float)
    if len(data) == 0:
        # Nothing to encode; also avoids mean/std warnings on empty input.
        return binRemapped2

    shift, scale = {}, {}
    for col in numericColumns:
        values = [float(row[col]) for row in data]
        shift[col] = np.mean(values) if zeroMean else 0.0
        if unitVariance:
            spread = np.std(values)
            # A constant column has std 0; leave it unscaled instead of
            # producing inf/nan.
            scale[col] = spread if spread > 0 else 1.0
        elif zeroMean:
            scale[col] = 1.0
        else:
            scale[col] = 50.0

    for i, row in enumerate(data):
        for j, d in enumerate(row):
            if j in numericColumns:
                # Keyed on the source column: the previous two-pass version
                # used the position inside a filtered list, which shifted as
                # soon as one unseen categorical value was skipped.
                binRemapped2[i][j] = (float(d) - shift[j]) / scale[j]
            elif (j, d) in featureMap2:
                binRemapped2[i][featureMap2[(j, d)]] = 1
        binRemapped2[i][-1] = 1
    return binRemapped2
    
def perceptronTrainWithFeatureMap(epochs, data, target, fm):
    """Train a vanilla perceptron whose weight vector has ``len(fm)`` entries.

    Args:
        epochs: number of passes over ``data``.
        data: iterable of feature rows of length ``len(fm)``.
        target: labels indexed like ``data``; an update ``w += target[i]*row``
            is applied when ``target[i]*(w.row) <= 0``.
        fm: feature map; only its length is used.

    Returns:
        ``defaultdict`` mapping the 0-based epoch index to
        ``[number of updates in that epoch, weights at the end of that epoch]``.
    """
    epochData = defaultdict(list)
    w = np.zeros((len(fm)))

    for iteration in range(epochs):
        u = 0
        for i, row in enumerate(data):
            row = np.asarray(row)
            a = np.dot(w, row)
            if target[i]*a <= 0:
                u += 1
                w += target[i]*row
        epochData[iteration].append(u)
        # Store a snapshot: ``w`` is updated in place, so appending the array
        # itself made every epoch entry alias the final weights.
        epochData[iteration].append(w.copy())

    return epochData

def avgPerceptronTrainWithFeatureMap(epochs, data, target, fm):
    """Train an averaged perceptron whose vectors have ``len(fm)`` entries.

    Keeps the current weights, a step-weighted sum of the updates and a step
    counter that advances once per example.

    Returns:
        ``defaultdict`` mapping the 0-based epoch index to a one-element list
        holding ``steps*weights - weightedUpdates`` at the end of that epoch.
    """
    dimension = len(fm)
    weights = np.zeros((dimension))
    weightedUpdates = np.zeros((dimension))
    steps = 0
    history = defaultdict(list)

    for epochIdx in range(epochs):
        for idx, sample in enumerate(data):
            activation = np.sum(np.dot(weights, sample))
            if target[idx]*activation <= 0:
                weights += target[idx]*sample
                weightedUpdates += steps*target[idx]*sample
            steps += 1
        history[epochIdx].append(steps*weights - weightedUpdates)
    return history
    
if __name__ == '__main__':
    sns.set()

    def _readRows(path):
        """Return every line of ``path`` stripped and split on ','; the handle is closed on exit."""
        with open(path) as handle:
            return [s.strip().split(',') for s in handle.readlines()]

    # Each file is read once; features and labels are sliced from the same rows.
    trainRows = _readRows('income.train.txt.5k')
    # NOTE(review): the "dev" split is read from 'income.test.predicted', so the
    # dev_err printed further down is measured against the labels stored in that
    # file, not against 'income.dev.txt' (left commented out) -- confirm intent.
    devRows = _readRows('income.test.predicted')

    train = [fields[:-1] for fields in trainRows]
#     dev = [s.strip().split(',')[:-1] for s in open('income.dev.txt').readlines()]
    dev = [fields[:-1] for fields in devRows]
    test = _readRows('income.test.blind')

    trainY = [1 if fields[-1].strip()=='>50K' else -1 for fields in trainRows]
#     devY = [1 if s.strip().split(',')[-1].strip()=='>50K' else -1 for s in open('income.dev.txt').readlines()]
    devY = [1 if fields[-1].strip()=='>50K' else -1 for fields in devRows]

    # featureMap: (column, value) -> output column; featureRemap is its inverse.
    featureMap = defaultdict(int)
    featureRemap = {}
    for val in train:
        for j, d in enumerate(val):
            feature = (j,d)
            if feature not in featureMap :
                featureMap[feature] = len(featureMap)
                featureRemap[len(featureMap)-1] = feature
    #Adding last dimension for bias (value = 1) to the feature map
    featureMap[(9,0)] = len(featureMap)
    featureRemap[len(featureMap)-1] = (9,0)

#     print('number of dimensions: {}'.format(len(featureMap)))
    binTrain = remapOnFeatureMap(train)
    binDev = remapOnFeatureMap(dev)

#     print(featureMap)
#     print('---------------------------------')
#     print(featureRemap)

    epochs = range(1,10)
    
#     basicVanErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData = perceptronTrain(epoch, binTrain, trainY)
#         updates, wTrain = epochData[epoch-1][0], epochData[epoch-1][1]
#         devPreds = perceptronPredict(wTrain,binDev)
#         basicVanErrRates[epoch] = err(devPreds, devY)
#         print('epoch: {} updates: {} ({:.2f}%) dev_err: {:.2f}% (+: {:.2f}%)'.format(epoch, updates, (updates/len(binTrain))*100, basicVanErrRates[epoch], pos(devPreds)))
    
#     plt.plot(epochs, basicVanErrRates.values())
        
#     print('-----------------------------Average Perceptron--------------------------------------------------')
#     basicAvgErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData =  avgPerceptronTrain(epoch, binTrain, trainY)
#         avgWTrain = epochData[epoch-1][0]
#         if epoch == 4:
#             w = epochData[epoch-1][0]
#         avgDevPreds = perceptronPredict(avgWTrain, binDev)
#         basicAvgErrRates[epoch] = err(avgDevPreds, devY)
#         print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, basicAvgErrRates[epoch], pos(avgDevPreds)))
    
#     plt.plot(epochs, basicAvgErrRates.values())
#     plt.legend(['Vanilla Perceptron', 'Average Perceptron'])
#     plt.show()
        
    
#     print('************************ Experimentations ************************')
#     reorderedData()
    
#     print('************************Using original numerical features************************')
#     fm2 = featureEngineering1()
#     binNumFeaturesTrainRemapped = remapOnFeatureMap2(train, fm2, False, False)
#     binNumFeaturesDevRemapped = remapOnFeatureMap2(dev, fm2, False, False)

#     numFeaturesVanErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData = perceptronTrainWithFeatureMap(epoch, binNumFeaturesTrainRemapped, trainY, fm2)
#         updates, wTrain = epochData[epoch-1][0], epochData[epoch-1][1]
#         devPreds = perceptronPredict(wTrain,binNumFeaturesDevRemapped)
#         numFeaturesVanErrRates[epoch] = err(devPreds, devY)
#         print('epoch: {} updates: {} ({:.2f}%) dev_err: {:.2f}% (+: {:.2f}%)'.format(epoch, updates, (updates/len(binNumFeaturesTrainRemapped))*100, numFeaturesVanErrRates[epoch], pos(devPreds)))
#     plt.plot(epochs, numFeaturesVanErrRates.values())
        
#     print('-----------------------------Average Perceptron--------------------------------------------------')
#     numFeaturesAvgErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData =  avgPerceptronTrainWithFeatureMap(epoch, binNumFeaturesTrainRemapped, trainY, fm2)
#         avgWTrain = epochData[epoch-1][0]
#         if epoch == 4:
#             w = epochData[epoch-1][0]
#         avgDevPreds = perceptronPredict(avgWTrain, binNumFeaturesDevRemapped)
#         numFeaturesAvgErrRates[epoch] = err(avgDevPreds, devY)
#         print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, numFeaturesAvgErrRates[epoch], pos(avgDevPreds)))
#     plt.plot(epochs, numFeaturesAvgErrRates.values())
#     plt.legend(['Vanilla Perceptron with numerical features', 'Average Perceptron with numerical features'])
#     plt.show()
    
#     print('----------------------------Most positive and negative features for average perceptron--------------------------------')
#     print('Using weight vector from epoch number 4 when dev_err is minimum')
#     print('len of w: {}'.format(len(w)))
#     sortedWeights = [(x,w[x]) for x in np.argsort(w)]
#     print('weights: {}'.format(sortedWeights))
#     print('remap: {}'.format(featureRemap))
#     print('At epoch={}, five most negative weight: {}'.format(4, [featureRemap[x] for (x,weight) in sortedWeights[:6]]))
#     print('At epoch={}, five most positive weight: {}'.format(4, [featureRemap[x] for (x,weight) in sortedWeights[-6:]]))
#     print('----------------------------Feature Weight for Bias Dimension --------------------------------')
#     print([w[x] for (x,w[x]) in sortedWeights if x==230])
    

#     print('************************Centering data to be zero mean************************')
#     print('-----------------------------Average Perceptron--------------------------------------------------')
#     zeroMeanErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData =  avgPerceptronTrainWithFeatureMap(epoch, remapOnFeatureMap2(train, featureMap, True, False), trainY, featureMap)
#         avgWTrain = epochData[epoch-1][0]
#         if epoch == 4:
#             w = epochData[epoch-1][0]
#         avgDevPreds = perceptronPredict(avgWTrain, remapOnFeatureMap2(dev, featureMap, True, False))
#         zeroMeanErrRates[epoch] = err(avgDevPreds, devY)
#         print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, zeroMeanErrRates[epoch], pos(avgDevPreds)))
#     plt.plot(epochs, zeroMeanErrRates.values())

    print('************************Centering data to be zero mean and unit variance************************')
#     print('-----------------------------Average Perceptron--------------------------------------------------')
    zeroMeanUnitVarErrRates = defaultdict(float)
    for epoch in epochs:
        epochData =  avgPerceptronTrainWithFeatureMap(epoch, remapOnFeatureMap2(train, featureMap, True, True), trainY, featureMap)
        avgWTrain = epochData[epoch-1][0]
        if epoch == 5:
            w = epochData[epoch-1][0]
        avgDevPreds = perceptronPredict(avgWTrain, remapOnFeatureMap2(dev, featureMap, True, True))
        zeroMeanUnitVarErrRates[epoch] = err(avgDevPreds, devY)
        print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, zeroMeanUnitVarErrRates[epoch], pos(avgDevPreds)))
        
#     plt.plot(epochs, zeroMeanUnitVarErrRates.values())
#     plt.legend(['Average Perceptron with zero mean', 'Average Perceptron with zero mean and unit variance'])
#     plt.show()

#     print('-----------------------------Ground truth positive percentage--------------------------------------------------')
#     print('Ground truth (+ {:.2f}%)'.format(pos(devY)))

    print('-----------------------------Test Predictions--------------------------------------------------')
    avgTestPreds = perceptronPredict(w, remapOnFeatureMap2(test, featureMap, True, True))
    print(avgTestPreds)
    print('Predicted positive % on test is {:.2f}%'.format(pos(avgTestPreds)))
    
# #     print('-----------------------------Writing Test Predictions to \'income.test.predicted\' file--------------------------------------------------')
#     final = [i + ['>50K'] if j==1.0 else i+['<=50K'] for i,j in zip(test,avgTestPreds) ]
#     l = list(map(lambda d : ','.join(d),final))
#     with open('income.test.predicted', 'w') as output:
#         for x in l:
#             output.write(str(x))
#             output.write('\n')

************************Centering data to be zero mean and unit variance************************
epoch: 1 dev_err: 2.50% (+ 19.50%)
epoch: 2 dev_err: 1.40% (+ 19.40%)
epoch: 3 dev_err: 0.50% (+ 19.70%)
epoch: 4 dev_err: 0.20% (+ 19.40%)
epoch: 5 dev_err: 0.00% (+ 19.40%)
epoch: 6 dev_err: 0.20% (+ 19.40%)
epoch: 7 dev_err: 0.20% (+ 19.40%)
epoch: 8 dev_err: 0.40% (+ 19.60%)
epoch: 9 dev_err: 0.40% (+ 19.60%)
-----------------------------Test Predictions--------------------------------------------------
[1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1

In [18]:
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Perceptron

def remapOnFeatureMap(data):
    """One-hot encode rows with the module-level ``featureMap``.

    For every ``(column, value)`` pair of a row that is a key of ``featureMap``
    the mapped output column is set to 1; pairs that are not in the map are
    ignored.  The last output column is always set to 1 (bias term).

    Returns:
        ``np.ndarray`` of shape ``(len(data), len(featureMap))``.
    """
    encoded = np.zeros((len(data), len(featureMap)))
    for rowIdx, record in enumerate(data):
        for colIdx, value in enumerate(record):
            key = (colIdx, value)
            if key in featureMap:
                encoded[rowIdx][featureMap[key]] = 1
        # Constant input for the bias weight stored in the last dimension.
        encoded[rowIdx][-1] = 1

    return encoded

def perceptronTrain(epochs, data, target):
    """Train a vanilla perceptron and record per-epoch statistics.

    Args:
        epochs: number of passes over ``data``.
        data: iterable of feature rows whose length is ``len(featureMap)``
            (the weight vector is sized from the module-level ``featureMap``).
        target: labels indexed like ``data``; the update rule
            ``w += target[i]*row`` is applied when ``target[i]*(w.row) <= 0``.

    Returns:
        ``defaultdict`` mapping the 0-based epoch index to
        ``[number of updates in that epoch, weights at the end of that epoch]``.
    """
    epochData = defaultdict(list)
    w = np.zeros((len(featureMap)))

    for iteration in range(epochs):
        u = 0
        for i, row in enumerate(data):
            # Same conversion as perceptronTrainWithFeatureMap, so plain lists
            # are not multiplied as Python sequences.
            row = np.asarray(row)
            a = np.dot(w, row)
            if target[i]*a <= 0:
                u += 1
                w += target[i]*row
        epochData[iteration].append(u)
        # Store a snapshot: ``w`` is updated in place, so appending the array
        # itself made every epoch entry alias the final weights.
        epochData[iteration].append(w.copy())
    return epochData

def perceptronPredict(w,data):
    """Return ``np.sign(w . row)`` for every row of ``data`` as a list.

    A prediction is 0 when the activation is exactly zero.
    """
    return [np.sign(np.dot(w, sample)) for sample in data]

def avgPerceptronTrain(epochs, data, target):
    """Train an averaged perceptron sized from the module-level ``featureMap``.

    Keeps the current weights, a step-weighted sum of the updates and a step
    counter that advances once per example.

    Returns:
        ``defaultdict`` mapping the 0-based epoch index to a one-element list
        holding ``steps*weights - weightedUpdates`` at the end of that epoch.
    """
    dimension = len(featureMap)
    weights = np.zeros((dimension))
    weightedUpdates = np.zeros((dimension))
    steps = 0
    history = defaultdict(list)

    for epochIdx in range(epochs):
        for idx, sample in enumerate(data):
            activation = np.sum(np.dot(weights, sample))
            if target[idx]*activation <= 0:
                weights += target[idx]*sample
                weightedUpdates += steps*target[idx]*sample
            steps += 1
        history[epochIdx].append(steps*weights - weightedUpdates)
    return history

def err(preds, target):
    """Return the error percentage; a prediction of 0 always counts as wrong."""
    flags = []
    for idx in range(len(preds)):
        guess = preds[idx]
        flags.append(1 if guess == 0 or guess != target[idx] else 0)
    return (np.sum(flags))/len(preds)*100

def pos(preds):
    """Return the percentage of predictions that are equal to 1."""
    hits = []
    for idx in range(len(preds)):
        hits.append(1 if preds[idx]==1 else 0)
    return np.sum(hits)/len(preds)*100

def _loadReorderedRows(path):
    """Read ``path`` once and return ``(rows, labels)`` with every '>50K' row first.

    Rows keep their feature fields exactly as split on ','; labels are +1 for
    '>50K' and -1 for '<=50K'.  Rows carrying any other label are dropped.
    """
    with open(path) as handle:
        records = [s.strip().split(',') for s in handle.readlines()]
    # The label is stripped before comparing (as the main script does) so the
    # match no longer depends on a leading space after the last comma.
    positives = [fields[:-1] for fields in records if fields[-1].strip() == '>50K']
    negatives = [fields[:-1] for fields in records if fields[-1].strip() == '<=50K']
    return positives + negatives, [1]*len(positives) + [-1]*len(negatives)

def reorderedData():
    """Re-run both perceptrons on data sorted so that all positive rows come first.

    Reads 'income.train.txt.5k' and 'income.dev.txt', encodes them with
    ``remapOnFeatureMap`` and prints, for 1..9 epochs, the vanilla perceptron
    update count / dev error and the averaged perceptron dev error.
    """
    print('-------------------------------Reordering training data----------------------------------------------')
    reorderedTrain, reorderedTrainY = _loadReorderedRows('income.train.txt.5k')
    reorderedDev, reorderedDevY = _loadReorderedRows('income.dev.txt')

    reorderedBinTrain = remapOnFeatureMap(reorderedTrain)
    reorderedBinDev = remapOnFeatureMap(reorderedDev)

    # Retrained for every epoch count on purpose: only the last epoch entry
    # returned by perceptronTrain is relied upon for its weight vector.
    for epoch in range(1,10):
        epochData = perceptronTrain(epoch, reorderedBinTrain, reorderedTrainY)
        updates, wTrain = epochData[epoch-1][0], epochData[epoch-1][1]
        devPreds = perceptronPredict(wTrain,reorderedBinDev)
        print('epoch: {} updates: {} ({:.2f}%) dev_err: {:.2f}% (+: {:.2f}%)'.format(epoch, updates, (updates/len(reorderedBinTrain))*100, err(devPreds, reorderedDevY), pos(devPreds)))

    print('-----------------------------Average Perceptron--------------------------------------------------')
    # avgPerceptronTrain appends a freshly computed array per epoch, so a single
    # 9-epoch run yields the same per-epoch weights as nine separate runs.
    epochData = avgPerceptronTrain(9, reorderedBinTrain, reorderedTrainY)
    for epoch in range(1,10):
        avgWTrain = epochData[epoch-1][0]
        # Score on the reordered dev matrix: the global binDev keeps the
        # original row order and does not line up with reorderedDevY.
        avgDevPreds = perceptronPredict(avgWTrain, reorderedBinDev)
        print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, err(avgDevPreds, reorderedDevY), pos(avgDevPreds)))

def featureEngineering1():
    """Build a feature map over the module-level ``train`` rows, skipping columns 0 and 7.

    Each new ``(column, value)`` pair gets the next free index; a final key
    ``(9, 1)`` is appended with the index equal to the map size at that point.

    Returns:
        ``defaultdict(int)`` from ``(column, value)`` to column index.
    """
    skippedColumns = (0, 7)
    mapping = defaultdict(int)
    for record in train:
        for col, value in enumerate(record):
            if col in skippedColumns:
                continue
            key = (col, value)
            if key not in mapping:
                mapping[key] = len(mapping)
    mapping[(9,1)] = len(mapping)

    return mapping
    
def remapOnFeatureMap2(data, featureMap2, zeroMean, unitVariance):
    """Encode rows with numeric columns 0 and 7 kept as scaled real values.

    Categorical columns (every column except 0 and 7) are one-hot encoded
    through ``featureMap2``; pairs missing from the map are ignored.  The
    numeric value of source column 0 / 7 is written to output column 0 / 7:

    * ``zeroMean=False, unitVariance=False``: ``value / 50``
    * ``zeroMean=True,  unitVariance=False``: ``value - mean``
    * ``zeroMean=True,  unitVariance=True``:  ``(value - mean) / std``
    * ``zeroMean=False, unitVariance=True``:  ``value / std`` (this mode
      previously wrote nothing for the numeric columns and corrupted the row)

    Mean and standard deviation are computed from ``data`` itself.
    NOTE(review): dev/test rows are therefore scaled with their own statistics,
    not the training ones -- confirm this is intended.

    Args:
        data: sequence of rows with at least 8 string fields each.
        featureMap2: mapping from ``(column, value)`` to output column index.
        zeroMean: subtract the column mean from the numeric columns.
        unitVariance: divide the numeric columns by their standard deviation.

    Returns:
        ``np.ndarray`` of shape ``(len(data), len(featureMap2))`` whose last
        column is 1 (bias term).
    """
    numericColumns = (0, 7)
    binRemapped2 = np.zeros((len(data), len(featureMap2)), dtype=float)
    if len(data) == 0:
        # Nothing to encode; also avoids mean/std warnings on empty input.
        return binRemapped2

    shift, scale = {}, {}
    for col in numericColumns:
        values = [float(row[col]) for row in data]
        shift[col] = np.mean(values) if zeroMean else 0.0
        if unitVariance:
            spread = np.std(values)
            # A constant column has std 0; leave it unscaled instead of
            # producing inf/nan.
            scale[col] = spread if spread > 0 else 1.0
        elif zeroMean:
            scale[col] = 1.0
        else:
            scale[col] = 50.0

    for i, row in enumerate(data):
        for j, d in enumerate(row):
            if j in numericColumns:
                # Keyed on the source column: the previous two-pass version
                # used the position inside a filtered list, which shifted as
                # soon as one unseen categorical value was skipped.
                binRemapped2[i][j] = (float(d) - shift[j]) / scale[j]
            elif (j, d) in featureMap2:
                binRemapped2[i][featureMap2[(j, d)]] = 1
        binRemapped2[i][-1] = 1
    return binRemapped2
    
def perceptronTrainWithFeatureMap(epochs, data, target, fm):
    """Train a vanilla perceptron whose weight vector has ``len(fm)`` entries.

    Args:
        epochs: number of passes over ``data``.
        data: iterable of feature rows of length ``len(fm)``.
        target: labels indexed like ``data``; an update ``w += target[i]*row``
            is applied when ``target[i]*(w.row) <= 0``.
        fm: feature map; only its length is used.

    Returns:
        ``defaultdict`` mapping the 0-based epoch index to
        ``[number of updates in that epoch, weights at the end of that epoch]``.
    """
    epochData = defaultdict(list)
    w = np.zeros((len(fm)))

    for iteration in range(epochs):
        u = 0
        for i, row in enumerate(data):
            row = np.asarray(row)
            a = np.dot(w, row)
            if target[i]*a <= 0:
                u += 1
                w += target[i]*row
        epochData[iteration].append(u)
        # Store a snapshot: ``w`` is updated in place, so appending the array
        # itself made every epoch entry alias the final weights.
        epochData[iteration].append(w.copy())

    return epochData

def avgPerceptronTrainWithFeatureMap(epochs, data, target, fm):
    """Train an averaged perceptron whose vectors have ``len(fm)`` entries.

    Keeps the current weights, a step-weighted sum of the updates and a step
    counter that advances once per example.

    Returns:
        ``defaultdict`` mapping the 0-based epoch index to a one-element list
        holding ``steps*weights - weightedUpdates`` at the end of that epoch.
    """
    dimension = len(fm)
    weights = np.zeros((dimension))
    weightedUpdates = np.zeros((dimension))
    steps = 0
    history = defaultdict(list)

    for epochIdx in range(epochs):
        for idx, sample in enumerate(data):
            activation = np.sum(np.dot(weights, sample))
            if target[idx]*activation <= 0:
                weights += target[idx]*sample
                weightedUpdates += steps*target[idx]*sample
            steps += 1
        history[epochIdx].append(steps*weights - weightedUpdates)
    return history
    
if __name__ == '__main__':
    sns.set()

    def _readRows(path):
        """Return every line of ``path`` stripped and split on ','; the handle is closed on exit."""
        with open(path) as handle:
            return [s.strip().split(',') for s in handle.readlines()]

    # Each file is read once; features and labels are sliced from the same rows.
    trainRows = _readRows('income.train.txt.5k')
    # ``dev`` was not defined anywhere in this cell (both loaders were commented
    # out) although ``remapOnFeatureMap(dev)`` below needs it, so the cell only
    # ran with a value left over from an earlier cell.  It is loaded from the
    # same file as ``devY`` so features and labels stay aligned.
    # NOTE(review): that file is 'income.test.predicted', so dev_err is measured
    # against the labels stored there rather than 'income.dev.txt' -- confirm.
    devRows = _readRows('income.test.predicted')

    train = [fields[:-1] for fields in trainRows]
#     dev = [s.strip().split(',')[:-1] for s in open('income.dev.txt').readlines()]
    dev = [fields[:-1] for fields in devRows]
    test = _readRows('income.test.blind')

    trainY = [1 if fields[-1].strip()=='>50K' else -1 for fields in trainRows]
#     devY = [1 if s.strip().split(',')[-1].strip()=='>50K' else -1 for s in open('income.dev.txt').readlines()]
    devY = [1 if fields[-1].strip()=='>50K' else -1 for fields in devRows]

    # featureMap: (column, value) -> output column; featureRemap is its inverse.
    featureMap = defaultdict(int)
    featureRemap = {}
    for val in train:
        for j, d in enumerate(val):
            feature = (j,d)
            if feature not in featureMap :
                featureMap[feature] = len(featureMap)
                featureRemap[len(featureMap)-1] = feature
    #Adding last dimension for bias (value = 1) to the feature map
    featureMap[(9,0)] = len(featureMap)
    featureRemap[len(featureMap)-1] = (9,0)

#     print('number of dimensions: {}'.format(len(featureMap)))
    binTrain = remapOnFeatureMap(train)
    binDev = remapOnFeatureMap(dev)
    binTest = remapOnFeatureMap(test)

#     print(featureMap)
#     print('---------------------------------')
#     print(featureRemap)

    epochs = range(1,10)
    
#     basicVanErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData = perceptronTrain(epoch, binTrain, trainY)
#         updates, wTrain = epochData[epoch-1][0], epochData[epoch-1][1]
#         devPreds = perceptronPredict(wTrain,binDev)
#         basicVanErrRates[epoch] = err(devPreds, devY)
#         print('epoch: {} updates: {} ({:.2f}%) dev_err: {:.2f}% (+: {:.2f}%)'.format(epoch, updates, (updates/len(binTrain))*100, basicVanErrRates[epoch], pos(devPreds)))
    
#     plt.plot(epochs, basicVanErrRates.values())
        
    print('-----------------------------Average Perceptron--------------------------------------------------')
    basicAvgErrRates = defaultdict(float)
    for epoch in epochs:
        epochData =  avgPerceptronTrain(epoch, binTrain, trainY)
        avgWTrain = epochData[epoch-1][0]
        if epoch == 4:
            w = epochData[epoch-1][0]
        avgDevPreds = perceptronPredict(avgWTrain, binDev)
        basicAvgErrRates[epoch] = err(avgDevPreds, devY)
        print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, basicAvgErrRates[epoch], pos(avgDevPreds)))
    
#     plt.plot(epochs, basicAvgErrRates.values())
#     plt.legend(['Vanilla Perceptron', 'Average Perceptron'])
#     plt.show()
        
    clf = Perceptron(tol=1e-3, random_state=0)
    clf.fit(binTrain, trainY)
    a = clf.predict(binTest)
    print([pos(a)])
    print(a)
    
#     print('************************ Experimentations ************************')
#     reorderedData()
    
#     print('************************Using original numerical features************************')
#     fm2 = featureEngineering1()
#     binNumFeaturesTrainRemapped = remapOnFeatureMap2(train, fm2, False, False)
#     binNumFeaturesDevRemapped = remapOnFeatureMap2(dev, fm2, False, False)

#     numFeaturesVanErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData = perceptronTrainWithFeatureMap(epoch, binNumFeaturesTrainRemapped, trainY, fm2)
#         updates, wTrain = epochData[epoch-1][0], epochData[epoch-1][1]
#         devPreds = perceptronPredict(wTrain,binNumFeaturesDevRemapped)
#         numFeaturesVanErrRates[epoch] = err(devPreds, devY)
#         print('epoch: {} updates: {} ({:.2f}%) dev_err: {:.2f}% (+: {:.2f}%)'.format(epoch, updates, (updates/len(binNumFeaturesTrainRemapped))*100, numFeaturesVanErrRates[epoch], pos(devPreds)))
#     plt.plot(epochs, numFeaturesVanErrRates.values())
        
#     print('-----------------------------Average Perceptron--------------------------------------------------')
#     numFeaturesAvgErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData =  avgPerceptronTrainWithFeatureMap(epoch, binNumFeaturesTrainRemapped, trainY, fm2)
#         avgWTrain = epochData[epoch-1][0]
#         if epoch == 4:
#             w = epochData[epoch-1][0]
#         avgDevPreds = perceptronPredict(avgWTrain, binNumFeaturesDevRemapped)
#         numFeaturesAvgErrRates[epoch] = err(avgDevPreds, devY)
#         print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, numFeaturesAvgErrRates[epoch], pos(avgDevPreds)))
#     plt.plot(epochs, numFeaturesAvgErrRates.values())
#     plt.legend(['Vanilla Perceptron with numerical features', 'Average Perceptron with numerical features'])
#     plt.show()
    
#     print('----------------------------Most positive and negative features for average perceptron--------------------------------')
#     print('Using weight vector from epoch number 4 when dev_err is minimum')
#     print('len of w: {}'.format(len(w)))
#     sortedWeights = [(x,w[x]) for x in np.argsort(w)]
#     print('weights: {}'.format(sortedWeights))
#     print('remap: {}'.format(featureRemap))
#     print('At epoch={}, five most negative weight: {}'.format(4, [featureRemap[x] for (x,weight) in sortedWeights[:6]]))
#     print('At epoch={}, five most positive weight: {}'.format(4, [featureRemap[x] for (x,weight) in sortedWeights[-6:]]))
#     print('----------------------------Feature Weight for Bias Dimension --------------------------------')
#     print([w[x] for (x,w[x]) in sortedWeights if x==230])
    

#     print('************************Centering data to be zero mean************************')
#     print('-----------------------------Average Perceptron--------------------------------------------------')
#     zeroMeanErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData =  avgPerceptronTrainWithFeatureMap(epoch, remapOnFeatureMap2(train, featureMap, True, False), trainY, featureMap)
#         avgWTrain = epochData[epoch-1][0]
#         if epoch == 4:
#             w = epochData[epoch-1][0]
#         avgDevPreds = perceptronPredict(avgWTrain, remapOnFeatureMap2(dev, featureMap, True, False))
#         zeroMeanErrRates[epoch] = err(avgDevPreds, devY)
#         print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, zeroMeanErrRates[epoch], pos(avgDevPreds)))
#     plt.plot(epochs, zeroMeanErrRates.values())

#     print('************************Centering data to be zero mean and unit variance************************')
# #     print('-----------------------------Average Perceptron--------------------------------------------------')
#     zeroMeanUnitVarErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData =  avgPerceptronTrainWithFeatureMap(epoch, remapOnFeatureMap2(train, featureMap, True, True), trainY, featureMap)
#         avgWTrain = epochData[epoch-1][0]
#         if epoch == 5:
#             w = epochData[epoch-1][0]
#         avgDevPreds = perceptronPredict(avgWTrain, remapOnFeatureMap2(dev, featureMap, True, True))
#         zeroMeanUnitVarErrRates[epoch] = err(avgDevPreds, devY)
#         print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, zeroMeanUnitVarErrRates[epoch], pos(avgDevPreds)))
        
#     plt.plot(epochs, zeroMeanUnitVarErrRates.values())
#     plt.legend(['Average Perceptron with zero mean', 'Average Perceptron with zero mean and unit variance'])
#     plt.show()

#     print('-----------------------------Ground truth positive percentage--------------------------------------------------')
#     print('Ground truth (+ {:.2f}%)'.format(pos(devY)))

#     print('-----------------------------Test Predictions--------------------------------------------------')
#     avgTestPreds = perceptronPredict(w, remapOnFeatureMap2(test, featureMap, True, True))
#     print('Predicted positive % on test is {:.2f}%'.format(pos(avgTestPreds)))
    
# #     print('-----------------------------Writing Test Predictions to \'income.test.predicted\' file--------------------------------------------------')
#     final = [i + ['>50K'] if j==1.0 else i+['<=50K'] for i,j in zip(test,avgTestPreds) ]
#     l = list(map(lambda d : ','.join(d),final))
#     with open('income.test.predicted', 'w') as output:
#         for x in l:
#             output.write(str(x))
#             output.write('\n')

-----------------------------Average Perceptron--------------------------------------------------
epoch: 1 dev_err: 1.00% (+ 19.10%)
epoch: 2 dev_err: 1.00% (+ 20.10%)
epoch: 3 dev_err: 1.00% (+ 20.10%)
epoch: 4 dev_err: 1.00% (+ 20.00%)
epoch: 5 dev_err: 1.00% (+ 20.40%)
epoch: 6 dev_err: 1.00% (+ 20.60%)
epoch: 7 dev_err: 1.00% (+ 20.50%)
epoch: 8 dev_err: 1.00% (+ 20.90%)
epoch: 9 dev_err: 1.00% (+ 21.00%)
[4.3]
[ 1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 

### TA's Solution

In [20]:
#!/usr/bin/env python

from __future__ import division
import sys
import numpy as np
import time
from collections import defaultdict
from itertools import combinations

def normalization(data_X, mean = False, variance = False, norm_para=None):
    """Optionally center and/or scale a feature matrix column by column.

    Parameters
    ----------
    data_X : array-like, 2-D
        One row per example, one column per feature. It is copied into a new
        numpy array, so the caller's object is not modified.
    mean : bool
        When true, subtract the per-column mean.
    variance : bool
        When true, divide by the per-column standard deviation.
    norm_para : tuple or None
        ``(feat_mean, feat_std)`` to reuse (e.g. statistics obtained from the
        training data). When None the statistics are computed from ``data_X``.

    Returns
    -------
    tuple
        ``(normalized_matrix, feat_mean, feat_std)``.
    """
    matrix = np.array(data_X)

    if norm_para is not None:
        # Reuse statistics supplied by the caller.
        feat_mean, feat_std = norm_para
    else:
        # Column-wise statistics (axis 0 runs down the rows).
        feat_mean = matrix.mean(axis=0)
        feat_std = matrix.std(axis=0)
        # Constant columns have ~0 std; use 1.0 so the division is a no-op.
        feat_std[feat_std < 1e-10] = 1.0

    if mean:
        matrix = matrix - feat_mean
    if variance:
        matrix = matrix / feat_std

    # Column 0 is the bias feature: force it back to 1 after any shifting/scaling.
    matrix[:, 0] = 1.0

    return matrix, feat_mean, feat_std

def map_data(filename, feature2index, mean = False, variance = False, norm_para = None):
    """Read a ', '-separated data file and turn each line into a feature vector.

    Reads the module-level names ``comb`` (column indices whose pairwise
    combinations become extra features) and ``numerical`` (when non-empty,
    columns 0 and 7 are appended as raw floats).

    Parameters
    ----------
    filename : str
        Path of the data file. If the name contains ``'blind'`` every field is
        treated as a feature and no labels are produced; otherwise the last
        field is the target.
    feature2index : dict
        Maps ``(column, value)`` and ``(col_i, value_i, col_j, value_j)`` keys
        to vector positions. Keys not present in the mapping are ignored.
    mean, variance : bool
        Forwarded to ``normalization``.
    norm_para : tuple or None
        Forwarded to ``normalization``.

    Returns
    -------
    tuple
        ``(data_X, data_Y, (feat_mean, feat_std))``. ``data_Y`` is a list of
        1 (field equals ``">50K"``) / -1 labels, or None for blind files.
    """
    data_X = []
    # `last` is the slice end for the feature columns: -1 drops the target
    # column, None keeps every column (blind files carry no target).
    data_Y, last = (None, None) if 'blind' in filename else ([], -1)
    dimension = len(feature2index)

    # Context manager so the file handle is released even if a line fails to parse.
    with open(filename) as data_file:
        for line in data_file:
            features = line.strip().split(", ")
            feat_vec = np.zeros(dimension)

            # One-hot encode every observed (column, value) pair.
            for i, fv in enumerate(features[:last]):
                if (i, fv) in feature2index:            # ignore unobserved features
                    feat_vec[feature2index[i, fv]] = 1.0

            # Feature engineering: pairwise combinations of the columns in `comb`.
            for i, j in combinations(comb, 2):
                pair_key = (i, features[i], j, features[j])
                if pair_key in feature2index:           # ignore unobserved combined features
                    feat_vec[feature2index[pair_key]] = 1.0

            # Feature engineering: append columns 0 and 7 as raw numbers.
            if numerical:
                feat_vec = np.append(feat_vec, [float(features[0]), float(features[7])])

            data_X.append(feat_vec)
            if last is not None:
                data_Y.append(1 if features[-1] == ">50K" else -1)

    # Normalization (also resets the bias column to 1).
    data_X, feat_mean, feat_std = normalization(data_X, mean = mean, variance = variance, norm_para=norm_para)
    return data_X, data_Y, (feat_mean, feat_std)
    

def train(train_data, dev_data, it = 5, check_freq = 5000, smart_avg = False):
    """Train a perceptron and return the best averaged weight vector.

    Parameters
    ----------
    train_data : list of (vector, label)
        Training examples; labels are multiplied with the score, so they are
        expected to be +1 / -1.
    dev_data : list of (vector, label)
        Held-out examples, evaluated with ``test_dev`` at every checkpoint.
    it : int
        Number of passes over ``train_data``.
    check_freq : int
        A dev checkpoint is run whenever the running example count is a
        multiple of this value.
    smart_avg : bool
        When true the average is maintained through a weighted sum of the
        updates instead of summing the full model after every example.

    Returns
    -------
    numpy.ndarray
        The averaged model with the lowest dev error seen at a checkpoint,
        divided by ``it * len(train_data)``. If no checkpoint ever recorded an
        improvement (none fired, or dev error never dropped below 1), the
        averaged model at the end of training is used instead of raising
        ``UnboundLocalError``.
    """
    train_size = len(train_data)
    dimension = len(train_data[0][0])
    model = np.zeros(dimension)
    totmodel = np.zeros(dimension)          # running sum of the model after every example
    c, smart_tot = 0, np.zeros(dimension)   # example counter, weighted sum of updates
    best_err_rate_avg = 1
    best_avg_model = None                   # stays None until a checkpoint improves on 1
    t = time.time()
    for i in range(it):
        updates = 0
        for j, (vecx, y) in enumerate(train_data, start = 1):
            c += 1
            if model.dot(vecx) * y <= 0:    # misclassified (or on the boundary)
                updates += 1
                model += y * vecx
                if smart_avg:
                    smart_tot += c * y * vecx
            if not smart_avg:
                totmodel += model
            if (j+i*train_size) % check_freq == 0:
                avg_model = model - smart_tot/c if smart_avg else totmodel
                dev_err_rate, positive = test_dev(dev_data, model)
                dev_err_rate_avg, positive_avg = test_dev(dev_data, avg_model)

                if dev_err_rate_avg < best_err_rate_avg:
                    best_err_rate_avg = dev_err_rate_avg
                    # copy() matters: totmodel keeps being updated in place.
                    best_avg_model = avg_model.copy()
                print("unavg, epoch {} updates {} ({:.1%}) dev_err {:.1%} (+:{:.1%});   ".format(i+1,
                                                            updates,
                                                            updates/train_size,
                                                            dev_err_rate,
                                                            positive), \
                "avg, epoch {} dev_err {:.1%} (+:{:.1%})".format(i+1,
                                                            dev_err_rate_avg,
                                                            positive_avg))
    print("training time {:.5f} s".format(time.time()-t))

    if best_avg_model is None:
        if c == 0:
            # No example was processed (it == 0): nothing to average.
            return model
        # No checkpoint recorded an improvement: fall back to the final average.
        best_avg_model = model - smart_tot/c if smart_avg else totmodel.copy()

    # The divisor is the total number of steps over all epochs regardless of
    # where the best checkpoint was; it is a positive rescaling, so the sign
    # of any score computed with the returned vector is unaffected.
    return best_avg_model/it/len(train_data)

def test_dev(data, model):
    errors = sum(model.dot(vecx) * y <= 0 for vecx, y in data)
    positives = sum(model.dot(vecx) > 0 for vecx, _ in data)
    return errors / len(data), positives / len(data)

def predict(test_data, model):
    """Label each vector ">50K" when its score under ``model`` is positive, else "<=50K"."""
    labels = []
    for vecx in test_data:
        if model.dot(vecx) > 0:
            labels.append(">50K")
        else:
            labels.append("<=50K")
    return labels

def create_feature_map(train_file):
    """Build the feature <-> index mappings from the training file.

    Reads the module-level names ``comb`` (columns whose pairwise value
    combinations become extra features) and ``numerical`` (only its length is
    used here, for the printed dimensionality).

    Parameters
    ----------
    train_file : str
        Path of a ', '-separated file whose last field is the target.

    Returns
    -------
    tuple
        ``(feature2index, index2feature)``. Index 0 is the bias feature; then
        one index per observed (column, value), then one per combined
        (col_i, value_i, col_j, value_j). Values are enumerated in sorted
        order so the assignment is the same on every run (set iteration order
        over strings is not stable across interpreter runs).
    """
    column_values = defaultdict(set)
    # Context manager so the file handle is closed once the scan is done.
    with open(train_file) as data_file:
        for line in data_file:
            features = line.strip().split(", ")[:-1] # last field is target.
            for i, fv in enumerate(features):
                # if i in numerical: continue    # uncomment to keep only the numerical form of age and hours; leave commented out to keep both binarized and numerical
                column_values[i].add(fv)

    feature2index = {(-1, 'bias'): 0} # bias
    index2feature = {0: ('col-1', 'bias')}

    for i, values in column_values.items():
        for v in sorted(values):
            feature2index[i, v] = len(feature2index)
            index2feature[len(feature2index) - 1] = ('col'+str(i),v)

    # engineering for feature combination
    for i, j in combinations(comb, 2):
        for v1 in sorted(column_values[i]):
            for v2 in sorted(column_values[j]):
                feature2index[i, v1, j, v2] = len(feature2index)
                index2feature[len(feature2index) - 1] = ('col'+str(i),v1,'col'+str(j),v2)
    dimension = len(feature2index) + len(numerical)
    print("dimensionality: ", dimension)
    return feature2index, index2feature


def experiment(train_file, dev_file, test_file = '', it = 1, check_freq = 5000, feat_detail = False, mean = False, variance = False):
    """Run one train / evaluate / (optionally) predict cycle.

    Parameters
    ----------
    train_file, dev_file : str
        Paths of the training and dev files.
    test_file : str
        Optional unlabeled file. When non-empty, predictions are written to
        ``test_file + '_pred'`` (each input line followed by ', <label>').
    it : int
        Number of training epochs, forwarded to ``train``.
    check_freq : int
        Dev checkpoint frequency, forwarded to ``train``.
    feat_detail : bool
        When true, print the five most positive and most negative weights,
        the bias weight and the weights of column 6 values 'Male' / 'Female'.
    mean, variance : bool
        Normalization switches, forwarded to ``map_data``.
    """
    feature2index, index2feature = create_feature_map(train_file)

    X1, Y1, norm = map_data(train_file, feature2index, mean, variance)
    train_data = list(zip(X1, Y1))

    # Dev data is normalized with the statistics returned for the training data.
    X2, Y2, _ = map_data(dev_file, feature2index, mean, variance, norm_para=norm)
    dev_data = list(zip(X2, Y2))

    model = train(train_data, dev_data, it, check_freq)

    print("train_err {:.2%} (+:{:.1%})".format(*test_dev(train_data, model)))

    # display feature details (top 5 pos weights, top 5 neg weights, etc)
    if feat_detail:
        def describe(i):
            # The model can have more dimensions than index2feature has
            # entries (extra columns appended after the mapped features);
            # label those instead of raising KeyError.
            return index2feature.get(i, ('numerical', int(i)))

        ranked = model.argsort()
        print([(describe(i), '{:.5}'.format(model[i])) for i in ranked[-5:][::-1]])
        print([(describe(i), '{:.5}'.format(model[i])) for i in ranked[:5]])
        print([(i, model[feature2index[-1,i]]) for i in ['bias']])
        print([(i, model[feature2index[6,i]]) for i in ['Male', 'Female']])

    # predict blind test if test_file name is assigned
    if test_file != '':
        test_data_X, _, _ = map_data(test_file, feature2index, mean, variance, norm_para=norm)
        labels = predict(test_data_X, model)
        positive = sum(1 for x in labels if x == '>50K')/len(labels)
        pred_file = test_file+'_pred'
        # Context managers close both files even if a write fails.
        with open(test_file) as fin, open(pred_file, 'w') as fout:
            for line, label in zip(fin, labels):
                # rstrip('\n') instead of line[:-1]: a final line without a
                # trailing newline must not lose its last character.
                fout.write(line.rstrip('\n')+', '+label+'\n')
        print('(+:{:.1%}), prediction written to {}'.format(positive, pred_file))
 
def exp():
    """Print a banner and run ``experiment`` with the module-level settings.

    Reads ``N_dash``, ``train_file``, ``dev_file``, ``it``, ``mean_`` and
    ``variance_`` from module scope; no test file is passed on.
    """
    banner = '-' * N_dash
    print("{}\nPerceptron and Averaged Perceptron".format(banner))
    experiment(
        train_file,
        dev_file,
        it=it,
        feat_detail=False,
        mean=mean_,
        variance=variance_,
    )

if __name__ == "__main__":

    # Input files (hard-coded; reading them from sys.argv[1] / sys.argv[2]
    # was an earlier, now disabled, option).
    train_file = "income.train.txt.5k"
    dev_file = "income.dev.txt"
    # Defined for completeness; exp() does not pass it to experiment().
    test_file = "./hw1-data/income.test.blind"

    # Width of the dashed banner printed by exp().
    N_dash = 40

    # Training / normalization settings read by exp().
    it = 5
    mean_ = True
    variance_ = False

    # Tuning knobs for numerical features and column combinations.
    # numerical: [] disables the raw numerical features; [0, 7] enables them.
    numerical = []
    # comb: columns whose pairwise combinations become extra features.
    # Other settings that were tried: [], [4,6], [0,5,6,7,8], [0,1,2,3,4,5,6,7,8]
    comb = [4,5,8]

    exp()


----------------------------------------
Perceptron and Averaged Perceptron
dimensionality:  1042
X1: [[ 1.     -0.0016 -0.0166 ...  0.      0.      0.    ]
 [ 1.     -0.0016 -0.0166 ...  0.      0.      0.    ]
 [ 1.     -0.0016 -0.0166 ...  0.      0.      0.    ]
 [ 1.     -0.0016 -0.0166 ...  0.      0.      0.    ]
 [ 1.     -0.0016 -0.0166 ...  0.      0.      0.    ]]
unavg, epoch 1 updates 1182 (23.6%) dev_err 21.2% (+:28.2%);    avg, epoch 1 dev_err 13.7% (+:19.1%)
unavg, epoch 2 updates 1121 (22.4%) dev_err 20.8% (+:27.2%);    avg, epoch 2 dev_err 14.9% (+:21.3%)
unavg, epoch 3 updates 1097 (21.9%) dev_err 23.1% (+:29.7%);    avg, epoch 3 dev_err 15.1% (+:21.5%)
unavg, epoch 4 updates 1095 (21.9%) dev_err 21.5% (+:25.1%);    avg, epoch 4 dev_err 15.5% (+:22.1%)
unavg, epoch 5 updates 1066 (21.3%) dev_err 20.7% (+:26.5%);    avg, epoch 5 dev_err 15.4% (+:22.0%)
training time 0.44104 s
train_err 16.46% (+:20.2%)
