In [166]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter

def remapOnFeatureMap(data, fm=None):
    """Encode raw rows as fixed-width vectors for the k-NN experiment.

    Columns 0 and 7 of every input row are treated as numeric: the value is
    parsed as a float, divided by 50 and written to output column 0 or 7
    respectively.  Every other column is categorical: when ``(column, value)``
    is a key of the feature map, the output column named by the mapped index
    is set to 1.  Categorical values missing from the map are skipped and
    leave the rest of the row untouched.

    The encoding is done in a single pass.  The previous two-pass version
    dropped unseen categorical values from an intermediate list, which shifted
    every later position and made the numeric "hours" value be used as an
    array index.

    Args:
        data: sequence of rows, each a sequence of strings.
        fm: mapping ``(column, value) -> output column``.  Defaults to the
            module-level ``featureMap``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data), len(fm))``.
    """
    if fm is None:
        fm = featureMap
    numericColumns = (0, 7)

    binRemapped = np.zeros((len(data), len(fm)))
    for i, row in enumerate(data):
        for j, d in enumerate(row):
            if j in numericColumns:
                # Numeric value keeps its own column index.
                binRemapped[i][j] = float(d) / 50
            elif (j, d) in fm:
                binRemapped[i][fm[(j, d)]] = 1

    return binRemapped

def dist(trainData, testData, ord):
    """Rank the training rows by distance to each test row.

    For every row of ``testData`` the ``ord``-norm of ``trainData - row`` is
    taken along axis 1 and the training indices are returned sorted from the
    nearest to the farthest.

    Returns:
        list with one ``numpy.argsort`` index array per test row.
    """
    return [
        np.argsort(np.linalg.norm(trainData - query, ord, axis=1))
        for query in testData
    ]

def knn(distances, target, k):
    """Predict a label for each ranking by majority vote of its first k entries.

    Args:
        distances: iterable of index sequences, nearest neighbour first.
        target: labels indexed by the values found in ``distances``.
        k: number of leading neighbours that vote.

    Returns:
        list with the most common label among the first ``k`` neighbours of
        each ranking.
    """
    preds = []
    for ranking in distances:
        votes = Counter(target[idx] for idx in ranking[:k])
        winner, _ = votes.most_common(1)[0]
        preds.append(winner)
    return preds
    
def err(preds, target):
    """Return the percentage of predictions that differ from ``target``."""
    mistakes = 0
    for idx in range(len(preds)):
        if preds[idx] != target[idx]:
            mistakes += 1
    return mistakes / len(preds) * 100

def pos(preds):
    """Return the percentage of predictions that are equal to 1."""
    hits = sum(label == 1 for label in preds)
    return (hits / len(preds)) * 100

if __name__ == '__main__':
    def _loadIncome(path):
        """Read one income file and return ``(rows, labels)``.

        Every line is stripped and split on commas.  The last field is the
        label (1 when it equals '>50K' after stripping, 0 otherwise) and the
        remaining fields form the feature row.  The file is read once and
        closed on exit.
        """
        with open(path) as handle:
            fields = [line.strip().split(',') for line in handle]
        rows = [f[:-1] for f in fields]
        labels = [1 if f[-1].strip() == '>50K' else 0 for f in fields]
        return rows, labels

    train, trainY = _loadIncome('income.train.txt.5k')
    dev, devY = _loadIncome('income.dev.txt')
#     test = [s.strip().split(',')[:-1] for s in open('income.test.blind').readlines()]

    # Categorical (column, value) pairs receive consecutive indices.  Numeric
    # columns 0 and 7 only insert a placeholder key (j, 0): it grows
    # len(featureMap), so when the first row has at least eight columns no
    # categorical feature is ever given index 0 or 7 -- the two output columns
    # remapOnFeatureMap fills with the scaled numeric values.
    featureMap = defaultdict(int)
    for val in train:
        for j, d in enumerate(val):
            feature = (j,d)
            if j not in [0,7]:
                if feature not in featureMap:
                    featureMap[feature] = len(featureMap)
            else:
                featureMap[(j,0)] = 1

    print('number of dimensions: {}'.format(len(featureMap)))
    binTrain = remapOnFeatureMap(train)
    binDev = remapOnFeatureMap(dev)
#     binTest = remapOnFeatureMap(test)

    # Neighbour rankings are computed once (Manhattan distance, ord=1) and
    # reused for every k.
    devDist = dist(binTrain, binDev,1)
    trainDist = dist(binTrain, binTrain,1)

    # knn slices each ranking with [:k], so a k larger than the number of
    # training rows simply lets every training row vote.
    kVals = [1,3,5,7,9,99,999,1999,2999,3999,4999,9999]
    for k in kVals:
        devPreds = knn(devDist, trainY, k)
        trainPreds = knn(trainDist, trainY, k)
        print('k: {} train_err: {}% (+: {}) dev_err: {}% (+: {})'.format(k,err(trainPreds, trainY), pos(trainPreds), err(devPreds, devY), pos(devPreds)))

number of dimensions: 92
k: 10 train_err: 12.64% (+: 23.22) dev_err: 16.400000000000002% (+: 21.6)


In [211]:
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

def remapOnFeatureMap(data):
    """One-hot encode rows against the module-level ``featureMap``.

    For every ``(column, value)`` pair of a row that is a key of
    ``featureMap`` the mapped output column is set to 1; pairs that are not
    in the map are ignored.  The last output column is always set to 1 and
    serves as the bias term.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data), len(featureMap))``.
    """
    encoded = np.zeros((len(data), len(featureMap)))
    for rowIdx, record in enumerate(data):
        for colIdx, value in enumerate(record):
            key = (colIdx, value)
            if key in featureMap:
                encoded[rowIdx, featureMap[key]] = 1
        # Constant 1 in the final column for the bias weight.
        encoded[rowIdx, -1] = 1
    return encoded

def perceptronTrain(epochs, data, target):
    """Train a vanilla perceptron and record statistics for every epoch.

    The weight vector has ``len(featureMap)`` entries (module-level map) and
    starts at zero.  An example triggers an update whenever
    ``target[i] * dot(w, row) <= 0``.

    Args:
        epochs: number of passes over ``data``.
        data: iterable of feature rows.
        target: labels aligned with ``data``; the update adds
            ``target[i] * row`` to the weights.

    Returns:
        ``defaultdict(list)`` mapping the 0-based epoch index to
        ``[updates, weights]``, where ``updates`` is the number of updates
        made in that epoch and ``weights`` is a snapshot of the weight vector
        at the end of it.
    """
    epochData = defaultdict(list)
    w = np.zeros((len(featureMap)))

    for iteration in range(epochs):
        u = 0
        for i, row in enumerate(data):
            row = np.asarray(row)
            a = np.dot(w, row)
            if target[i]*a <= 0:
                u += 1
                w += target[i]*row
        epochData[iteration].append(u)
        # Store a copy: ``w`` is updated in place, so appending it directly
        # made every epoch's entry alias the final weights.
        epochData[iteration].append(w.copy())
    return epochData

def perceptronPredict(w,data):
    """Return ``sign(dot(w, row))`` for every row of ``data`` as a list.

    A row lying exactly on the decision boundary yields 0.
    """
    return [np.sign(np.dot(w, example)) for example in data]

def avgPerceptronTrain(epochs, data, target):
    """Train an averaged perceptron over the module-level ``featureMap`` space.

    Two vectors of ``len(featureMap)`` zeros are maintained: the running
    weights and a counter-weighted accumulator of the same updates.  After
    each epoch ``counter * weights - accumulator`` is recorded, where
    ``counter`` is the number of examples processed so far.

    Returns:
        ``defaultdict(list)`` mapping the 0-based epoch index to a
        one-element list holding that epoch's averaged (unnormalised) weights.
    """
    dims = len(featureMap)
    weights = np.zeros((dims))
    accumulator = np.zeros((dims))
    seen = 0
    history = defaultdict(list)

    for epoch in range(epochs):
        for idx, example in enumerate(data):
            activation = np.sum(np.dot(weights, example))
            mistaken = target[idx] * activation <= 0
            if mistaken:
                weights += target[idx] * example
                accumulator += seen * target[idx] * example
            # The example counter advances whether or not an update happened.
            seen += 1
        history[epoch].append(seen * weights - accumulator)
    return history

def err(preds, target):
    """Return the error rate in percent.

    A prediction counts as an error when it is 0 (no decision) or when it
    differs from the corresponding target label.
    """
    wrong = []
    for idx in range(len(preds)):
        guess = preds[idx]
        if guess == 0 or guess != target[idx]:
            wrong.append(1)
        else:
            wrong.append(0)
    return np.sum(wrong) / len(preds) * 100

def pos(preds):
    """Return the percentage of predictions that are equal to 1."""
    indicator = [1 if label == 1 else 0 for label in preds]
    return np.sum(indicator) / len(preds) * 100

def reorderedData():
    """Repeat the perceptron experiments on class-sorted data and print results.

    The training and dev files are re-read and reordered so that every row
    whose last field is ' >50K' (label 1) precedes every row whose last field
    is ' <=50K' (label -1); rows with any other last field are dropped.  The
    reordered rows are encoded with ``remapOnFeatureMap`` and, for 1 to 9
    epochs, a vanilla and then an averaged perceptron are trained from scratch
    and evaluated on the reordered dev set.

    Returns nothing; all results are printed.
    """
    print('-------------------------------Reordering training data----------------------------------------------')

    def loadPositivesFirst(path):
        """Return ``(rows, labels)`` for ``path`` with the ' >50K' rows first."""
        # Read the file once and close it, instead of re-opening it for each
        # of the four comprehensions.
        with open(path) as handle:
            fields = [line.strip().split(',') for line in handle]
        positives = [f[:-1] for f in fields if f[-1] == ' >50K']
        negatives = [f[:-1] for f in fields if f[-1] == ' <=50K']
        labels = [1] * len(positives) + [-1] * len(negatives)
        return positives + negatives, labels

    reorderedTrain, reorderedTrainY = loadPositivesFirst('income.train.txt.5k')
    reorderedDev, reorderedDevY = loadPositivesFirst('income.dev.txt')

    reorderedBinTrain = remapOnFeatureMap(reorderedTrain)
    reorderedBinDev = remapOnFeatureMap(reorderedDev)

    for epoch in range(1,10):
        epochData = perceptronTrain(epoch, reorderedBinTrain, reorderedTrainY)
        updates, wTrain = epochData[epoch-1][0], epochData[epoch-1][1]
        devPreds = perceptronPredict(wTrain,reorderedBinDev)
        print('epoch: {} updates: {} ({:.2f}%) dev_err: {:.2f}% (+: {:.2f}%)'.format(epoch, updates, (updates/len(reorderedBinTrain))*100, err(devPreds, reorderedDevY), pos(devPreds)))

    print('-----------------------------Average Perceptron--------------------------------------------------')
    for epoch in range(1,10):
        epochData =  avgPerceptronTrain(epoch, reorderedBinTrain, reorderedTrainY)
        avgWTrain = epochData[epoch-1][0]
        # Predict on the reordered dev matrix so predictions line up with
        # reorderedDevY (the global binDev is in the original file order).
        avgDevPreds = perceptronPredict(avgWTrain, reorderedBinDev)
        print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, err(avgDevPreds, reorderedDevY), pos(avgDevPreds)))

def featureEngineering1():
    """Build a feature map from the module-level ``train`` rows.

    Every distinct ``(column, value)`` pair outside columns 0 and 7 is given
    the next free index, in order of first appearance.  A final key ``(9, 1)``
    is then added with the next index.

    NOTE(review): indices start at 0 and are dense, so indices 0 and 7 can be
    handed to categorical features, while ``remapOnFeatureMap2`` writes the
    numeric values into output columns 0 and 7 -- confirm this overlap is
    intended before using the two together.

    Returns:
        ``defaultdict(int)`` mapping ``(column, value)`` to an index.
    """
    numericColumns = (0, 7)
    featureMap2 = defaultdict(int)
    for record in train:
        for column, value in enumerate(record):
            if column in numericColumns:
                continue
            key = (column, value)
            if key not in featureMap2:
                featureMap2[key] = len(featureMap2)
    # Extra trailing entry appended after all categorical features.
    featureMap2[(9,1)] = len(featureMap2)

    return featureMap2
    
def remapOnFeatureMap2(data, featureMap2, zeroMean, unitVariance):
    """Encode rows with one-hot categorical features and scaled numeric ones.

    Columns 0 and 7 of each input row are parsed as floats and written to
    output columns 0 and 7.  How they are scaled depends on the flags:

    * neither flag set: the value is divided by 50;
    * ``zeroMean``: the column mean is subtracted;
    * ``unitVariance``: the value is divided by the column standard deviation
      (applied after centering when both flags are set).

    Mean and standard deviation are computed from ``data`` itself.  A zero
    standard deviation is replaced by 1 so a constant column does not divide
    by zero.  Every other column is categorical: when ``(column, value)`` is a
    key of ``featureMap2`` the mapped output column is set to 1, otherwise the
    value is skipped.  The last output column is always set to 1 (bias).

    The encoding is done in a single pass.  The previous two-pass version
    shifted positions whenever a categorical value was missing from the map
    or when only ``unitVariance`` was set, which made a numeric value be used
    as an array index.

    Args:
        data: sequence of rows, each a sequence of strings with at least
            eight columns.
        featureMap2: mapping ``(column, value) -> output column``.
        zeroMean: centre the numeric columns when true.
        unitVariance: scale the numeric columns by their standard deviation
            when true.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data), len(featureMap2))``.
    """
    numericColumns = (0, 7)
    binRemapped2 = np.zeros((len(data), len(featureMap2)), dtype=float)
    if len(data) == 0:
        return binRemapped2

    # Per-column statistics of the numeric features.
    center = {}
    scale = {}
    for col in numericColumns:
        values = [float(row[col]) for row in data]
        center[col] = np.mean(values)
        sd = np.std(values)
        scale[col] = sd if sd > 0 else 1.0

    for i, row in enumerate(data):
        for j, d in enumerate(row):
            if j in numericColumns:
                value = float(d)
                if not zeroMean and not unitVariance:
                    value = value / 50
                else:
                    if zeroMean:
                        value = value - center[j]
                    if unitVariance:
                        value = value / scale[j]
                binRemapped2[i][j] = value
            elif (j, d) in featureMap2:
                binRemapped2[i][featureMap2[(j, d)]] = 1
        # Constant 1 in the final column for the bias weight.
        binRemapped2[i][-1] = 1
    return binRemapped2
    
def perceptronTrainWithFeatureMap(epochs, data, target, fm):
    """Train a vanilla perceptron whose dimension is given by ``fm``.

    The weight vector has ``len(fm)`` entries and starts at zero.  An example
    triggers an update whenever ``target[i] * dot(w, row) <= 0``.

    Args:
        epochs: number of passes over ``data``.
        data: iterable of feature rows (converted with ``np.asarray``).
        target: labels aligned with ``data``; the update adds
            ``target[i] * row`` to the weights.
        fm: feature map; only its length is used.

    Returns:
        ``defaultdict(list)`` mapping the 0-based epoch index to
        ``[updates, weights]``, where ``updates`` is the number of updates
        made in that epoch and ``weights`` is a snapshot of the weight vector
        at the end of it.
    """
    epochData = defaultdict(list)
    w = np.zeros((len(fm)))

    for iteration in range(epochs):
        u = 0
        for i, row in enumerate(data):
            row = np.asarray(row)
            a = np.dot(w, row)
            if target[i]*a <= 0:
                u += 1
                w += target[i]*row
        epochData[iteration].append(u)
        # Store a copy: ``w`` is updated in place, so appending it directly
        # made every epoch's entry alias the final weights.
        epochData[iteration].append(w.copy())

    return epochData

def avgPerceptronTrainWithFeatureMap(epochs, data, target, fm):
    """Train an averaged perceptron whose dimension is given by ``fm``.

    Two vectors of ``len(fm)`` zeros are maintained: the running weights and
    a counter-weighted accumulator of the same updates.  After each epoch
    ``counter * weights - accumulator`` is recorded, where ``counter`` is the
    number of examples processed so far.

    Returns:
        ``defaultdict(list)`` mapping the 0-based epoch index to a
        one-element list holding that epoch's averaged (unnormalised) weights.
    """
    dims = len(fm)
    weights = np.zeros((dims))
    accumulator = np.zeros((dims))
    seen = 0
    history = defaultdict(list)

    for epoch in range(epochs):
        for idx, example in enumerate(data):
            activation = np.sum(np.dot(weights, example))
            mistaken = target[idx] * activation <= 0
            if mistaken:
                weights += target[idx] * example
                accumulator += seen * target[idx] * example
            # The example counter advances whether or not an update happened.
            seen += 1
        history[epoch].append(seen * weights - accumulator)
    return history
    
if __name__ == '__main__':
    sns.set()

    def _readFields(path):
        """Return every line of ``path`` stripped and split on commas.

        The file is read once and closed on exit.
        """
        with open(path) as handle:
            return [line.strip().split(',') for line in handle]

    trainFields = _readFields('income.train.txt.5k')
    devFields = _readFields('income.dev.txt')

    # Feature rows are all fields except the last one.
    train = [fields[:-1] for fields in trainFields]
    dev = [fields[:-1] for fields in devFields]
    # NOTE(review): the last field of the blind test file is dropped as well;
    # presumably it is a label placeholder -- confirm against the file.
    test = [fields[:-1] for fields in _readFields('income.test.blind')]

    # Labels: +1 when the last field is '>50K' after stripping, -1 otherwise.
    trainY = [1 if fields[-1].strip()=='>50K' else -1 for fields in trainFields]
    devY = [1 if fields[-1].strip()=='>50K' else -1 for fields in devFields]

    # featureMap: (column, value) -> index, in order of first appearance.
    # featureRemap is the inverse lookup, index -> (column, value).
    featureMap = defaultdict(int)
    featureRemap = {}
    for val in train:
        for j, d in enumerate(val):
            feature = (j,d)
            if feature not in featureMap :
                featureMap[feature] = len(featureMap)
                featureRemap[len(featureMap)-1] = feature
    #Adding last dimension for bias (value = 1) to the feature map
    featureMap[(9,0)] = len(featureMap)
    featureRemap[len(featureMap)-1] = (9,0)

#     print('number of dimensions: {}'.format(len(featureMap)))
    binTrain = remapOnFeatureMap(train)
    binDev = remapOnFeatureMap(dev)

#     print(featureMap)
#     print('---------------------------------')
#     print(featureRemap)

    epochs = range(1,10)
    
#     basicVanErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData = perceptronTrain(epoch, binTrain, trainY)
#         updates, wTrain = epochData[epoch-1][0], epochData[epoch-1][1]
#         devPreds = perceptronPredict(wTrain,binDev)
#         basicVanErrRates[epoch] = err(devPreds, devY)
#         print('epoch: {} updates: {} ({:.2f}%) dev_err: {:.2f}% (+: {:.2f}%)'.format(epoch, updates, (updates/len(binTrain))*100, basicVanErrRates[epoch], pos(devPreds)))
    
#     plt.plot(epochs, basicVanErrRates.values())
        
#     print('-----------------------------Average Perceptron--------------------------------------------------')
#     basicAvgErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData =  avgPerceptronTrain(epoch, binTrain, trainY)
#         avgWTrain = epochData[epoch-1][0]
#         if epoch == 4:
#             w = epochData[epoch-1][0]
#         avgDevPreds = perceptronPredict(avgWTrain, binDev)
#         basicAvgErrRates[epoch] = err(avgDevPreds, devY)
#         print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, basicAvgErrRates[epoch], pos(avgDevPreds)))
    
#     plt.plot(epochs, basicAvgErrRates.values())
#     plt.legend(['Vanilla Perceptron', 'Average Perceptron'])
#     plt.show()
        
    
#     print('************************ Experimentations ************************')
#     reorderedData()
    
#     print('************************Using original numerical features************************')
#     fm2 = featureEngineering1()
#     binNumFeaturesTrainRemapped = remapOnFeatureMap2(train, fm2, False, False)
#     binNumFeaturesDevRemapped = remapOnFeatureMap2(dev, fm2, False, False)

#     numFeaturesVanErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData = perceptronTrainWithFeatureMap(epoch, binNumFeaturesTrainRemapped, trainY, fm2)
#         updates, wTrain = epochData[epoch-1][0], epochData[epoch-1][1]
#         devPreds = perceptronPredict(wTrain,binNumFeaturesDevRemapped)
#         numFeaturesVanErrRates[epoch] = err(devPreds, devY)
#         print('epoch: {} updates: {} ({:.2f}%) dev_err: {:.2f}% (+: {:.2f}%)'.format(epoch, updates, (updates/len(binNumFeaturesTrainRemapped))*100, numFeaturesVanErrRates[epoch], pos(devPreds)))
#     plt.plot(epochs, numFeaturesVanErrRates.values())
        
#     print('-----------------------------Average Perceptron--------------------------------------------------')
#     numFeaturesAvgErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData =  avgPerceptronTrainWithFeatureMap(epoch, binNumFeaturesTrainRemapped, trainY, fm2)
#         avgWTrain = epochData[epoch-1][0]
#         if epoch == 4:
#             w = epochData[epoch-1][0]
#         avgDevPreds = perceptronPredict(avgWTrain, binNumFeaturesDevRemapped)
#         numFeaturesAvgErrRates[epoch] = err(avgDevPreds, devY)
#         print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, numFeaturesAvgErrRates[epoch], pos(avgDevPreds)))
#     plt.plot(epochs, numFeaturesAvgErrRates.values())
#     plt.legend(['Vanilla Perceptron with numerical features', 'Average Perceptron with numerical features'])
#     plt.show()
    
#     print('----------------------------Most positive and negative features for average perceptron--------------------------------')
#     print('Using weight vector from epoch number 4 when dev_err is minimum')
#     print('len of w: {}'.format(len(w)))
#     sortedWeights = [(x,w[x]) for x in np.argsort(w)]
#     print('weights: {}'.format(sortedWeights))
#     print('remap: {}'.format(featureRemap))
#     print('At epoch={}, five most negative weight: {}'.format(4, [featureRemap[x] for (x,weight) in sortedWeights[:6]]))
#     print('At epoch={}, five most positive weight: {}'.format(4, [featureRemap[x] for (x,weight) in sortedWeights[-6:]]))
#     print('----------------------------Feature Weight for Bias Dimension --------------------------------')
#     print([w[x] for (x,w[x]) in sortedWeights if x==230])
    

#     print('************************Centering data to be zero mean************************')
#     print('-----------------------------Average Perceptron--------------------------------------------------')
#     zeroMeanErrRates = defaultdict(float)
#     for epoch in epochs:
#         epochData =  avgPerceptronTrainWithFeatureMap(epoch, remapOnFeatureMap2(train, featureMap, True, False), trainY, featureMap)
#         avgWTrain = epochData[epoch-1][0]
#         if epoch == 4:
#             w = epochData[epoch-1][0]
#         avgDevPreds = perceptronPredict(avgWTrain, remapOnFeatureMap2(dev, featureMap, True, False))
#         zeroMeanErrRates[epoch] = err(avgDevPreds, devY)
#         print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, zeroMeanErrRates[epoch], pos(avgDevPreds)))
#     plt.plot(epochs, zeroMeanErrRates.values())

    print('************************Centering data to be zero mean and unit variance************************')
#     print('-----------------------------Average Perceptron--------------------------------------------------')
    zeroMeanUnitVarErrRates = defaultdict(float)
    for epoch in epochs:
        epochData =  avgPerceptronTrainWithFeatureMap(epoch, remapOnFeatureMap2(train, featureMap, True, True), trainY, featureMap)
        avgWTrain = epochData[epoch-1][0]
        if epoch == 5:
            w = epochData[epoch-1][0]
        avgDevPreds = perceptronPredict(avgWTrain, remapOnFeatureMap2(dev, featureMap, True, True))
        zeroMeanUnitVarErrRates[epoch] = err(avgDevPreds, devY)
        print('epoch: {} dev_err: {:.2f}% (+ {:.2f}%)'.format(epoch, zeroMeanUnitVarErrRates[epoch], pos(avgDevPreds)))
        
#     plt.plot(epochs, zeroMeanUnitVarErrRates.values())
#     plt.legend(['Average Perceptron with zero mean', 'Average Perceptron with zero mean and unit variance'])
#     plt.show()

#     print('-----------------------------Ground truth positive percentage--------------------------------------------------')
#     print('Ground truth (+ {:.2f}%)'.format(pos(devY)))

#     print('-----------------------------Test Predictions--------------------------------------------------')
    avgTestPreds = perceptronPredict(w, remapOnFeatureMap2(test, featureMap, True, True))
    print('Predicted positive % on test is {:.2f}%'.format(pos(avgTestPreds)))
    
#     print('-----------------------------Writing Test Predictions to \'income.test.predicted\' file--------------------------------------------------')
#     final = [i + ['>50K'] if [j]==1 else i+['<=50K'] for i,j in zip(test,avgTestPreds) ]
#     l = list(map(lambda d : ','.join(d),final))
#     with open('income.test.predicted', 'w') as output:
#         for x in l:
#             output.write(str(x))
#             output.write('\n')

************************Centering data to be zero mean and unit variance************************
epoch: 1 dev_err: 15.60% (+ 19.80%)
epoch: 2 dev_err: 15.70% (+ 19.90%)
epoch: 3 dev_err: 15.90% (+ 19.70%)
epoch: 4 dev_err: 15.70% (+ 19.30%)
epoch: 5 dev_err: 15.30% (+ 19.50%)
epoch: 6 dev_err: 15.40% (+ 19.20%)
epoch: 7 dev_err: 15.40% (+ 19.40%)
epoch: 8 dev_err: 15.40% (+ 19.20%)
epoch: 9 dev_err: 15.30% (+ 19.50%)
Predicted positive % on test is 14.70%
