In [24]:
import csv
import numpy as np
import math
from collections import defaultdict
from collections import Counter
import time

In [25]:
def getData(fileName):
    with open(fileName) as trainTxt:
        train = trainTxt.readlines()

        #Getting all data except the target column
        data = list(map(lambda s : s.strip().split(', '), train))
    return data

In [26]:
def getAllData(fileName):
    with open(fileName) as trainTxt:
        train = trainTxt.readlines()

    #Getting all data except the target column
    data = list(map(lambda s : s.strip().split(', '), train))
        
    sector = [line[1] for line in data]
    education = [line[2] for line in data]
    marital = [line[3] for line in data]
    occ = [line[4] for line in data]
    race = [line[5] for line in data]
    gender = [line[6] for line in data]
    country = [line[8] for line in data]
    
    age = [x/100 for x in list(map(int,[line[0] for line in data]))]
    hours = [x/100 for x in list(map(int,[line[7] for line in data]))]
    
    return list(zip(age, sector, education, marital, occ, race, gender, country, hours))

In [27]:
def getCategoricalFeaturesData(data):
    sector = [line[1] for line in data]
    education = [line[2] for line in data]
    marital = [line[3] for line in data]
    occ = [line[4] for line in data]
    race = [line[5] for line in data]
    gender = [line[6] for line in data]
    country = [line[8] for line in data]
    
    return list(zip(sector, education, marital, occ, race, gender, country))

In [28]:
def getNumericalFeaturesData(data):
    age = [x/50 for x in list(map(int,[line[0] for line in data]))]
    hours = [x/50 for x in list(map(int,[line[7] for line in data]))]
    
    return list(zip(age, hours))

In [29]:
def getTarget(data):
    return [line[-1] for line in data]

In [30]:
def getFeatureMapAndBinarizedData(trainData):
    featureMapping = {}
    encodedTrainData = []
    
    for row in trainData:
        newRow = []
        for j,x in enumerate(row):
            if isinstance(x,str) and j not in [0,8]:
                feature = (j,x)
                if feature not in featureMapping:
                    featureMapping[feature] = len(featureMapping)
                newRow.append(featureMapping[feature])
            elif j in [0,8]:
                newRow.append(x)
            else:
                print('unusual')
        encodedTrainData.append(newRow)
        
    return featureMapping, encodedTrainData

In [None]:
def getRawFeatureMapAndBinarizedData(trainData):
    featureMapping = {}
    encodedTrainData = []
    
    for row in trainData:
        newRow = []
        for j,x in enumerate(row):
            feature = (j,x)
            if feature not in featureMapping:
                featureMapping[feature] = len(featureMapping)
            newRow.append(featureMapping[feature])
        encodedTrainData.append(newRow)
        
    return featureMapping, encodedTrainData

In [None]:
def getTranslatedOnFeatureMap(featureMapping, data):
    translatedData = []
    
    for row in data:
        newRow = []
        for j,x in enumerate(row):
            feature = (j,x)
            if j not in [0,8]:
                if feature in featureMapping.keys():
                    newRow.append(featureMapping[feature])
            elif j in [0,8]:
                newRow.append(x)
        translatedData.append(newRow)
    return translatedData

In [None]:
def getBinarizedData(unbinarizedData, numberOfFeatures):
    finalData = np.zeros((len(unbinarizedData), numberOfFeatures))
    for c, row in enumerate(unbinarizedData):
        for i,x in enumerate(row):
            if isinstance(x,int) and i not in [0,8]:
                finalData[c][x] = 1
            elif isinstance(x,float) and i in [0,8]:
                finalData[c][i] = x
    return finalData

In [None]:
def getLingAlgNorm(binarizedTrainData, binarizedDevData, k):
    eucs = defaultdict(int)
    for c,devRow in enumerate(binarizedDevData):
        dist = []
        for x,tRow in enumerate(binarizedTrainData):
            dist.append((x,np.linalg.norm(np.array(tRow)-np.array(devRow))))
        eucs[c] = sorted(dist, key = lambda s : s[1])[:k]
    return eucs

In [None]:
def getEucDist(trainRow, testRow):
    n = len(trainRow)
    return math.sqrt(sum([ (np.array(trainRow[i]) - np.array(testRow[i])) **2 for i in range(n)] ))

In [None]:
def getNormWithoutIndex(train, test, ord=None):
    dist = []
    for row in test:
        dist.append(np.linalg.norm(train - row, ord, axis = 1))
    return dist

In [None]:
def getPredictions(topKElements, fileName):
    actualIncome = getTarget(getData(fileName))
    predictions = []
    
    for i in range(len(topKElements)):
        predictedLabelsIndex = [actualIncome[j] for j in topKElements[i]]
        predictedLabel = Counter(predictedLabelsIndex).most_common(1)[0][0]
        predictions.append(predictedLabel)
    return predictions

In [None]:
def getTopKElements(l,k):
    topK = []
    i = 0
    
    while len(topK) < k:
        topK.append((l[i],i))
        for j in range(i+1,len(l)):
            if topK[i][0] > l[j]:
                topK[i] = (l[j],j)
                l[i], l[j] = l[j], l[i]
        i+=1
    return topK

In [None]:
def getErrorRate(predictions, actualTargetFileName):
    actuals = getTarget(getData(actualTargetFileName))
    error = 0
    for i in range(len(predictions)):
        if actuals[i] != predictions[i]:
            error+=1
    return error/10

In [None]:
def getPositives(predictions):
    return Counter(predictions)['>50K']/10

In [None]:
def getErrorRatesAndPositivePercentage(distWithoutIndex, baselineFileName):
    kList = [1,3,5,7,9,31,32,33,34,35,40,99,999,9999]
    
    topKElements = defaultdict(list)
    predictions = defaultdict(list)

    for k in kList:
        if len(distWithoutIndex) < k:
            k = len(distWithoutIndex)
            topKElements[k] = [np.argpartition(x,k-1, axis=0)[:k-1] for x in [distWithoutIndex[i] for i in range(len(distWithoutIndex))]]
        else:
            topKElements[k] = [np.argpartition(x,k, axis=0)[:k] for x in [distWithoutIndex[i] for i in range(len(distWithoutIndex))]]
        predictions[k] = getPredictions(topKElements[k], baselineFileName)

    return predictions

In [None]:
featureMapping, unbinarizedTrainData = getFeatureMapAndBinarizedData(getAllData('income.train.txt.5k'))
unbinarizedTrainData = getTranslatedOnFeatureMap(featureMapping, getAllData('income.train.txt.5k'))
unbinarizedDevData = getTranslatedOnFeatureMap(featureMapping, getAllData('income.dev.txt'))
unbinarizedTestData = getTranslatedOnFeatureMap(featureMapping, getAllData('income.test.blind'))

binarizedTrainData = getBinarizedData(unbinarizedTrainData, 92)
binarizedDevData = getBinarizedData(unbinarizedDevData, 92)
binarizedTestData = getBinarizedData(unbinarizedTestData, 92)

print('------For dev data---------')
devDistWithoutIndex = getNormWithoutIndex(binarizedTrainData, binarizedDevData)
devPredictions = getErrorRatesAndPositivePercentage(devDistWithoutIndex, 'income.train.txt.5k')

for k,preds in devPredictions.items():
    print('k: {} error rate: {} positive: {}'.format(k,getErrorRate(preds, 'income.dev.txt'), getPositives(preds)))

print('------For train data---------')
trainDistWithoutIndex = getNormWithoutIndex(binarizedTrainData, binarizedTrainData)
trainPredictions = getErrorRatesAndPositivePercentage(trainDistWithoutIndex, 'income.train.txt.5k')
for k,preds in trainPredictions.items():
    print('k: {} error rate: {} positive: {}'.format(k,getErrorRate(preds, 'income.train.txt.5k'), getPositives(preds)))

print('------For test data---------')
testDistWithoutIndex = getNormWithoutIndex(binarizedTrainData, binarizedTestData)
testPredictions = getErrorRatesAndPositivePercentage(testDistWithoutIndex, 'income.train.txt.5k')

------For dev data---------
k: 1 error rate: 23.9 positive: 26.7
k: 3 error rate: 20.0 positive: 24.6
k: 5 error rate: 19.0 positive: 23.8
k: 7 error rate: 17.4 positive: 24.0
k: 9 error rate: 17.0 positive: 21.6
k: 31 error rate: 16.7 positive: 18.9
k: 32 error rate: 16.1 positive: 18.9
k: 33 error rate: 16.4 positive: 18.4
k: 34 error rate: 15.8 positive: 18.6
k: 35 error rate: 15.9 positive: 19.1
k: 40 error rate: 16.0 positive: 18.8
k: 99 error rate: 16.7 positive: 16.7
k: 999 error rate: 20.4 positive: 5.8
k: 1000 error rate: 20.4 positive: 5.8
------For train data---------


In [None]:
print('************************Using Manhattan distance**********************')
print('------For dev data---------')
devMDist = getNormWithoutIndex(binarizedTrainData, binarizedDevData, ord=1)
devMPredictions = getErrorRatesAndPositivePercentage(devMDist, 'income.train.txt.5k')

for k,preds in devMPredictions.items():
    print('k: {} error rate: {} positive: {}'.format(k,getErrorRate(preds, 'income.dev.txt'), getPositives(preds)))
    
print('------For train data---------')
trainDistWithoutIndex = getNormWithoutIndex(binarizedTrainData, binarizedTrainData)
trainPredictions = getErrorRatesAndPositivePercentage(trainDistWithoutIndex, 'income.train.txt.5k')
for k,preds in trainPredictions.items():
    print('k: {} error rate: {} positive: {}'.format(k,getErrorRate(preds, 'income.train.txt.5k'), getPositives(preds)))

In [None]:
featureMapping2, unbinarizedTrainData2 = getRawFeatureMapAndBinarizedData(getAllData('income.train.txt.5k'))
unbinarizedTrainData2 = getTranslatedOnFeatureMap(featureMapping2, getAllData('income.train.txt.5k'))
unbinarizedDevData2 = getTranslatedOnFeatureMap(featureMapping2, getAllData('income.dev.txt'))
unbinarizedTestData2 = getTranslatedOnFeatureMap(featureMapping2, getAllData('income.test.blind'))

binarizedTrainData2 = getBinarizedData(unbinarizedTrainData2, 230)
binarizedDevData2 = getBinarizedData(unbinarizedDevData2, 230)
binarizedTestData2 = getBinarizedData(unbinarizedTestData2, 230)

print('------For dev data---------')
devDistWithoutIndex2 = getNormWithoutIndex(binarizedTrainData2, binarizedDevData2)
devPredictions2 = getErrorRatesAndPositivePercentage(devDistWithoutIndex2, 'income.train.txt.5k')

for k,preds in devPredictions2.items():
    print('k: {} error rate: {} positive: {}'.format(k,getErrorRate(preds, 'income.dev.txt'), getPositives(preds)))

print('------For train data---------')
trainDistWithoutIndex2 = getNormWithoutIndex(binarizedTrainData2, binarizedTrainData2)
trainPredictions2 = getErrorRatesAndPositivePercentage(trainDistWithoutIndex2, 'income.train.txt.5k')
for k,preds in trainPredictions2.items():
    print('k: {} error rate: {} positive: {}'.format(k,getErrorRate(preds, 'income.train.txt.5k'), getPositives(preds)))