In [4]:
import csv
import numpy as np
import pixiedust
import math
from collections import defaultdict
from collections import Counter

Pixiedust database opened successfully


Unable to check latest version <urlopen error [WinError 10054] An existing connection was forcibly closed by the remote host>


In [5]:
def getData(fileName):
    with open(fileName) as trainTxt:
        train = trainTxt.readlines()

        #Getting all data except the target column
        data = list(map(lambda s : s.strip().split(', '), train))
    return data

In [6]:
def getAllData(fileName):
    with open(fileName) as trainTxt:
        train = trainTxt.readlines()

    #Getting all data except the target column
    data = list(map(lambda s : s.strip().split(', '), train))
        
    sector = [line[1] for line in data]
    education = [line[2] for line in data]
    marital = [line[3] for line in data]
    occ = [line[4] for line in data]
    race = [line[5] for line in data]
    gender = [line[6] for line in data]
    country = [line[8] for line in data]
    
    age = [x/50 for x in list(map(int,[line[0] for line in data]))]
    hours = [x/50 for x in list(map(int,[line[7] for line in data]))]
    
    return list(zip(age, sector, education, marital, occ, race, gender, country, hours))

In [7]:
def getCategoricalFeaturesData(data):
    sector = [line[1] for line in data]
    education = [line[2] for line in data]
    marital = [line[3] for line in data]
    occ = [line[4] for line in data]
    race = [line[5] for line in data]
    gender = [line[6] for line in data]
    country = [line[8] for line in data]
    
    return list(zip(sector, education, marital, occ, race, gender, country))

In [8]:
def getNumericalFeaturesData(data):
    age = [x/50 for x in list(map(int,[line[0] for line in data]))]
    hours = [x/50 for x in list(map(int,[line[7] for line in data]))]
    
    return list(zip(age, hours))

In [9]:
def getTarget(data):
    return [line[-1] for line in data]

In [42]:
def getFeatureMapAndBinarizedData(trainData):
    featureMapping = {}
    encodedTrainData = []
    
    for row in trainData:
        newRow = []
        for j,x in enumerate(row):
            if isinstance(x,str) and j not in [0,8]:
                feature = (j,x)
                if feature not in featureMapping:
                    featureMapping[feature] = len(featureMapping)
                newRow.append(featureMapping[feature])
            elif j in [0,8]:
                newRow.append(x)
            else:
                print('unusual')
        encodedTrainData.append(newRow)
        
    return featureMapping, encodedTrainData

In [41]:
def getTranslatedOnFeatureMap(featureMapping, data):
    translatedData = []
    
    for row in data:
        newRow = []
        for j,x in enumerate(row):
            feature = (j,x)
            if j not in [0,8]:
                if feature in featureMapping.keys():
                    newRow.append(featureMapping[feature])
            elif j in [0,8]:
                newRow.append(x)
        translatedData.append(newRow)
    return translatedData

In [44]:
def getBinarizedData(unbinarizedData, numberOfFeatures):
    finalData = np.zeros((len(unbinarizedData), numberOfFeatures))
    for c, row in enumerate(unbinarizedData):
        for i,x in enumerate(row):
            if isinstance(x,int) and i not in [0,8]:
                finalData[c][x] = 1
            elif isinstance(x,float) and i in [0,8]:
                finalData[c][i] = x
    return finalData

In [13]:
def getLingAlgNorm(binarizedTrainData, binarizedDevData, k):
    eucs = defaultdict(int)
    for c,devRow in enumerate(binarizedDevData):
        dist = []
        for x,tRow in enumerate(binarizedTrainData):
            dist.append((x,np.linalg.norm(np.array(tRow)-np.array(devRow))))
        eucs[c] = sorted(dist, key = lambda s : s[1])[:k]
    return eucs

In [14]:
def getEucDist(trainRow, testRow):
    n = len(trainRow)
    return math.sqrt(sum([ (np.array(trainRow[i]) - np.array(testRow[i])) **2 for i in range(n)] ))

In [15]:
def getL2NormWithIndex(train, test):
    dist = []
    for row in test:
        dist.append([(i,d) for i,d in enumerate(np.linalg.norm(train - row, axis = 1))])
    return dist

In [16]:
def getL2NormWithoutIndex(train, test):
    dist = []
    for row in test:
        dist.append(np.linalg.norm(train - row, axis = 1))
    return dist

In [17]:
def getPredictions(topKElements):
    trainIncome = getTarget(getData('income.train.txt.5k'))
    predictions = []
    
    for i in range(len(topKElements)):
#         predictedLabelsIndex = [trainIncome[j] for dist,j in topKElements[i]]
        predictedLabelsIndex = [trainIncome[j] for j in topKElements[i]]
        predictedLabel = Counter(predictedLabelsIndex).most_common(1)[0][0]
        predictions.append(predictedLabel)
    return predictions

In [18]:
def getTopKElements(l,k):
    topK = []
    i = 0
    
    while len(topK) < k:
        topK.append((l[i],i))
        for j in range(i+1,len(l)):
            if topK[i][0] > l[j]:
                topK[i] = (l[j],j)
                l[i], l[j] = l[j], l[i]
        i+=1
    return topK

In [58]:
def getErrorRate(predictions, fileName):
    actuals = getTarget(getData(fileName))
    error = 0
    for i in range(len(predictions)):
        if actuals[i] != predictions[i]:
            error+=1
    return error/10

In [20]:
def getPositives(predictions):
    return Counter(predictions)['>50K']/10

In [60]:
def getErrorRatesAndPositivePercentage(distWithoutIndex, fileName):
    kList = [1,3,5,7,9,99,999]
    
    topKElements = defaultdict(list)
    predictions = defaultdict(list)
    error = defaultdict(list)
    positives = defaultdict(list)

    for k in kList:
    #     topKElements[k] = [getTopKElements(devDistWithoutIndex[d],k) for d in range(len(devDistWithoutIndex))]
        topKElements[k] = [np.argpartition(x,k, axis=0)[:k] for x in [devDistWithoutIndex[i] for i in range(len(devDistWithoutIndex))]]
#         print(topKElements[k][:3])
        predictions[k] = getPredictions(topKElements[k])
        error[k] = getErrorRate(predictions[k], fileName)
        positives[k] = getPositives(predictions[k])

    [print('k: {} error rate: {}% positive rate: {}%'.format(k,error[k], positives[k])) for k in kList]
    return error, positives

In [59]:
trainTarget = getTarget(getData('income.train.txt.5k'))
devTarget = getTarget(getData('income.dev.txt'))

featureMapping, unbinarizedTrainData = getFeatureMapAndBinarizedData(getAllData('income.train.txt.5k'))
unbinarizedDevData = getTranslatedOnFeatureMap(featureMapping, getAllData('income.dev.txt'))
unbinarizedTestData = getTranslatedOnFeatureMap(featureMapping, getAllData('income.test.blind'))

# print(unbinarizedTrainData[:5])
# print('--------------------------')
# print(unbinarizedDevData[285])
# print('--------------------------')
# print(featureMapping)
# print('--------------------------')
# print(devFeatureMapping)
# print([x for x in unbinarizedTrainData[1578]])
# print([x for x in unbinarizedDevData[0]])

binarizedTrainData = getBinarizedData(unbinarizedTrainData, 92)
binarizedDevData = getBinarizedData(unbinarizedDevData, 92)
binarizedTestData = getBinarizedData(unbinarizedTestData, 92)

# print(binarizedTrainData[:5])
# print('--------------------------')
# print(binarizedDevData[:5])
# print('--------------------------')
# print(binarizedDevData[285])

print('------For dev data---------')
devDistWithoutIndex = getL2NormWithoutIndex(binarizedTrainData, binarizedDevData)
devErrorRates, devPositives = getErrorRatesAndPositivePercentage(devDistWithoutIndex, 'income.dev.txt')

# print('------For train data---------')
# testDistWithoutIndex = getL2NormWithoutIndex(binarizedTrainData, binarizedTestData)
# testErrorRates, testPositives = getErrorRatesAndPositivePercentage(testDistWithoutIndex)

------For dev data---------
k: 1 error rate: 23.6 positive rate: 26.8%
k: 3 error rate: 20.1 positive rate: 24.9%
k: 5 error rate: 18.5 positive rate: 24.5%
k: 7 error rate: 17.2 positive rate: 23.6%
k: 9 error rate: 17.2 positive rate: 22.0%
k: 99 error rate: 16.5 positive rate: 16.3%
k: 999 error rate: 20.8 positive rate: 5.0%


In [35]:
# print([devDistWithoutIndex[x] for x in range(1)])
# print(devDistWithoutIndex[0][0])
# print(devDistWithoutIndex[0][1578])
# print(devDistWithoutIndex[0][4396])
# print([x for x in devDistWithoutIndex[0]])
# print("where there's 0: {}".format([i for i in range(1) if 0 in devDistWithoutIndex[i]]))
f = getDevErrorRates(devDistWithoutIndex)
print(f)

0.647765389628066
0.0
0.0
[array([215], dtype=int64), array([411], dtype=int64), array([22], dtype=int64)]
[array([1578, 4396, 4092], dtype=int64), array([2314, 1637, 3960], dtype=int64), array([4521, 4376, 4512], dtype=int64)]
k: 1 error rate: 32.1 positive rate: 22.1%
k: 3 error rate: 26.1 positive rate: 16.1%
None


In [233]:
actuals = getTarget(getData('income.dev.txt'))

error = 0
for i in range(len(devPredictions)):
    if actuals[i] != devPredictions[i]:
        error+=1
        
print('k={} error rate: {}% Positive %: {}% is '.format(1,error/10, Counter(devPredictions)['>50K']/10))

k=1 error rate: 0.5% Positive %: 0.0% is 


In [148]:
print('---------------------------{}'.format(devDistWithoutIndex[0]))
print('----------Top K -----------------{}'.format(getTopKElements(devDistWithoutIndex[2],3)))

---------------------------[0.64776539 0.17204651 0.18867962 ... 0.86162637 0.11661904 0.49030603]
----------Top K -----------------[(0.0, 0), (0.0, 1), (0.0, 2)]


In [149]:
# [i, j for i, j in devDist]
# [(i, dist) for i, dist in enumerate(devDist[:5])]

# print('---------------------------{}'.format(devDistWithIndex[0]))
print('-------------sorted top k--------------{}'.format(sorted(devDistWithIndex[2], key=lambda s:s[1])[:3]))
# print(getTopKElements([44,32,10,56,900,543,2,67], 3))
# [(i,getTopKElements(dist,1)) for i,dist in enumerate(devDist)]

-------------sorted top k--------------[(22, 0.0), (210, 0.0), (325, 0.0)]
