Finding basic level categoies
================

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.colors as pltColors
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cosine
import csv
from collections import Counter, defaultdict
from sklearn.preprocessing import normalize
from numpy.linalg import norm
from numpy.linalg import pinv
from scipy.stats import linregress

In [2]:
# Load files
inputFile = 'data/cat_desc.csv'
headers = None
descCnt = Counter()
catCnt = Counter()
descCatCnt = defaultdict(Counter)
with open(inputFile) as fin:
    r = csv.reader(fin)
    for row in r:
        if headers is None:
            headers = row
            print headers
        else:
            cat, desc, cnt = row
            cat = int(cat) - 1
            cnt = int(cnt)
            descCnt[desc] += cnt
            catCnt[cat] += cnt
            descCatCnt[desc][cat] += cnt

catNameFile = 'data/categoryKey.csv'
catName = {}
with open(catNameFile) as fin:
    r = csv.reader(fin)
    for row in r:
        catName[int(row[0]) - 1] = row[1]

print range(len(catName)) == catName.keys() and range(len(catName)) == catCnt.keys()

['category_id', 'description', 'count']
True


In [3]:
# filter descriptions
threshold = 100
descList = [d for d, c in descCnt.most_common() if c >= threshold]
descId = {i:d for i, d in enumerate(descList)}
print len(descList), len(descCnt)
print sum(descCnt[d] for d in descList), sum(descCnt.itervalues())

1922 84922
4801096 5115137


In [4]:
# Build sparse matrix
rowIdx = []
colIdx = []
dataLst = []
for i, desc in enumerate(descList):
    for j in sorted(descCatCnt[desc].keys()):
        rowIdx.append(i)
        colIdx.append(j)
        dataLst.append(descCatCnt[desc][j])
mtx = csr_matrix((dataLst, (rowIdx, colIdx)), shape=(len(descList), len(catName)), dtype=np.float)
_ = normalize(mtx, norm='l1', axis=0, copy=False)

In [6]:
reconErr1 = []
reconVar1 = []
reconErr2 = []
reconVar2 = []
numComponent = range(1, 1001)
for k in numComponent:
    if k % 10 == 0:
        print 'Working on component count', k, reconVar1[-1], reconVar2[-1]
    U, S, Vt = svds(mtx, k=k)
    pc = np.flipud(Vt)
    normalize(pc, norm='l2', axis=1, copy=False)
    normalize(mtx, norm='l2', axis=1, copy=False)
    simMtx = np.absolute(mtx.dot(pc.T).T)
    bestMatch = np.argmax(simMtx, axis=1)
    wordPC = np.zeros(pc.shape)
    for i in xrange(k):
        wordPC[i] = mtx.getrow(bestMatch[i]).toarray()
    recon1 = (mtx.dot(pc.T)).dot(pc)
    recon2 = (mtx.dot(pinv(wordPC))).dot(wordPC)
    mtxArray = mtx.toarray()
    reconErr1.append(norm(mtxArray - recon1))
    reconErr2.append(norm(mtxArray - recon2))
    reconVar1.append(linregress(recon1.reshape(-1), mtxArray.reshape(-1))[2]**2)
    reconVar2.append(linregress(recon2.reshape(-1), mtxArray.reshape(-1))[2]**2)

Working on component count 10 0.107632635624 0.0778531567442
Working on component count 20 0.188032839059 0.14229218156
Working on component count 30 0.250192410894 0.181837468715
Working on component count 40 0.301384438606 0.211319973314
Working on component count 50 0.346727905072 0.238477690456
Working on component count 60 0.3876046276 0.263864521883
Working on component count 70 0.424869308046 0.287392287126
Working on component count 80 0.458966837022 0.309515593505
Working on component count 90 0.490379937959 0.330070446537
Working on component count 100 0.518749240641 0.350854177676
Working on component count 110 0.545109156855 0.372331262726
Working on component count 120 0.569954579518 0.391043841979
Working on component count 130 0.593451202888 0.40478298244
Working on component count 140 0.615612743657 0.416159738844
Working on component count 150 0.636436187468 0.427997306692
Working on component count 160 0.655685919353 0.438281900146
Working on component count 170 0.673

In [8]:
varThreshold = 0.9
nComponent = min(i for i, x in enumerate(reconVar1) if x >= varThreshold)
print nComponent, reconErr1[nComponent], reconVar1[nComponent], reconErr2[nComponent], reconVar2[nComponent]

394 13.7164233204 0.900380995686 27.6711056072 0.594629024641


In [12]:
U, S, Vt = svds(mtx, k=nComponent)
pc = np.flipud(Vt)
normalize(pc, norm='l2', axis=1, copy=False)
normalize(mtx, norm='l2', axis=1, copy=False)
simMtx = np.absolute(mtx.dot(pc.T).T)
bestMatch = np.argmax(simMtx, axis=1)
blDescSet = set()
blDesc = []
for i in bestMatch:
    if descList[i] not in blDescSet:
        blDesc.append(descList[i])
        blDescSet.add(descList[i])
        print descList[i]
print len(blDescSet)

place
nature
building
animal
auditoriam
shopping
play ground
home
guest room
monument
water
trash
music
kid
computer
horse
dress
dining
aeroplan
hen shed
party
parking area
game center
parking
rail
dent
wood gate
wooden compound
swimming room
cattle farm
electricity
electricity board
salon
water pipe
saloon shop
parlour
fance
doctor
baseball ground
vegetable
vehicle
market
harbor
shell
ring
boxing
port
racing
steep
book
entrance
sky
skating ground
pet
cycle parking
bicycle
massage place
wooden room
pool table
model
sheep
tent
sports field
exercise room
bridge
board
bus stand
sit out
soldier
petrol pump
petrol bank
blue sky
wel
telephone
drinks shop
research
boat deck
laboratory
tram
bull fighting
entrance place
car racing
cemetary
fire
court house
ticket counter
giraffe
hat shop
washing machine
fish show
hat
van
mini golf
meat
nursing home
old age home
dog cage
flower
badminton ground
mutton stall
bed
meat shop
bell room
karate
karate class
bowling game
bowling place
bowling alley
bee


In [13]:
# load json taxonomy tree
taxoTreeFile = 'model/taxoTree_2000_0.pickle.gz'
import gzip
import pickle
with gzip.open(taxoTreeFile) as fin:
    taxoTreeNodes = pickle.load(fin)
print taxoTreeNodes[0][0]

place


In [14]:
taxoTreeLabels = set(t[0] for t in taxoTreeNodes if len(t[2]) > 0)

In [15]:
print len(taxoTreeLabels), len(blDescSet), len(taxoTreeLabels & blDescSet)

352 264 82


In [17]:
for w in blDesc:
    if w in taxoTreeLabels:
        print w

place
nature
building
monument
wooden compound
swimming room
water pipe
steep
massage place
tent
board
sit out
entrance place
car racing
giraffe
washing machine
dog cage
butcher
canel
saw mill
toll booth
museum
maersk drilling
helicopter pad
voting booth
salt place
observatory
eye care shop
chef
marsh
cycle race
candy shop
water tower
butte
igloo house
job centre
car dicky
pizza
fire station
flood water
fish farm
strike
illuminated building
solar panel
radio city
hotel room
crosswalk
football stadium
dugout
hay field
cavern
rainbow
board room
picnic area
icy hills
canning company
attic
bleachers
hangar
cellar
pawn shop
exercise
water plant
banana
oast
covered bridge
ferry
steak house
island
food court
motel
mountain road
ice cream shop
organ
castle palace
exam hall
cannon
general store
storage shed
donkey
train tunnel
skate park
