Description Dataset Cleanup
===============
## Splitting phrases into words in addition
Note: the print-outs are not all rules enforced - they are only significant rules enforced

In [1]:
from string import letters, punctuation, whitespace, printable, digits
import csv
import re
import string
from collections import Counter, defaultdict

In [2]:
# Load file
wordCatCnt = defaultdict(Counter)
rawDescHeaders = None
with open('rawDesc.csv') as fin:
    csvReader = csv.reader(fin)
    for row in csvReader:
        if rawDescHeaders is None:
            rawDescHeaders = row
        else:
            wordCatCnt[row[2]][row[1]] += 1
print rawDescHeaders

['image_filename', 'image_category_id', 'description']


In [3]:
# Make string updates faster
def applyUpdates(updateMap, warningThreshold=50):
    removalWarning = 0
    removalWarningSum = 0
    splitWarning = 0
    splitWarningSum = 0
    for w in updateMap:
        if w in wordCatCnt:
            t = updateMap[w]
            catCnt = wordCatCnt[w]
            sumCnt = sum(catCnt.itervalues())
            if w == t: continue
            if sumCnt >= warningThreshold:
                print w, sumCnt, '-->', t
            if t is None or len(t) == 0:
                removalWarning += 1
                removalWarningSum += sumCnt
                del wordCatCnt[w]
            elif type(t) is tuple or type(t) is list:
                splitWarning += 1
                splitWarningSum += sumCnt
                del wordCatCnt[w]
                for newW in t:
                    wordCatCnt[newW] += catCnt
            else:
                del wordCatCnt[w]
                wordCatCnt[t] += catCnt
        else:
            raise RuntimeError('Update rule is not applicable: ' + w + ' ' + updateMap[w])
    if removalWarning > 0:
        print 'WARNING - Removing entries:', removalWarning, removalWarningSum
    if splitWarning > 0:
        print 'WARNING - Splitting entries:', splitWarning, splitWarningSum

def getWordCnt():
    return Counter({w:sum(c.itervalues()) for w, c in wordCatCnt.iteritems()})

def fastUpdate(fn, warningThreshold=50, enforce=True):
    updateMap = {w:fn(w) for w in wordCatCnt.iterkeys()}
    if enforce: applyUpdates(updateMap, warningThreshold)

def fullUpdate(fn, warningThreshold=50, enforce=True):
    wordCnt = getWordCnt()
    updateMap = fn(wordCnt)
    if enforce: applyUpdates(updateMap, warningThreshold)

In [4]:
def removeNonPrintableAnswers(w):
    if all(c in printable for c in w):
        return w
    else:
        return None
fastUpdate(removeNonPrintableAnswers)

def removeNumbers(w):
    return w.translate(None, digits)
fastUpdate(removeNumbers)

def punctuationWhitespace2Space(w):
    s = [' ' if (c in whitespace or c in punctuation) else c for c in w]
    return ''.join(s)
fastUpdate(punctuationWhitespace2Space)

# Make everything lowercase, remove leading and trailing spaces/punctuations, and remove multiple spaces
def lowercaseStripMultipleSpaces(w):
    w = w.lower().strip()
    return re.sub(' +',' ', w)
fastUpdate(lowercaseStripMultipleSpaces, warningThreshold=10000)

house & garden 60 --> house   garden
dinning room. 162 --> dinning room 
dentist's office 66 --> dentist s office
children's park 574 --> children s park
children's room 72 --> children s room
multi-storeyed building 109 --> multi storeyed building
bathroom. 50 --> bathroom 
Church 10324 --> church
Forest 19664 --> forest
Garden 10108 --> garden
Hall 23573 --> hall
Factory 20501 --> factory
House 22415 --> house
Sea 10379 --> sea
Shop 11444 --> shop
Mountain 18483 --> mountain
Building 27362 --> building


In [5]:
# Remove space distinction 
# (i.e. map the set of identical answers ignoring space to the most frequent one in the set)
def removeSpaceDistinction(wordCnt):
    answersSet = defaultdict(Counter)
    for s, v in wordCnt.iteritems():
        s_nospace = re.sub('\\W+', '', s)
        answersSet[s_nospace][s]+=v
    updateMap = {}
    for c in answersSet.values():
        t = c.most_common(1)[0][0]
        if len(c.keys()) > 1 and sum(c.values()) - c[t] > 1000:
            print t, '<--', c.keys()
        for s in c:
            updateMap[s] = t
    return updateMap
fullUpdate(removeSpaceDistinction, warningThreshold=1000)

super market <-- ['super market', 'supermarket']
waterfall <-- ['water fall', 'waterfall', 'water fa ll']
grassland <-- ['grassland', 'grasslan d', 'grass land', 'grassla nd']
airport <-- ['airport', 'air port', 'airp ort', 'airpo rt']
car port <-- ['carport', 'car port']
bedroom <-- ['bed room', 'be d room', 'bedr oom', 'bedroom', 'bedro om', 'b ed room', 'be droom']
play ground <-- ['play ground', 'playg round', 'playground']
wind mill <-- ['windmill', 'wind mill']
old place <-- ['old place', 'oldplace']
highway <-- ['high way', 'highway', 'highwa y']
sea shore <-- ['seashore', 'sea shore']
bathtub <-- ['bath tub', 'bathtub']
workshop <-- ['work shop', 'workshop']
light house <-- ['lighthouse', 'light house']
swimming pool <-- ['swimming pool', 'swi mming pool', 'swim mingpool', 'swimmi ng pool', 'swimmin gpool', 'swimmingpool']
bathroom <-- ['bathroom', 'ba throom', 'bath room']
class room <-- ['classroom', 'class room', 'classr oom']
under water <-- ['underwater', 'unde r water', '

In [6]:
# Remove pular words if their corresponding singular forms exist in the dataset 
# and (are more popular) or (have more than 100 occurances)
def handlePularWords(wordCnt):
    updateMap = {}
    for s in wordCnt:
        sing = s[:-1]
        if s[-1] == 's' and (wordCnt[s] <= wordCnt[sing] or wordCnt[sing] >= 100):
            updateMap[s] = sing
    return updateMap
fullUpdate(handlePularWords, warningThreshold=1000)

machines 2574 --> machine
games 3093 --> game
rocks 3096 --> rock
players 1178 --> player
hills 21464 --> hill
ruins 1458 --> ruin
clouds 1537 --> cloud
hotels 1088 --> hotel
sports 3002 --> sport
cars 1882 --> car
waterfalls 7492 --> waterfall
palaces 3049 --> palace
hens 1002 --> hen
apartments 2598 --> apartment
toys 1194 --> toy
flowers 3269 --> flower
houses 1408 --> house
stairs 1114 --> stair
trees 16837 --> tree
plants 2375 --> plant
peoples 16900 --> people
steps 12584 --> step
workers 1712 --> worker
mountains 4456 --> mountain
animals 2588 --> animal
buildings 3126 --> building
pigs 1140 --> pig
appartments 1474 --> appartment
homes 1128 --> home
chairs 1735 --> chair
childrens 1442 --> children


In [7]:
# Modification distance. Credit to Peter Norvig
def edits1(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in string.ascii_lowercase if b]
   inserts    = [a + c + b     for a, b in splits for c in string.ascii_lowercase+' ']
   return set(deletes + transposes + replaces + inserts) - set(['', word])

def words(text): return re.findall('[a-z]+', text.lower())

def train(filename):
    model = {}
    with open(filename) as fin:
        skipLine = 1 # Skipping first line
        for line in fin:
            if skipLine > 0:
                skipLine -= 1
            else:
                w, c = line.split('\t')
                if c >= 10000:
                    model[w] = int(c)
    return model

NWORDS = train('wikipedia_wordfreq.txt') 
# Credit to http://www.monlp.com/2012/04/16/calculating-word-and-n-gram-statistics-from-a-wikipedia-corpora/

def inDict(phrase):
    return len(filter(lambda w: w not in NWORDS, words(phrase))) == 0

def scorePhrase(phrase):
    if not inDict(phrase):
        return 0
    return min([NWORDS[w] for w in words(phrase)])

In [8]:
# Try to correct spellings
def correctSpelling(tmp):
    updateMap = {}
    manualProcess = []
    for s, v in tmp.iteritems():
        if inDict(s):
            updateMap[s] = s
            continue
        po = edits1(s)
        bestDict = max(po, key=scorePhrase)
        if scorePhrase(bestDict) == 0:
            bestDict = None
        bestRef = max(po, key=tmp.get)
        if tmp[bestRef] < 100:
            bestDict = None
        if bestDict == bestRef and bestDict is not None:
            updateMap[s] = bestDict
            if v >= 1000:
                print 'Convert', s, v, '-->', bestDict, tmp[bestDict]
        else:
            if v < 20:
                updateMap[s] = s # gives up rare entries
            else:
                po2 = set([s2 for s1 in po for s2 in edits1(s1)]) or po
                bestRef = max(po2, key=tmp.get)
                if bestRef != s and tmp[bestRef] >= 1000:
                    updateMap[s] = bestRef
                    print 'Convert-2', s, v, '-->', bestRef, tmp[bestRef]
                else:
                    updateMap[s] = s
                    if v >= 100:
                        manualProcess.append((s, v))
    print '-- Manual Process --'
    for s, v in manualProcess:
        print s
    return updateMap
fullUpdate(correctSpelling)

Convert-2 bulinding 4871 --> building 189999
Convert-2 computeroom 29 --> computer room 1493
Convert-2 aerplane 36 --> aeroplane 9292
Convert-2 tolite 29 --> toilet 6954
Convert-2 acurium 171 --> aquarium 4244
Convert-2 graveyred 65 --> graveyard 1536
Convert-2 swming pool 130 --> swimming pool 20049
Convert-2 prayroom 43 --> play room 3145
Convert-2 horserideing 44 --> horse riding 1035
Convert-2 daining hall 879 --> dinning hall 6092
Convert-2 wattinghall 20 --> waiting hall 5717
Convert-2 billiats 41 --> billiards 1264
Convert-2 gamesroom 206 --> game room 1146
Convert resturant 6548 --> restaurant 54018
Convert-2 exclator 32 --> escalator 3910
Convert-2 soffaa 97 --> sofa 1464
Convert-2 rainage 46 --> drainage 1052
Convert-2 airoplaine 24 --> aeroplane 9292
Convert-2 carage 90 --> cage 3778
Convert-2 gudown 27 --> godown 5288
Convert-2 reasturant 96 --> restaurant 54018
Convert-2 daining hal 23 --> dining hall 3711
Convert restuarant 4089 --> restaurant 54018
Convert-2 mounament 78

In [9]:
handConvert = '''
veichile|vehicle
planatorium|planetarium
confrancehall|conference hall
kitchenroom|kitchen
accurium|aquarium
animalshome|animal home
liviningroom|living room
swimmingful|swimming pool
alecoopter|helicopter
liabery|library
frontviewofbuilding|front view of building
fogplace|foggy place
olderbuliding|older building
tallbuildings|tall building
photoview|photo view
sportsroom|sports room
archbuilding|arch building
damagedbuilding|damaged building
trainroute|train route
searesearch|sea research
sushishop|sushi shop
grassshapes|grass shape
damagedarticles|damaged article
damagedthings|damaged thing
roadview|road view
playcourt|play court
woodenframes|wooden frame
riverarea|river area
pitshop|pit shop
rocksview|rock view
damagedstructure|damaged structure
steelstructures|steel structure
sandroad|sand road
indoorstadium|indoor stadium
no image|
unknown|
mountion|mountain
buliding|building
bulding|building
smallhouse|small house
restaurent|restaurant
shope|shop
meating hall|meeting hall
restarant|restaurant
restaruant|restaurant
restraunt|restaurant
chruch|church
hospitel|hospital
hoispatl|hospital
swiming pool|swimming pool
smimming place|swimming place
smimming pool|swimming pool
smimming room|swimming room
parliment|parliament
kichen room|kitchen room
kichen|kitchen
kicthen|kitchen
brige|bridge
restauant|restaurant
railwaybalm|railway balm
charch|church
chirch|church
moutain|mountain
goverment building|government building
seminor class|seminar class
lakh|lake
appartment|apartment
multi storey building|multi story building
bed rooms|bed room
bed rooom|bed room
mountan|mountain
shoping center|shopping center
airplan|airplane
machinary|machinery
machineries|machinery
laboratary|laboratory
lobi|lobby
coridoor|corridor
hallview|hall view
delux car|deluxe car
histrorical place|historical place
rockfort|rock fort
multi storeyed building|multi story building
book centre|book center
liberay|library
tant|tent
hutview|hut view
information centre|information center
swimmimg pool|swimming pool
swimmingfull|swimming pool
cinema theature|cinema theater
theator|theater
theatre|theater
game centre|game center
construciton|construction
musiam|museum
muzium|museum
musem|museum
museiam|museum
statium|stadium
galary|gallery
stedum|stadium
computersoffice|computer office
bouth room|booth room
both room|booth room
bathshower|bath shower
bustand|bus stand
light kouse|light house
secutity place|security place
digg|dig
gargen|garden
archbridge|arch bridge
haiway road|highway road
shipview|ship view
shoping|shopping
men|man
women|woman
natural|nature
tallbuildings|tall building
restround|restaurant
animalshome|animal home
kitchenroom|kitchen room
restarea|rest area
circu|circus
damagedstructure|damaged structure
publicsplace|public place
kittar|gittar
bigbuildings|big building
seminor|seminar
hourbor|harbour
fueal station|fuel station
damagedstructures|damaged structure
frontviewofbuildings|front view of building
'''
tmp = getWordCnt()
updateMap = {}
rules = handConvert.split('\n')
for r in rules:
    if r == '': continue
    s, t = r.split('|')
    if s == t:
        raise RuntimeError('Do not change anything: ' + s + '|' + t)
    if s not in tmp:
        print 'WARNING - rule not used: ', r
    else:
        updateMap[s] = t
applyUpdates(updateMap)

appartment 3667 --> apartment
tallbuildings 289 --> tall building
no image 427 --> 
confrancehall 411 --> conference hall
chruch 12522 --> church
lakh 147 --> lake
smallhouse 601 --> small house
woodenframes 158 --> wooden frame
brige 189 --> bridge
sushishop 506 --> sushi shop
archbuilding 404 --> arch building
mountan 283 --> mountain
frontviewofbuildings 154 --> front view of building
bulding 11998 --> building
airplan 270 --> airplane
hospitel 127 --> hospital
parliment 340 --> parliament
indoorstadium 431 --> indoor stadium
riverarea 124 --> river area
frontviewofbuilding 90 --> front view of building
book centre 63 --> book center
liabery 162 --> library
machineries 729 --> machinery
hourbor 124 --> harbour
liviningroom 145 --> living room
playcourt 342 --> play court
theator 144 --> theater
multi storey building 62 --> multi story building
planatorium 141 --> planetarium
meating hall 384 --> meeting hall
swiming pool 872 --> swimming pool
smimming pool 1621 --> swimming pool
tra

In [10]:
# Handles common long sentences
tmp = getWordCnt()
prefixRemovalS = '''
i am giong to
i am going to
iam going to
i am seeing
i am watching
i want to live
iam seeing
seeing
this is
it is
look like
looks like
'''
prefixBan = set(tuple(l.split()) for l in prefixRemovalS.split('\n') if l != '')
updateMap = {}
for s, v in tmp.iteritems():
    phrase = s.split()
    trunc = 0
    for pattern in prefixBan:
        if tuple(phrase[:len(pattern)]) == pattern:
            trunc = max(trunc, len(pattern))
    if trunc > 0:
        updateMap[s]=' '.join(phrase[trunc:])
        if v > 10:
            print s, v, '-->', updateMap[s]
    else:
        updateMap[s] = s
applyUpdates(updateMap)

i am going to the cinema hall 17 --> the cinema hall
seeing a arch 31 --> a arch
this is a room 29 --> a room
this is factory 12 --> factory
i am going to the farm land 17 --> the farm land
iam seeing the chruch 11 --> the chruch
iam seeing the playground 27 --> the playground
iam seeing the road 25 --> the road
this is house 11 --> house
this is a tree 12 --> a tree
seeing a play station 16 --> a play station
this is a factory 23 --> a factory
i am going to the meeting hall 67 --> the meeting hall
i am going to the baseball 12 --> the baseball
this is a shop 36 --> a shop
iam seeing the church 62 --> the church
i am going to the railway station 32 --> the railway station
i am going to the closet 17 --> the closet
i am going to the church 384 --> the church
iam seeing the hall 26 --> the hall
i am going to church 35 --> church
this is a sea 28 --> a sea
iam seeing the carshed 11 --> the carshed
this is a machine 21 --> a machine
iam seeing the hotel 13 --> the hotel
i am going to the s

In [11]:
# Remove confident stop words
tmp = getWordCnt()
stopWordsS='a,an,the,some' # only include very confident ones
stopWords = set([w.lower().strip() for w in stopWordsS.split(',')])
updateMap = {}
for s, v in tmp.iteritems():
    phrase = s.split()
    phraseOut = filter(lambda w: w not in stopWords, phrase)
    updateMap[s] = ' '.join(phraseOut)
    if v > 100 and len(phrase) > len(phraseOut):
        print s, v, '-->', updateMap[s]
applyUpdates(updateMap)

under the sea 399 --> under sea
the house 140 --> house
the church 449 --> church
the hotel 245 --> hotel
the factory 150 --> factory
a building 229 --> building
the historic building 120 --> historic building
the park 212 --> park
the museum 67 --> museum
under the sea 399 --> under sea
the mountain 85 --> mountain
a house 76 --> house
a shop 55 --> shop
the garden 80 --> garden
the building 79 --> building
the church 449 --> church
the restaurant 71 --> restaurant
a room 64 --> room
the meeting hall 77 --> meeting hall
the road 69 --> road
the hotel 245 --> hotel
the factory 150 --> factory
the stadium 71 --> stadium
the home 63 --> home
a building 229 --> building
the music hall 56 --> music hall
the shopping mall 82 --> shopping mall
the river 55 --> river
the historic building 120 --> historic building
the house 140 --> house
the park 212 --> park
the nature 54 --> nature


In [12]:
# Combine identical terms if we ignore less confident stop words
stopWordsS2='a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,\
cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,\
i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,nor,of,off,often,on,\
only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,\
this,tis,to,too,twas,us,want,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,\
your,iam'
# Credit to http://www.textfixer.com/resources/common-english-words.txt
def removeStopwordDistinction(tmp):
    stopWords2 = set([w.lower().strip() for w in stopWordsS2.split(',')]) - stopWords
    unorderedDesc = defaultdict(Counter)
    updateMap = {}
    for s, v in tmp.iteritems():
        phrase = s.split()
        phraseOut = filter(lambda w: w not in stopWords2, phrase)
        unorderedDesc[' '.join(sorted(phraseOut))][s]+=v
    for k, c in unorderedDesc.iteritems():
        if k == '':
            for s in c.keys():
                updateMap[s] = ''
        else:
            target_s, target_v = c.most_common(1)[0]
            for s in c.keys():
                updateMap[s] = target_s
            if len(c.keys()) > 1 and sum(c.values()) - c.most_common(1)[0][1] > 50:
                print c.most_common(1)[0], '<--', c.most_common()[1:], sum(c.values()) - c.most_common(1)[0][1]
    return updateMap
fullUpdate(removeStopwordDistinction)

('exterior house', 370) <-- [('house exterior', 49), ('exterior of house', 7), ('exterior in house', 1)] 57
('exterior building', 604) <-- [('building exterior', 479), ('exterior of building', 4)] 483
('room', 62412) <-- [('room in', 123), ('was room', 2), ('other room', 2), ('so room', 1), ('room is', 1), ('any room', 1), ('its room', 1)] 131
('conference', 939) <-- [('conference all', 51)] 51
('car race', 2506) <-- [('race car', 495)] 495
('table chair', 322) <-- [('table and chair', 71), ('chair table', 17), ('chair and table', 16)] 104
('place of worship', 464) <-- [('worship place', 140), ('place to worship', 6)] 146
('garden house', 309) <-- [('house garden', 148), ('house with garden', 16), ('garden and house', 13), ('house in garden', 6), ('house and garden', 5), ('garden with house', 2), ('house to garden', 1)] 191
('mountain with sea', 57) <-- [('mountain sea', 21), ('sea with mountain', 19), ('sea mountain', 17), ('mountain and sea', 7), ('sea and mountain', 1)] 65
('church'

In [13]:
# Redo: Remove space distinction
# (i.e. map the set of identical answers ignoring space to the most frequent one in the set)
fullUpdate(removeSpaceDistinction)

rockview 54 --> rock view


In [14]:
# Redo: Remove pular words if their corresponding singular forms exist in the dataset 
# and (are more popular) or (have more than 100 occurances)
fullUpdate(handlePularWords)

In [15]:
# Redo: spelling correction
fullUpdate(correctSpelling)

-- Manual Process --
waranta


In [16]:
# Split phrases into words in addition
wordCnt = getWordCnt()
def splitPhrases(w):
    s = w.split()
    if len(s) == 1:
        if len(w) <= 6: return w
        score, spI = max((min(wordCnt[w[:i]], wordCnt[w[i:]]), i) for i in xrange(3, len(w) - 2))
        w1, w2 = w[:spI], w[spI:]
        if score >= 1000:
            return [w, w1, w2]
        else:
            return w
    else:
        return s + [w]
fastUpdate(splitPhrases, warningThreshold=10000)

guest house 15574 --> ['guest', 'house', 'guest house']
living room 19768 --> ['living', 'room', 'living room']
meeting hall 10887 --> ['meeting', 'hall', 'meeting hall']
play ground 51802 --> ['play', 'ground', 'play ground']
swimming pool 23269 --> ['swimming', 'pool', 'swimming pool']
class room 14819 --> ['class', 'room', 'class room']
old place 23535 --> ['old', 'place', 'old place']
bedroom 55311 --> ['bedroom', 'bed', 'room']


In [17]:
# Redo: clean up stopwords
fullUpdate(removeStopwordDistinction)

with 2195 --> 
like 131 --> 
from 248 --> 
at 546 --> 
is 116 --> 
it 59 --> 
in 2810 --> 
of 3709 --> 
or 216 --> 
and 2405 --> 
on 913 --> 
for 339 --> 
by 122 --> 
i 85 --> 
to 259 --> 
are 125 --> 


In [18]:
# Redo: Remove space distinction
# (i.e. map the set of identical answers ignoring space to the most frequent one in the set)
fullUpdate(removeSpaceDistinction)

show room <-- ['showroom', 'show room']
workplace 54 --> work place
basket ball 261 --> basketball
sub station 74 --> substation
sinkhole 62 --> sink hole
playground 73 --> play ground
railroad 365 --> rail road
windmill 51 --> wind mill
underground 142 --> under ground
volley ball 382 --> volleyball
roadside 55 --> road side
showroom 1324 --> show room
classroom 157 --> class room
underwater 484 --> under water


In [19]:
# Redo: Remove pular words if their corresponding singular forms exist in the dataset 
# and (are more popular) or (have more than 100 occurances)
fullUpdate(handlePularWords)

woods 99 --> wood
tickets 307 --> ticket
machines 126 --> machine
lights 71 --> light
kids 3941 --> kid
items 228 --> item
boats 54 --> boat
pianos 453 --> piano
rocks 215 --> rock
steeps 109 --> steep
girls 119 --> girl
dogs 172 --> dog
doors 72 --> door
fields 162 --> field
ruins 129 --> ruin
pets 78 --> pet
news 505 --> new
games 754 --> game
bottles 151 --> bottle
things 1652 --> thing
materials 89 --> material
hills 1607 --> hill
steps 552 --> step
garments 208 --> garment
beds 137 --> bed
clouds 123 --> cloud
cleaners 656 --> cleaner
shirts 110 --> shirt
drinks 1043 --> drink
passengers 63 --> passenger
sports 2888 --> sport
rays 52 --> ray
birds 131 --> bird
cars 178 --> car
plains 113 --> plain
flowers 329 --> flower
fruits 303 --> fruit
cards 51 --> card
jewels 81 --> jewel
falls 12761 --> fall
stilts 54 --> stilt
ways 59 --> way
weapons 85 --> weapon
entrances 277 --> entrance
bricks 145 --> brick
arts 219 --> art
hens 147 --> hen
persons 77 --> person
images 100 --> image
in

In [20]:
# Redo: hand convert
tmp = getWordCnt()
updateMap = {}
rules = handConvert.split('\n')
for r in rules:
    if r == '': continue
    s, t = r.split('|')
    if s == t:
        raise RuntimeError('Do not change anything: ' + s + '|' + t)
    if s not in tmp:
#         print 'WARNING - rule not used: ', r
        pass
    else:
        updateMap[s] = t
applyUpdates(updateMap)

appartment 54 --> apartment
moutain 71 --> mountain
seminor 62 --> seminar
theator 190 --> theater
kittar 56 --> gittar
bulding 208 --> building
restaurent 447 --> restaurant
theatre 3483 --> theater
men 455 --> man
shoping 860 --> shopping
women 252 --> woman
natural 495 --> nature
statium 59 --> stadium
shope 71 --> shop
buliding 247 --> building


In [21]:
# Check popular misspelled entry (our dictionary is not perfect)
tmp = getWordCnt()
occ = 0
for s, v in tmp.most_common():
    if len(filter(lambda w: w not in NWORDS, words(s))) > 0:
        occ += v
        if v >= 100:
            print s, v
print occ

waranta 298
kuthrai 181
33039


In [22]:
# Check most common ones
tmp = getWordCnt()
for s, v in tmp.most_common(20):
    print s, v

house 278702
building 251142
room 244145
factory 202529
hall 200021
mountain 184807
shop 153133
ground 127000
church 119911
forest 119108
road 108276
garden 107490
sea 88706
office 79940
water 76835
restaurant 76524
place 68907
play 67444
bridge 66270
bed 64246


In [23]:
# Check frequency==100
tmp = getWordCnt()
for s, v in tmp.iteritems():
    if v == 100:
        print s, v

robo 100
lumber 100
putting green 100
tele booth 100
bell tower 100
japanese 100
vast land 100
snow road 100
nature trail 100
groud 100


In [24]:
# Check frequency==50
tmp = getWordCnt()
for s, v in tmp.iteritems():
    if v == 50:
        print s, v

queue 50
pilgrimage 50
swan 50
factory area 50
wooden cabin 50
cruise ship deck 50
football game 50
agrifield 50
sun rays 50
very 50
telecast 50
topiary garden 50
parkway 50
iron bridge 50
cauge 50
bonestructures 50
school house 50
gap 50
stock market 50
bottling 50
o 50
ambulance 50
coaboard 50
derrick 50
brasserie 50
clouds with sky 50
fumes 50
coca 50
prision 50
relax 50
snow fort 50
greek 50
cotton 50
cycle sports 50
pilot house 50
opera house 50
fam 50
sweet shop 50
microscope 50


In [25]:
# Saving file
headers = ['category_id', 'description', 'count']
with open('cat_desc_2.csv', 'w') as fout:
    csvWriter = csv.writer(fout)
    csvWriter.writerow(headers)
    for word, dist in wordCatCnt.iteritems():
        for cat, v in dist.iteritems():
            csvWriter.writerow((cat, word, v))