Description Dataset Cleanup
===============
Note: the print-outs are not all rules enforced - they are only significant rules enforced

In [1]:
import os
import sys
import subprocess
import csv
import re
import collections
import string

In [2]:
# Load file
rawDesc = [] # list of tuples of (image_filename, image_category_id, description)
rawDescHeaders = None
with open('rawDesc.csv') as fin:
    csvReader = csv.reader(fin)
    for row in csvReader:
        if rawDescHeaders is None:
            rawDescHeaders = row
        else:
            rawDesc.append(row)
print rawDescHeaders

['image_filename', 'image_category_id', 'description']


In [3]:
# Make string updates faster
def applyUpdates(desc, updateMap):
    madeEmpty = set()
    for data in desc:
        if data[2] in updateMap:
            if len(updateMap[data[2]]) == 0 and data[2] not in madeEmpty:
                print 'WARNING - Removing', data[2]
                madeEmpty.add(data[2])
            data[2] = updateMap[data[2]]
        else:
            raise RuntimeError('Undefined updates')
    if len(madeEmpty) > 0:
        desc[:] = [data for data in desc if len(data[2]) > 0]
def collectDesc(desc):
    ret = collections.Counter()
    for data in desc:
        ret[data[2]]+=1
    return ret

In [4]:
# Remove all entries that contains numbers and non-English characters
updateMap = {}
tmp = collectDesc(rawDesc);
for s, v in tmp.iteritems():
    if all([c in string.letters+string.punctuation+' ' for c in s]):
        updateMap[s] = s
    else:
        updateMap[s] = ''
        if v >=5 :
            print s, v
applyUpdates(rawDesc, updateMap)

AK 47 7
route 66 7
2 men 6
ctf - 70 5
caf� 6
ak 47 10
car4 8


In [5]:
# Make everything lowercase, remove leading and trailing spaces/punctuations, and remove multiple spaces
updateMap = {}
for s in collectDesc(rawDesc):
    updateMap[s] = re.sub(' +',' ',s.lower().strip(string.punctuation+string.whitespace))
applyUpdates(rawDesc, updateMap)



In [6]:
# Remove space/punctuation distinction (i.e. map the set of identical answers ignoring space to the most frequent one in the set)
tmp = collectDesc(rawDesc)
answersSet = collections.defaultdict(collections.Counter)
for s, v in tmp.iteritems():
    s_nospace = re.sub('\\W+', '', s)
    answersSet[s_nospace][s]+=v
updateMap = {}
for c in answersSet.values():
    t = c.most_common(1)[0][0]
    if len(c.keys()) > 1 and sum(c.values()) - c[t] > 100:
        print t, '<--', c.keys()
    for s in c:
        updateMap[s] = t
applyUpdates(rawDesc, updateMap)

work place <-- ['work place', 'workplace']
old building <-- ['oldbuilding', 'old building']
confrancehall <-- ['confrance hall', 'confrancehall']
fashion show <-- ['fashion show', 'fashionshow']
night club <-- ['night club', 'nightclub']
wash basin <-- ['wash basin', 'washbasin']
car showroom <-- ['car show room', 'car showroom', 'carshowroom']
railway track <-- ['railway track', 'rail way track', 'rail;way track', 'railwaytrack']
front yard <-- ['front yard', 'frontyard']
baseball ground <-- ['base ball ground', 'baseball ground', 'baseballground']
kitchenroom <-- ['kitchenroom', 'kitchen room']
prayer hall <-- ['prayer hall', 'prayerhall']
sports ground <-- ['sportsground', 'sports ground']
sky scrapers <-- ['sky scrapers', 'skyscrapers']
pipe line <-- ['pipe-line', 'pipe line', 'pipeline']
phone booth <-- ['phonebooth', 'phone booth']
archbuilding <-- ['archbuilding', 'arch building']
super market <-- ['super market', 'supermarket']
wooden house <-- ['woodenhouse', 'wooden house']
r

In [7]:
# Remove pular words if their corresponding singular forms exist in the dataset 
# and (are more popular) or (have more than 50 occurances)
tmp = collectDesc(rawDesc)
updateMap = {}
for s, v in tmp.iteritems():
    if s in updateMap: continue
    if s[-1] == 's' and s[:-1] in tmp:
        if v <= tmp[s[:-1]] or tmp[s[:-1]] >= 50:
            if v + tmp[s[:-1]] >= 50: 
                print s, v, '-->', s[:-1], tmp[s[:-1]]
            updateMap[s] = s[:-1]
            updateMap[s[:-1]] = s[:-1]
        else:
            if v + tmp[s[:-1]] >= 50: 
                print s, v, '<--', s[:-1], tmp[s[:-1]]
            updateMap[s] = s
            updateMap[s[:-1]] = s
for s in tmp:
    if s not in updateMap:
        updateMap[s] = s
    
applyUpdates(rawDesc, updateMap)

woods 753 --> wood 3221
kids 991 --> kid 230
domes 12 --> dome 585
palaces 3049 --> palace 51171
selfs 16 --> self 700
airplane seats 39 <-- airplane seat 27
helipads 1 --> helipad 961
gradens 1 --> graden 115
elders 46 <-- elder 14
mosques 10 --> mosque 2932
mangos 3 --> mango 61
childs 77 --> child 1795
water pipes 4 --> water pipe 167
sketings 1 --> sketing 274
disasters 68 --> disaster 275
jewels 72 <-- jewel 27
boys 168 --> boy 634
readings 3 --> reading 98
photos 406 --> photo 1356
fruits stalls 1 --> fruits stall 67
meeting halls 2 --> meeting hall 10390
organisations 4 --> organisation 435
farm lands 2 --> farm land 517
ice houses 1 --> ice house 106
boatings 1 --> boating 4509
columns 159 <-- column 45
cable cars 5 --> cable car 297
open fields 5 --> open field 46
historical monuments 11 --> historical monument 72
glass buildings 3 --> glass building 77
soliders 36 --> solider 47
banglows 3 --> banglow 441
snacks 196 <-- snack 5
books 751 --> book 282
taps 9 --> tap 212
chears

In [8]:
# Modification distance. Credit to Peter Norvig
def edits1(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in string.ascii_lowercase if b]
   inserts    = [a + c + b     for a, b in splits for c in string.ascii_lowercase+' ']
   return set(deletes + transposes + replaces + inserts) - set(['', word])

def words(text): return re.findall('[a-z]+', text.lower())

def train(filename):
    model = {}
    with open(filename) as fin:
        skipLine = 1 # Skipping first line
        for line in fin:
            if skipLine > 0:
                skipLine -= 1
            else:
                w, c = line.split('\t')
                if c >= 5000:
                    model[w] = int(c)
    return model

NWORDS = train('wikipedia_wordfreq.txt') 
# Credit to http://www.monlp.com/2012/04/16/calculating-word-and-n-gram-statistics-from-a-wikipedia-corpora/

def inDict(phrase):
    return len(filter(lambda w: w not in NWORDS, words(phrase))) == 0

def scorePhrase(phrase):
    if not inDict(phrase):
        return 0
    return min([NWORDS[w] for w in words(phrase)])

In [9]:
# Try to correct spellings
tmp = collectDesc(rawDesc)
updateMap = {}
manualProcess = []
for s, v in tmp.iteritems():
    if inDict(s):
        updateMap[s] = s
        continue
    po = edits1(s)
    bestDict = max(po, key=scorePhrase)
    if scorePhrase(bestDict) == 0:
        bestDict = None
    bestRef = max(po, key=tmp.get)
    if tmp[bestRef] < 50:
        bestDict = None
    if bestDict == bestRef and bestDict is not None:
        updateMap[s] = bestDict
        if v >= 100:
            print 'Convert', s, v, '-->', bestDict, tmp[bestDict]
    else:
        if v < 20:
            updateMap[s] = s # gives up rare entries
        else:
            po2 = set([s2 for s1 in po for s2 in edits1(s1)]) or po
            bestRef = max(po2, key=tmp.get)
            if bestRef != s and tmp[bestRef] >= 500:
                updateMap[s] = bestRef
                print 'Convert-2', s, v, '-->', bestRef, tmp[bestRef]
            else:
                updateMap[s] = s
                if v >= 100:
                    manualProcess.append((s, v))
applyUpdates(rawDesc, updateMap)
print '-- Manual Process --'
for s, v in manualProcess:
    print s

Convert foreat 105 --> forest 114869
Convert-2 bulinding 4871 --> building 189998
Convert-2 computeroom 29 --> computer room 1493
Convert-2 aerplane 36 --> aeroplane 9292
Convert-2 tolite 29 --> toilet 6954
Convert-2 dyningtable 53 --> dining table 2141
Convert-2 acurium 171 --> aquarium 4244
Convert-2 restarea 173 --> restarent 545
Convert-2 graveyred 65 --> graveyard 1536
Convert-2 kuthrai place 91 --> kuthirai place 890
Convert-2 prayroom 43 --> play room 3145
Convert-2 daining hall 879 --> dinning hall 6092
Convert planetorium 199 --> planetarium 232
Convert restaturant 126 --> restaurant 54018
Convert-2 wattinghall 20 --> waiting hall 5717
Convert-2 video gamescenter 34 --> video game center 640
Convert ainmal 184 --> animal 6061
Convert-2 billiats 41 --> billiard 1318
Convert-2 hourserace 83 --> horse race 2605
Convert-2 gamesroom 206 --> game room 1146
Convert resturant 6548 --> restaurant 54018
Convert-2 exclator 32 --> escalator 3910
Convert-2 soffaa 97 --> sofa 1464
Convert m

In [10]:
handConvert = '''
veichile|vehicle
planatorium|planetarium
confrancehall|conference hall
kitchenroom|kitchen
accurium|aquarium
animalshome|animal home
liviningroom|living room
swimmingful|swimming pool
alecoopter|helicopter
liabery|library
frontviewofbuilding|front view of building
fogplace|foggy place
olderbuliding|older building
tallbuildings|tall building
photoview|photo view
sportsroom|sports room
archbuilding|arch building
damagedbuilding|damaged building
trainroute|train route
searesearch|sea research
sushishop|sushi shop
grassshapes|grass shape
damagedarticles|damaged article
damagedthings|damaged thing
roadview|road view
playcourt|play court
woodenframes|wooden frame
riverarea|river area
pitshop|pit shop
rocksview|rock view
damagedstructure|damaged structure
steelstructures|steel structure
sandroad|sand road
indoorstadium|indoor stadium
no image|
unknown|
mountion|mountain
buliding|building
bulding|building
smallhouse|small house
restaurent|restaurant
shope|shop
meating hall|meeting hall
restarant|restaurant
restaruant|restaurant
restraunt|restaurant
chruch|church
hospitel|hospital
hoispatl|hospital
swiming pool|swimming pool
swimming pal|swimming pool
smimming place|swimming place
smimming pool|swimming pool
smimming room|swimming room
parliment|parliament
kichen room|kitchen room
kichen|kitchen
kicthen|kitchen
brige|bridge
restauant|restaurant
railwaybalm|railway balm
charch|church
chirch|church
moutain|mountain
goverment building|government building
seminor class|seminar class
lakh|lake
appartment|apartment
multi storey building|multi story building
bed rooms|bed room
bed rooom|bed room
mountan|mountain
shoping center|shopping center
airplan|airplane
machinary|machinery
machineries|machinery
laboratary|laboratory
lobi|lobby
coridoor|corridor
hallview|hall view
delux car|deluxe car
histrorical place|historical place
rockfort|rock fort
multi storeyed building|multi story building
book centre|book center
liberay|library
tant|tent
hutview|hut view
information centre|information center
swimmimg pool|swimming pool
swimmingfull|swimming pool
cinema theature|cinema theater
theator|theater
theatre|theater
game centre|game center
construciton|construction
musiam|museum
muzium|museum
musem|museum
museiam|museum
statium|stadium
galary|gallery
stedum|stadium
computersoffice|computer office
bouth room|bathroom
both room|bathroom
bath room|bathroom
bathshower|bath shower
bustand|bus stand
light kouse|light house
secutity place|security place
digg|dig
gargen|garden
archbridge|arch bridge
haiway road|highway road
shipview|ship view
shoping|shopping
men|man
women|woman
natural|nature
tallbuildings|tall building
restround|restaurant
animalshome|animal home
kitchenroom|kitchen room
restarea|rest area
circu|circus
damagedstructure|damaged structure
publicsplace|public place
kittar|gittar
bigbuildings|big building
seminor|seminar
hourbor|harbour
fueal station|fuel station
damagedstructures|damaged structure
frontviewofbuildings|front view of building
orch|arch
'''
tmp = collectDesc(rawDesc)
updateMap = {}
rules = handConvert.split('\n')
for r in rules:
    if r == '': continue
    s, t = r.split('|')
    updateMap[s] = t
    if s == t:
        raise RuntimeError('Do not change anything: ' + s + '|' + t)
    print s, '-->', t
for s in tmp:
    if s not in updateMap:
        updateMap[s] = s
applyUpdates(rawDesc, updateMap)

veichile --> vehicle
planatorium --> planetarium
confrancehall --> conference hall
kitchenroom --> kitchen
accurium --> aquarium
animalshome --> animal home
liviningroom --> living room
swimmingful --> swimming pool
alecoopter --> helicopter
liabery --> library
frontviewofbuilding --> front view of building
fogplace --> foggy place
olderbuliding --> older building
tallbuildings --> tall building
photoview --> photo view
sportsroom --> sports room
archbuilding --> arch building
damagedbuilding --> damaged building
trainroute --> train route
searesearch --> sea research
sushishop --> sushi shop
grassshapes --> grass shape
damagedarticles --> damaged article
damagedthings --> damaged thing
roadview --> road view
playcourt --> play court
woodenframes --> wooden frame
riverarea --> river area
pitshop --> pit shop
rocksview --> rock view
damagedstructure --> damaged structure
steelstructures --> steel structure
sandroad --> sand road
indoorstadium --> indoor stadium
no image --> 
unknown -->

In [11]:
# Handles common long sentences
tmp = collectDesc(rawDesc)
prefixRemovalS = '''
i am giong to
i am going to
iam going to
i am seeing
i am watching
i want to live
iam seeing
seeing
this is
it is
look like
looks like
'''
prefixBan = set(tuple(l.split()) for l in prefixRemovalS.split('\n') if l != '')
updateMap = {}
for s, v in tmp.iteritems():
    phrase = s.split()
    trunc = 0
    for pattern in prefixBan:
        if tuple(phrase[:len(pattern)]) == pattern:
            trunc = max(trunc, len(pattern))
    if trunc > 0:
        updateMap[s]=' '.join(phrase[trunc:])
        if v > 10:
            print s, v, '-->', updateMap[s]
    else:
        updateMap[s] = s
applyUpdates(rawDesc, updateMap)

i am going to the cinema hall 17 --> the cinema hall
seeing a arch 31 --> a arch
this is a room 29 --> a room
this is factory 12 --> factory
i am going to the farm land 17 --> the farm land
iam seeing the chruch 11 --> the chruch
iam seeing the playground 27 --> the playground
iam seeing the road 25 --> the road
this is house 11 --> house
this is a tree 12 --> a tree
this is a factory 23 --> a factory
i am going to the meeting hall 67 --> the meeting hall
i am going to the baseball 12 --> the baseball
iam seeing the church 62 --> the church
i am going to the railway station 32 --> the railway station
this is a house 37 --> a house
i am going to the closet 17 --> the closet
iam seeing the hall 26 --> the hall
i am going to church 35 --> church
this is a sea 28 --> a sea
iam seeing the carshed 11 --> the carshed
this is a machine 21 --> a machine
iam seeing the hotel 13 --> the hotel
i am going to the stadium 70 --> the stadium
this is a hotel 15 --> a hotel
i am going to the garden 69 -

In [12]:
# Remove confident stop words
tmp = collectDesc(rawDesc)
stopWordsS='a,an,the,some' # only include very confident ones
stopWords = set([w.lower().strip() for w in stopWordsS.split(',')])
updateMap = {}
for s, v in tmp.iteritems():
    phrase = s.split()
    phraseOut = filter(lambda w: w not in stopWords, phrase)
    updateMap[s] = ' '.join(phraseOut)
    if v > 10 and len(phrase) > len(phraseOut):
        print s, v, '-->', updateMap[s]
applyUpdates(rawDesc, updateMap)

the museum 67 --> museum
under the sea 399 --> under sea
a airpoort 20 --> airpoort
the complex 12 --> complex
a man 31 --> man
a home 26 --> home
the house 140 --> house
the home depot 12 --> home depot
the many peoples 29 --> many peoples
the railway station 33 --> railway station
a office 14 --> office
a living room 12 --> living room
the swimming pool 27 --> swimming pool
a river 15 --> river
the bridge 22 --> bridge
the forest 24 --> forest
a forest 29 --> forest
the playground 27 --> playground
an office 13 --> office
a house 76 --> house
a restaurant 17 --> restaurant
a shop 55 --> shop
the temple 20 --> temple
the library 23 --> library
the shopping center 26 --> shopping center
the garden 80 --> garden
a mountain 20 --> mountain
the building 79 --> building
the tree 12 --> tree
a stadium 26 --> stadium
the company 13 --> company
the people's court 13 --> people's court
the train 15 --> train
out of the city 24 --> out of city
a lady 12 --> lady
the boat 11 --> boat
a hotel 20 

In [13]:
# Combine identical terms if we ignore less confident stop words
tmp = collectDesc(rawDesc)
stopWordsS2='a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,\
cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,\
i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,nor,of,off,often,on,\
only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,\
this,tis,to,too,twas,us,want,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,\
your,iam'
# Credit to http://www.textfixer.com/resources/common-english-words.txt
stopWords2 = set([w.lower().strip() for w in stopWordsS2.split(',')]) - stopWords
unorderedDesc = collections.defaultdict(collections.Counter)
updateMap = {}
for s, v in tmp.iteritems():
    phrase = s.split()
    phraseOut = filter(lambda w: w not in stopWords2, phrase)
    unorderedDesc[' '.join(sorted(phraseOut))][s]+=v
for k, c in unorderedDesc.iteritems():
    if k == '':
        for s in c.keys():
            updateMap[s] = ''
    else:
        target_s, target_v = c.most_common(1)[0]
        for s in c.keys():
            updateMap[s] = target_s
        if len(c.keys()) > 1 and sum(c.values()) - c.most_common(1)[0][1] > 50:
            print c.most_common(1)[0], '<--', c.most_common()[1:], sum(c.values()) - c.most_common(1)[0][1]
applyUpdates(rawDesc, updateMap)

('exterior house', 370) <-- [('house exterior', 49), ('exterior of house', 7), ('exterior in house', 1)] 57
('exterior building', 604) <-- [('building exterior', 479), ('exterior of building', 4)] 483
('room', 62410) <-- [('room in', 123), ('was room', 2), ('other room', 2), ('so room', 1), ('room is', 1), ('any room', 1), ('its room', 1)] 131
('conference', 939) <-- [('conference all', 51)] 51
('car race', 2506) <-- [('race car', 495)] 495
('table chair', 322) <-- [('table and chair', 144), ('chair and table', 16)] 160
('place of worship', 464) <-- [('worship place', 140), ('place to worship', 6)] 146
('garden house', 309) <-- [('house garden', 147), ('house with garden', 16), ('garden and house', 13), ('house in garden', 6), ('house and garden', 5), ('garden with house', 2), ('house to garden', 1)] 190
('house room', 71) <-- [('house in room', 36), ('room in house', 15), ('room of house', 1)] 52
('church', 117959) <-- [('church in', 68), ('in church', 7), ('these are church', 1)] 76


In [14]:
# Redo: Remove space/punctuation distinction (i.e. map the set of identical answers ignoring space to the most frequent one in the set)
tmp = collectDesc(rawDesc)
answersSet = collections.defaultdict(collections.Counter)
for s, v in tmp.iteritems():
    s_nospace = re.sub('\\W+', '', s)
    answersSet[s_nospace][s]+=v
udpateMap = {}
for c in answersSet.values():
    t = c.most_common(1)[0][0]
    if len(c.keys()) > 1 and sum(c.values()) - c[t] > 100:
        print t, '<--', c.keys()
    for s in c:
        updateMap[s] = t
applyUpdates(rawDesc, updateMap)

In [15]:
# Redo: Remove pular words if their corresponding singular forms exist in the dataset 
# and (are more popular) or (have more than 50 occurances)
tmp = collectDesc(rawDesc)
updateMap = {}
for s, v in tmp.iteritems():
    if s in updateMap: continue
    if s[-1] == 's' and s[:-1] in tmp:
        if v <= tmp[s[:-1]] or tmp[s[:-1]] >= 50:
            if v + tmp[s[:-1]] >= 50: 
                print s, v, '-->', s[:-1], tmp[s[:-1]]
            updateMap[s] = s[:-1]
            updateMap[s[:-1]] = s[:-1]
        else:
            if v + tmp[s[:-1]] >= 50: 
                print s, v, '<--', s[:-1], tmp[s[:-1]]
            updateMap[s] = s
            updateMap[s[:-1]] = s
for s in tmp:
    if s not in updateMap:
        updateMap[s] = s

applyUpdates(rawDesc, updateMap)

kids 1 --> kid 1221
fruits 1 --> fruit 1156
dolls 1 --> doll 487
gymnastics 91 <-- gymnastic 1
materials 1 --> material 205
pictures 1 --> picture 1339
books 2 --> book 1033
horses 442 --> horse 8845
coconut trees 1 --> coconut tree 61
small houses 10 --> small house 604
chairs 2 --> chair 4928
steps 12643 --> step 1714
play courts 2 --> play court 342
pyramids 1 --> pyramid 307
hills 21525 --> hill 9456
caps 3 --> cap 476
trees 16847 --> tree 20647
students 1 --> student 448
tables 1 --> table 5189
drums 4 --> drum 296
deers 1 --> deer 132
exhibits 2 --> exhibit 77
weapons 111 <-- weapon 1
honey bees 1 --> honey bee 346
mines 3 --> mine 566
apples 1 --> apple 182
highways 674 --> highway 17301
cars 10 --> car 31541
toys 6 --> toy 1512
players 3 --> player 1930
factory machines 6 --> factory machine 81
rooms 1 --> room 62541
sports 2 --> sport 3383
farmers 1 --> farmer 59
bricks 2 --> brick 916
persons 3 --> person 1854
tractors 1 --> tractor 195
workers 2 --> worker 6290
fountains 1 -

In [16]:
# Redo: spelling correction
tmp = collectDesc(rawDesc)
updateMap = {}
manualProcess = []
for s, v in tmp.iteritems():
    if inDict(s):
        updateMap[s] = s
        continue
    po = edits1(s)
    bestDict = max(po, key=scorePhrase)
    if scorePhrase(bestDict) == 0:
        bestDict = None
    bestRef = max(po, key=tmp.get)
    if tmp[bestRef] < 50:
        bestDict = None
    if bestDict == bestRef and bestDict is not None:
        updateMap[s] = bestDict
        if v >= 100:
            print 'Convert', s, v, '-->', bestDict, tmp[bestDict]
    else:
        if v < 20:
            updateMap[s] = s # gives up rare entries
        else:
            po2 = set([s2 for s1 in po for s2 in edits1(s1)]) or po
            bestRef = max(po2, key=tmp.get)
            if bestRef != s and tmp[bestRef] >= 500:
                updateMap[s] = bestRef
                print 'Convert-2', s, v, '-->', bestRef, tmp[bestRef]
            else:
                updateMap[s] = s
                if v >= 100:
                    manualProcess.append((s, v))
applyUpdates(rawDesc, updateMap)
print '-- Manual Process --'
for s, v in manualProcess:
    print s

Convert-2 restarent 174 --> restaurant 75189
Convert-2 planitorium 59 --> planetarium 578
-- Manual Process --
waranta


In [17]:
# Check popular non-alphabetical non-white characters
tmp = collectDesc(rawDesc)
occ = 0
for s, v in tmp.most_common():
    if len(filter(lambda c: (not c.isalpha()) and c != ' ', s)) > 0:
        occ += v
        if v >= 50:
            print s, v
print occ

children's park 688
multi-storeyed building 133
doctor's office 84
hills & river 84
dentist's office 73
visitor's room 60
park. garden 60
6099


In [18]:
# Check popular misspelled entry (our dictionary is not perfect)
tmp = collectDesc(rawDesc)
occ = 0
for s, v in tmp.most_common():
    if len(filter(lambda w: w not in NWORDS, words(s))) > 0:
        occ += v
        if v >= 50:
            print s, v
print occ

waranta 294
oldhome 98
oldarch 94
marchfast 93
towerbuildings 83
congrete 79
playingyard 77
focuslight 75
muddview 73
partymeet 71
archview 71
gracees 71
horsr ride place 69
entranceview 69
starhotel 69
wardrob 67
woodengate 65
coolingtower 64
ratinum 64
damagedhome 64
carstand 60
scpriture 57
washingarea 57
oldcollege 57
lightingarea 56
natureview 54
pocline 53
almirha 52
agrifield 50
bonestructures 50
25397


In [19]:
# Check most common ones
tmp = collectDesc(rawDesc)
for s, v in tmp.most_common(100):
    print s, v

house 228983
building 214988
factory 199000
mountain 171521
hall 150940
shop 122592
church 118035
forest 115086
garden 103446
road 96550
sea 79436
restaurant 75395
office 68800
room 62542
bridge 61770
ground 59648
river 59191
water 58199
bedroom 54872
palace 54275
hotel 54250
play ground 51801
stadium 47681
park 46609
fort 38750
tree 37494
home 35389
lake 32653
nature 32633
machine 32396
people 31956
car 31551
hill 30986
shed 30221
ship 28166
old place 23535
hospital 23386
swimming pool 23326
company 22878
kitchen 21123
wall 20273
living room 19768
airport 18766
rock 18332
bathroom 18158
auditorium 18033
highway 17975
street 17237
train 16743
tower 16646
cave 16490
tent 16421
temple 16032
field 15606
guest house 15574
farm 15422
class room 14819
waterfall 14674
step 14357
hut 13730
man 13497
land 13378
apartment 13014
city 12645
fall 12568
ocean 12513
lab 12101
bakery 11721
arch 10940
meeting hall 10887
sky 10863
beach 10699
bar 10566
library 10482
market 10406
desert 10160
boat 10123


In [20]:
# Check frequency==100
tmp = collectDesc(rawDesc)
for s, v in tmp.iteritems():
    if v == 100:
        print s, v

rock wall 100
press 100
bell tower 100
esclator 100
veterinarian 100
helicoptor 100
nature trail 100
putting green 100
shooting place 100
esculator 100
vast land 100
snow road 100
palce 100
reservoir 100
channel 100
dugout 100


In [21]:
# Check frequency==50
tmp = collectDesc(rawDesc)
for s, v in tmp.iteritems():
    if v == 50:
        print s, v

hay bales 50
heater 50
mausoleum 50
sitting 50
wooden cabin 50
chear 50
cruise ship deck 50
football game 50
snow fort 50
sep 50
agrifield 50
line 50
sunroom 50
topiary garden 50
old things 50
mess 50
iron bridge 50
cauge 50
bonestructures 50
crossing 50
ofice 50
opera house 50
stock market 50
sweet shop 50
egg 50
bride 50


In [22]:
# Check frequency==20
tmp = collectDesc(rawDesc)
for s, v in tmp.iteritems():
    if v == 20:
        print s, v

nature scence 20
rock piece 20
players room 20
walk road 20
awards 20
s 20
pilgrimes 20
temple tower 20
canning 20
production area 20
cycle stadium 20
herb garden 20
masach 20
brasserie 20
building in side 20
factory farm 20
water purifier 20
llake 20
rv park 20
flooded street 20
military people 20
portrait 20
college campus 20
ocean shore 20
art building 20
building wall 20
vegetable garden 20
bridge opened 20
health 20
bog 20
corprate office 20
marine park 20
lift gate 20
eating room 20
hill slope 20
horse raid 20
computer cafe 20
boxing area 20
dresser 20
skylight 20
pork 20
turf 20
taxi stand 20
launderette 20
brick work 20
brownstone 20
g 20
aqaurium 20
ringes 20
rotary door 20
main street 20
book rack 20
molten metal 20
boat shed 20
archu 20
shopping mal 20
space capsule 20
child bedroom 20
eliphant 20
steet 20
parcel boxes 20
ghost town 20
people working 20
dinner plate 20
book shelves 20
stor room 20
aeroplan machine 20
factory unit 20
photo shot 20
unknown place 20
office lobb

In [23]:
# Saving file
imgCatDesc = collections.Counter()
catDesc = collections.Counter()
for img, cat, desc in rawDesc:
    imgCatDesc[(img, cat, desc)] += 1
    catDesc[(cat, desc)] += 1
headers = ['image_filename', 'category_id', 'description', 'count']
with open('img_cat_desc.csv', 'w') as fout:
    csvWriter = csv.writer(fout)
    csvWriter.writerow(headers)
    for k, v in imgCatDesc.most_common():
        csvWriter.writerow(k+(v,))
with open('cat_desc.csv', 'w') as fout:
    csvWriter = csv.writer(fout)
    csvWriter.writerow(headers[1:])
    for k, v in catDesc.most_common():
        csvWriter.writerow(k+(v,))