Basic-level Category Preprocessing
===================

In [11]:
from string import letters, punctuation, whitespace, printable, digits
import csv
import re
import string
from collections import Counter, defaultdict

In [12]:
# Load file
# wordCatCnt maps a raw description (CSV column 2) to a Counter of the
# image category ids (CSV column 1) it was given for.
wordCatCnt = defaultdict(Counter)
# Python 2's csv module expects the file to be opened in binary mode.
with open('data/rawDesc.csv', 'rb') as fin:
    csvReader = csv.reader(fin)
    # The first row is the header; stays None if the file is empty.
    rawDescHeaders = next(csvReader, None)
    for row in csvReader:
        if not row:
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError.
            continue
        wordCatCnt[row[2]][row[1]] += 1
print(rawDescHeaders)

# wordFreq maps a lowercased word to the count listed in the reference file.
# Entries that differ only by case overwrite each other (last one wins).
wordFreq = Counter()
with open('data/wikipedia_wordfreq.txt') as fin:
    for lineNo, line in enumerate(fin):
        line = line.strip()
        if lineNo == 0:
            # The first line is not a word entry: echo it and skip it.
            print(line)
            continue
        if not line:
            # A blank (e.g. trailing) line cannot be unpacked into two fields.
            continue
        w, c = line.split('\t')
        wordFreq[w.strip().lower()] = int(c)

['image_filename', 'image_category_id', 'description']
ALL	1229245740


In [13]:
def words(text):
    """Return the runs of ASCII letters found in `text`, lowercased, in order."""
    return re.findall('[a-z]+', text.lower())


def scorePhrase(phrase):
    """Score a phrase by the frequency of its rarest word.

    Each word returned by words(phrase) is looked up in the global `wordFreq`
    counter (unknown words count as 0) and the smallest value is returned.
    A phrase without any letters has no words and scores 0.
    """
    freqs = [wordFreq[w] for w in words(phrase)]
    # min() raises ValueError on an empty sequence, so guard the no-word case.
    return min(freqs) if freqs else 0

In [14]:
# Make string updates faster
def applyUpdates(updateMap, warningThreshold=50):
    """Apply a batch of key rewrites to the global `wordCatCnt` table in place.

    updateMap maps an existing key of wordCatCnt to its replacement:
      * the key itself      -> entry left untouched;
      * None or empty value -> entry removed;
      * a tuple or list     -> entry removed and its per-category counts added
                               to every listed key;
      * anything else       -> entry removed and its counts merged into that key.

    A line is printed for every changed key whose total count is at least
    `warningThreshold`, followed by a summary of removals and splits.

    Raises RuntimeError if a key of updateMap is not in wordCatCnt when it is
    reached.
    """
    removalWarning = 0
    removalWarningSum = 0
    splitWarning = 0
    splitWarningSum = 0
    for w in updateMap:
        t = updateMap[w]
        if w not in wordCatCnt:
            # Formatting instead of '+' concatenation: the target may be None
            # or a list, which would raise TypeError and mask this error.
            raise RuntimeError('Update rule is not applicable: %s %s' % (w, t))
        if w == t:
            continue
        catCnt = wordCatCnt[w]
        sumCnt = sum(catCnt.values())
        if sumCnt >= warningThreshold:
            print('%s %s --> %s' % (w, sumCnt, t))
        # Every remaining case drops the old key first.
        del wordCatCnt[w]
        if t is None or len(t) == 0:
            removalWarning += 1
            removalWarningSum += sumCnt
        elif isinstance(t, (tuple, list)):
            splitWarning += 1
            splitWarningSum += sumCnt
            for newW in t:
                wordCatCnt[newW] += catCnt
        else:
            wordCatCnt[t] += catCnt
    if removalWarning > 0:
        print('WARNING - Removing entries: %s %s' % (removalWarning, removalWarningSum))
    if splitWarning > 0:
        print('WARNING - Splitting entries: %s %s' % (splitWarning, splitWarningSum))

def getWordCnt():
    """Return a Counter mapping each key of wordCatCnt to its total count over all categories."""
    totals = Counter()
    for word, perCategory in wordCatCnt.iteritems():
        totals[word] = sum(perCategory.itervalues())
    return totals

def fastUpdate(fn, warningThreshold=50, enforce=True):
    """Build an update map by calling fn(key) for every key of wordCatCnt.

    The whole map is computed before anything is applied. It is passed to
    applyUpdates only when `enforce` is true; otherwise it is discarded.
    """
    updateMap = {}
    for word in wordCatCnt.iterkeys():
        updateMap[word] = fn(word)
    if not enforce:
        return
    applyUpdates(updateMap, warningThreshold)

def fullUpdate(fn, warningThreshold=50, enforce=True):
    """Let fn build an update map from the full word-count table.

    fn receives the Counter returned by getWordCnt() and must return an update
    map, which is passed to applyUpdates only when `enforce` is true.
    """
    updateMap = fn(getWordCnt())
    if not enforce:
        return
    applyUpdates(updateMap, warningThreshold)

In [15]:
def removeNonPrintableAnswers(w):
    """Return `w` unchanged if every character is in string.printable, else None."""
    for ch in w:
        if ch not in printable:
            return None
    return w
# Remove every answer that contains a character outside string.printable
# (a None target makes applyUpdates delete the entry).
fastUpdate(removeNonPrintableAnswers)

def removeNumbers(w):
    """Return `w` with every ASCII digit removed."""
    # str.translate(None, digits) exists only for Python 2 byte strings and
    # raises TypeError on unicode input; filtering works for any string type.
    return ''.join(ch for ch in w if ch not in digits)
# Strip digits from every answer; answers that become empty are removed by applyUpdates.
fastUpdate(removeNumbers)

def punctuationWhitespace2Space(w):
    """Return `w` with each whitespace or punctuation character replaced by one space."""
    separators = whitespace + punctuation
    pieces = []
    for ch in w:
        if ch in separators:
            pieces.append(' ')
        else:
            pieces.append(ch)
    return ''.join(pieces)
# Replace punctuation and whitespace characters in every answer with spaces.
fastUpdate(punctuationWhitespace2Space)

# Lowercase the answer, trim surrounding whitespace, and collapse runs of spaces
_MULTI_SPACE = re.compile(' +')
def lowercaseStripMultipleSpaces(w):
    """Return `w` lowercased, stripped of leading/trailing whitespace, with runs of spaces collapsed to one."""
    trimmed = w.lower().strip()
    return _MULTI_SPACE.sub(' ', trimmed)
# Normalize case and spacing of every answer; with warningThreshold=10000 only
# rewrites of answers totalling at least 10000 occurrences are logged.
fastUpdate(lowercaseStripMultipleSpaces, warningThreshold=10000)

house & garden 60 --> house   garden
dinning room. 162 --> dinning room 
dentist's office 66 --> dentist s office
children's park 574 --> children s park
children's room 72 --> children s room
multi-storeyed building 109 --> multi storeyed building
bathroom. 50 --> bathroom 
Church 10324 --> church
Forest 19664 --> forest
Garden 10108 --> garden
Hall 23573 --> hall
Factory 20501 --> factory
House 22415 --> house
Sea 10379 --> sea
Shop 11444 --> shop
Mountain 18483 --> mountain
Building 27362 --> building


In [19]:
# Remove space distinction
# (i.e. merge answers that are identical once non-word characters are ignored)
def removeSpaceDistinction(wordCnt, validWordThreshold=1000):
    """Build an update map that merges answers differing only in non-word characters.

    Answers are grouped by their form with every run of non-word characters
    removed. Each group is mapped to that collapsed form when its wordFreq
    count is at least `validWordThreshold`, otherwise to the most frequent
    answer of the group.
    """
    variantsByKey = defaultdict(Counter)
    for answer, count in wordCnt.iteritems():
        collapsed = re.sub('\\W+', '', answer)
        variantsByKey[collapsed][answer] += count

    updateMap = {}
    for collapsed, variants in variantsByKey.iteritems():
        if wordFreq[collapsed] >= validWordThreshold:
            target = collapsed
        else:
            target = variants.most_common(1)[0][0]
        updateMap.update((answer, target) for answer in variants)
    return updateMap
# Merge spacing variants; only rewrites of answers totalling at least 1000 occurrences are logged.
fullUpdate(removeSpaceDistinction, warningThreshold=1000)

bath tub 1092 --> bathtub
super market 3038 --> supermarket
race track 1245 --> racetrack
sea shore 1759 --> seashore
show room 2151 --> showroom
high way 1050 --> highway
play ground 39715 --> playground
night club 1090 --> nightclub
carport 1250 --> car port
rest room 1490 --> restroom
light house 3006 --> lighthouse
barber shop 2277 --> barbershop
water fall 1865 --> waterfall
oldplace 1862 --> old place
farm house 1958 --> farmhouse
swimmingpool 1535 --> swimming pool
bath room 6267 --> bathroom
water falls 3662 --> waterfalls
road side 2140 --> roadside
under water 1917 --> underwater
work shop 1220 --> workshop
class room 8978 --> classroom
grass land 2415 --> grassland
bed room 26106 --> bedroom
air port 1362 --> airport
wood house 2294 --> woodhouse
wind mill 2384 --> windmill
book shop 1523 --> bookshop


In [20]:
# Remove plural words if their corresponding singular forms exist in the dataset
# and (are at least as popular) or (have at least `threshold` occurrences)
def handlePularWords(wordCnt, threshold=100, validWordThreshold=1000):
    """Build an update map from answers ending in 's' to the answer without it.

    An answer `s` is mapped to `s[:-1]` when
      * the shortened form is at least as frequent as `s` in wordCnt, or occurs
        at least `threshold` times there, and
      * the shortened form has a wordFreq count of at least `validWordThreshold`.

    Returns a dict {plural: singular}; answers that do not qualify are omitted.
    """
    updateMap = {}
    for s in wordCnt:
        # endswith() is safe on an empty key, where s[-1] raises IndexError.
        if not s.endswith('s'):
            continue
        sing = s[:-1]
        # .get never inserts a key, so it is safe while iterating wordCnt and
        # works for plain dicts as well as Counters.
        singCnt = wordCnt.get(sing, 0)
        if (wordCnt[s] <= singCnt or singCnt >= threshold) \
                and wordFreq[sing] >= validWordThreshold:
            updateMap[s] = sing
    return updateMap
# Fold plural answers into their singular form; only rewrites of answers
# totalling at least 1000 occurrences are logged.
fullUpdate(handlePularWords, warningThreshold=1000)

animals 2588 --> animal
clouds 1537 --> cloud
machines 2574 --> machine
toys 1194 --> toy
hotels 1088 --> hotel
sports 3002 --> sport
pigs 1140 --> pig
rocks 3096 --> rock
buildings 3126 --> building
games 3093 --> game
cars 1882 --> car
waterfalls 7492 --> waterfall
houses 1408 --> house
palaces 3049 --> palace
flowers 3269 --> flower
stairs 1114 --> stair
trees 16836 --> tree
ruins 1458 --> ruin
players 1178 --> player
plants 2375 --> plant
homes 1128 --> home
hills 21465 --> hill
hens 1002 --> hen
peoples 16900 --> people
workers 1712 --> worker
chairs 1735 --> chair
apartments 2598 --> apartment
steps 12584 --> step
childrens 1442 --> children
mountains 4456 --> mountain


In [8]:
# Comma-separated list of answers to discard
stopWordsStr = (
    'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,'
    'at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,'
    'for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,'
    'just,least,let,like,likely,may,me,might,most,must,my,neither,nor,of,off,often,on,only,'
    'or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,'
    'then,there,these,they,this,tis,to,too,twas,us,want,wants,was,we,were,what,when,where,which,'
    'while,who,whom,why,will,with,would,yet,you,your,iam,going,giong,live,seeing,look,looks,no,not,image,s,photo'
)
stopWords = set(stopWordsStr.split(','))


def removeStopWords(w):
    """Return None when the whole answer `w` is a stop word, otherwise `w` unchanged."""
    return None if w in stopWords else w
# Remove answers that consist of a single stop word; only removals of answers
# totalling at least 1000 occurrences are logged.
fastUpdate(removeStopWords, warningThreshold=1000)



In [9]:
# Modification distance. Credit to Peter Norvig
def edits1(word):
    """Return the set of strings one edit away from `word`.

    An edit is a deletion of one character, a swap of two adjacent characters,
    a replacement of one character by a lowercase ASCII letter, or an insertion
    of a lowercase ASCII letter or a space. The empty string and `word` itself
    are never part of the result.
    """
    alphabet = string.ascii_lowercase
    candidates = set()
    for i in range(len(word) + 1):
        head, tail = word[:i], word[i:]
        # insertions (letters and space) are possible at every position
        for ch in alphabet + ' ':
            candidates.add(head + ch + tail)
        if tail:
            # deletion and replacements of the character at position i
            candidates.add(head + tail[1:])
            for ch in alphabet:
                candidates.add(head + ch + tail[1:])
        if len(tail) > 1:
            # transposition of the characters at positions i and i + 1
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    candidates.discard('')
    candidates.discard(word)
    return candidates

In [10]:
# Try to correct spellings
def correctSpelling(wordCnt, validWordThreshold=1000, ignoreThreshold=10, noticeThreshold=100):
    """Build an update map that rewrites likely misspellings.

    Answers whose wordFreq count is at least `validWordThreshold` are left
    alone. For every other answer `s` with count `v`:
      1. if the most frequent one-edit neighbour (by wordCnt) is more frequent
         than `s`, `s` is mapped to it;
      2. otherwise, if `v` is below `ignoreThreshold`, `s` is skipped;
      3. otherwise the most frequent candidate within two edits is taken and
         `s` is mapped to it when its wordFreq count is at least
         `validWordThreshold`; if not and `v >= noticeThreshold`, `s` is
         listed for manual processing.

    The manual-processing list is printed between two marker lines.
    Returns the update map {answer: replacement}.

    NOTE: step 3 enumerates every string within two edits of `s`, which is
    expensive for long answers.
    """
    def datasetCount(candidate):
        # 0 rather than None for unseen candidates, so the max() key is always
        # an int and comparisons stay well defined.
        return wordCnt.get(candidate, 0)

    updateMap = {}
    manualProcess = []
    for s, v in wordCnt.items():
        if wordFreq[s] >= validWordThreshold:
            continue
        po = edits1(s)
        bestRef = max(po, key=datasetCount)
        if datasetCount(bestRef) > v:
            updateMap[s] = bestRef
        elif v < ignoreThreshold:
            # Previously hard-coded as 10, which made the parameter a no-op.
            continue
        else:
            # The candidate set contains `s` itself (an edit followed by its
            # inverse), so bestRef only differs from `s` when some candidate is
            # at least as frequent as `s`.
            po2 = set(s2 for s1 in po for s2 in edits1(s1)) | po
            bestRef = max(po2, key=datasetCount)
            if wordFreq[bestRef] >= validWordThreshold:
                updateMap[s] = bestRef
            elif v >= noticeThreshold:
                manualProcess.append(s)
    print('-- Manual Process --')
    for s in manualProcess:
        print(s)
    print('-- Manual Process --')
    return updateMap
# Apply the automatic spelling corrections; only rewrites of answers totalling
# at least 1000 occurrences are logged.
fullUpdate(correctSpelling, warningThreshold=1000)

KeyboardInterrupt: 

In [None]:
handConvertStr = '''
veichile|vehicle
planatorium|planetarium
confrancehall|conference hall
kitchenroom|kitchen
accurium|aquarium
animalshome|animal home
liviningroom|living room
swimmingful|swimming pool
alecoopter|helicopter
liabery|library
frontviewofbuilding|front view of building
fogplace|foggy place
olderbuliding|older building
tallbuildings|tall building
photoview|photo view
sportsroom|sports room
archbuilding|arch building
damagedbuilding|damaged building
trainroute|train route
searesearch|sea research
sushishop|sushi shop
grassshapes|grass shape
damagedarticles|damaged article
damagedthings|damaged thing
roadview|road view
playcourt|play court
woodenframes|wooden frame
riverarea|river area
pitshop|pit shop
rocksview|rock view
damagedstructure|damaged structure
steelstructures|steel structure
sandroad|sand road
indoorstadium|indoor stadium
no image|
unknown|
mountion|mountain
buliding|building
bulding|building
smallhouse|small house
restaurent|restaurant
shope|shop
meating hall|meeting hall
restarant|restaurant
restaruant|restaurant
restraunt|restaurant
chruch|church
hospitel|hospital
hoispatl|hospital
swiming pool|swimming pool
swimming pal|swimming pool
smimming place|swimming place
smimming pool|swimming pool
smimming room|swimming room
parliment|parliament
kichen room|kitchen room
kichen|kitchen
kicthen|kitchen
brige|bridge
restauant|restaurant
railwaybalm|railway balm
charch|church
chirch|church
moutain|mountain
goverment building|government building
seminor class|seminar class
lakh|lake
appartment|apartment
multi storey building|multi story building
bed rooms|bed room
bed rooom|bed room
mountan|mountain
shoping center|shopping center
airplan|airplane
machinary|machinery
machineries|machinery
laboratary|laboratory
lobi|lobby
coridoor|corridor
hallview|hall view
delux car|deluxe car
histrorical place|historical place
rockfort|rock fort
multi storeyed building|multi story building
book centre|book center
liberay|library
tant|tent
hutview|hut view
information centre|information center
swimmimg pool|swimming pool
swimmingfull|swimming pool
cinema theature|cinema theater
theator|theater
theatre|theater
game centre|game center
construciton|construction
musiam|museum
muzium|museum
musem|museum
museiam|museum
statium|stadium
galary|gallery
stedum|stadium
computersoffice|computer office
bouth room|bathroom
both room|bathroom
bath room|bathroom
bathshower|bath shower
bustand|bus stand
light kouse|light house
secutity place|security place
digg|dig
gargen|garden
archbridge|arch bridge
haiway road|highway road
shipview|ship view
shoping|shopping
men|man
women|woman
natural|nature
tallbuildings|tall building
restround|restaurant
animalshome|animal home
kitchenroom|kitchen room
restarea|rest area
circu|circus
damagedstructure|damaged structure
publicsplace|public place
kittar|gittar
bigbuildings|big building
seminor|seminar
hourbor|harbour
fueal station|fuel station
damagedstructures|damaged structure
frontviewofbuildings|front view of building
orch|arch
fissures|fissure
veichile|vehicle
pigsty|pigpen
skyview|sky view
glasshouse|greenhouse
mandai|mandarin
searesearch|sea research
steelstructures|steel structure
gesthouse|guest house
varandha|veranda
borewell|borehole
newstand|news stand
seaview|sea view
cemetary|cemetery
telecaste|telecast
frontview|front view
railywaytrack|railway track
kovil|koil
gamesroom|game room
waittinghall|waiting hall
almirha|almirah
oldbuildings|old building
eatables|edible
scenary|scenery
piknic|picnic
greenary|greenery
viwepoint|viewpoint
'''
handConvert = {line.split('|')[0].strip():line.split('|')[1].strip()\
               for line in handConvertStr.split('\n') if len(line.strip()) > 0}
def handConvertFn(wordCnt):
    updateMap = {}
    for k, v in handConvert.iteritems():
        if k not in wordCnt:
            print 'WARNING - not used', k, '|', v
        else:
            updateMap[k]=v
    return updateMap
# Apply the hand-written corrections with the default warningThreshold of 50.
fullUpdate(handConvertFn)

In [None]:
# Second normalization pass after the hand-written corrections, using the
# default warningThreshold of 50 for every step.
fastUpdate(lowercaseStripMultipleSpaces)
# NOTE(review): splitPhrases is not defined anywhere in the visible part of this
# notebook; on a fresh kernel this call presumably raises NameError -- TODO
# confirm where it is defined.
fastUpdate(splitPhrases)
fastUpdate(removeStopWords)
fullUpdate(correctSpelling)

In [None]:
# Most frequent non-dictionary word
for k, v in getWordCnt().most_common():
    if wordFreq[k] < 1000 and v >= 100:
        print k, v, wordFreq[k]

In [None]:
# Most common ones
# Prints the 20 answers with the highest total count, one "answer count" pair per line.
for k, v in getWordCnt().most_common(20):
    print k, v

In [None]:
# Saving file
# One row per (category, description) pair with its count.
headers = ['category_id', 'description', 'count']
# Python 2's csv module expects binary mode; in text mode the writer's '\r\n'
# line terminator is translated again on platforms that rewrite newlines.
with open('data/cat_desc.csv', 'wb') as fout:
    csvWriter = csv.writer(fout)
    csvWriter.writerow(headers)
    # Sorted iteration makes the output file reproducible across runs instead
    # of depending on dict ordering.
    for word in sorted(wordCatCnt):
        dist = wordCatCnt[word]
        for cat in sorted(dist):
            csvWriter.writerow((cat, word, dist[cat]))