Cleaning up description dataset - grounding everything to wordnet
================================

In [1]:
from string import letters, punctuation, whitespace, printable, digits
import csv
import re
import string
from collections import Counter, defaultdict
from nltk.corpus import wordnet as wn
import numpy as np

In [2]:
# Load file
wordCatCnt = defaultdict(Counter)
rawDescHeaders = None
catSet = set()
with open('rawDesc.csv') as fin:
    csvReader = csv.reader(fin)
    for row in csvReader:
        if rawDescHeaders is None:
            rawDescHeaders = row
        else:
            img, cat, desc = row
            cat = int(cat) - 1
            wordCatCnt[desc][cat] += 1
            catSet.add(cat)
print rawDescHeaders
print range(len(catSet)) == sorted(list(catSet))
numCat = len(catSet)
wordVec = {}
for word, dist in wordCatCnt.iteritems():
    tmp = np.zeros(numCat, dtype=np.float_)
    for c, v in dist.iteritems():
        tmp[c] = v
    wordVec[word] = tmp

['image_filename', 'image_category_id', 'description']
True


In [3]:
# Result stockpile
wordVecPerfect = {}
s2h = lambda w: w.replace(' ', '_')
def filterPerfectDesc(cntThreshold=10):
    cntTrfr = 0
    cntTrfrSum = 0
    toRm = []
    for w in wordVec:
        tmp = wn.morphy(s2h(w))
        if tmp is not None and (tmp in wordVecPerfect or np.sum(wordVec[w]) >= cntThreshold):
            cntTrfr += 1
            cntTrfrSum += np.sum(wordVec[w])
            if tmp in wordVecPerfect:
                wordVecPerfect[tmp] += wordVec[w]
            else:
                wordVecPerfect[tmp] = wordVec[w]
            toRm.append(w)
    for w in toRm:
        del wordVec[w]
    print 'NEW PERFECT DESC', cntTrfr, cntTrfrSum
inWN = lambda w: wn.morphy(s2h(w)) is not None
phraseInWN = lambda p: all(inWN(s) for s in p.split())
wnStrip = lambda s: ' '.join([wn.morphy(s2h(x)) for x in s.split()])
toWN = lambda s: wn.morphy(s2h(s))
def statWV(wv):
    print len(wv), sum(np.sum(v) for v in wv.itervalues())

In [4]:
# Make string updates faster
def applyUpdates(updateMap, warningThreshold=50, cntThreshold=10):
    removalWarning = 0
    removalWarningSum = 0
    cntTrfr = 0
    cntTrfrSum = 0
    for w in updateMap:
        if w in wordVec:
            t = updateMap[w]
            if w == t: continue
            vec = wordVec[w]
            del wordVec[w]
            if np.sum(vec) >= warningThreshold:
                print w, np.sum(vec), '-->', t
            if t is None or len(t) == 0:
                removalWarning += 1
                removalWarningSum += np.sum(vec)
            else:
                tmp = wn.morphy(s2h(t))
                if tmp is not None and (tmp in wordVecPerfect or np.sum(vec) >= cntThreshold):
                    cntTrfr += 1
                    cntTrfrSum += np.sum(vec)
                    if tmp in wordVecPerfect:
                        wordVecPerfect[tmp] += vec
                    else:
                        wordVecPerfect[tmp] = vec
                elif t in wordVec:
                    wordVec[t] += vec
                else:
                    wordVec[t] = vec
        else:
            print 'Update rule is not applicable: ' + w + ' --> ' + updateMap[w]
    if removalWarning > 0:
        print 'WARNING - Removing entries:', removalWarning, removalWarningSum
    print 'NEW PERFECT DESC', cntTrfr, cntTrfrSum

def getWordCnt(wv):
    return Counter({w: np.sum(v) for w, v in wv.iteritems()})

def fastUpdate(fn, warningThreshold=50, enforce=True):
    updateMap = {w:fn(w) for w in wordVec.iterkeys()}
    if enforce: applyUpdates(updateMap, warningThreshold)

In [5]:
# Word similarity in terms of category distribution
# from scipy.stats import entropy # can't handle entropy(np.array([0, 1, 1]), np.array([0, 1, 1]))
def entropy(P, Q):
    mask = np.where(P != 0.0)
    _P = P[mask]
    _Q = Q[mask]
    return np.sum(_P * np.log(_P / _Q))
from numpy.linalg import norm
def JSD(P, Q):
    _P = P / float(norm(P, ord=1))
    _Q = Q / float(norm(Q, ord=1))
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

In [6]:
statWV(wordVec)
statWV(wordVecPerfect)

127934 5117528.0
0 0


In [7]:
def removeNonPrintableAnswers(w):
    if all(c in printable for c in w):
        return w
    else:
        return None
fastUpdate(removeNonPrintableAnswers)

filterPerfectDesc()

def removeNumbers(w):
    return w.translate(None, digits)
fastUpdate(removeNumbers)

def lowercaseStripMultipleSpaces(w):
    w = w.lower().strip()
    return re.sub(' +',' ', w)
fastUpdate(lowercaseStripMultipleSpaces, warningThreshold=10000)

def removeUrl(w):
    if w.startswith('http'):
        return None
    else:
        return w
fastUpdate(removeUrl)

# Remove prefix
prefixRemovalS = '''
i am giong to
i am going to
iam going to
i am seeing
i am watching
i want to live
iam seeing
seeing
this is
it is
look like
looks like
it's
its
'''
prefixBan = set(tuple(l.split()) for l in prefixRemovalS.split('\n') if l != '')
def removePrefixBan(w):
    s = w.split()
    trunc = 0
    for pattern in prefixBan:
        if tuple(s[:len(pattern)]) == tuple(pattern):
            trunc = max(trunc, len(pattern))
    return ' '.join(s[trunc:])
fastUpdate(removePrefixBan)

# Remove some stopwords
stopWordsS='a,an,the,some' # only include very confident ones
stopWords = set([w.lower().strip() for w in stopWordsS.split(',')])
def removeStopWords(w):
    s = w.split()
    ans = []
    for x in s:
        if x in stopWords:
            continue
        else:
            ans.append(x)
    return ' '.join(ans)
fastUpdate(removeStopWords)

NEW PERFECT DESC 0 0
NEW PERFECT DESC 3585 3598643.0
NEW PERFECT DESC 20 27.0
House 22415.0 --> house
Church 10324.0 --> church
Mountain 18483.0 --> mountain
Forest 19664.0 --> forest
Garden 10108.0 --> garden
Hall 23573.0 --> hall
Factory 20501.0 --> factory
Sea 10379.0 --> sea
Shop 11444.0 --> shop
Building 27362.0 --> building
NEW PERFECT DESC 5708 551648.0
NEW PERFECT DESC 0 0
i am going to the meeting hall 67.0 --> the meeting hall
iam seeing the church 60.0 --> the church
i am going to the stadium 70.0 --> the stadium
i am going to the garden 68.0 --> the garden
this is building 50.0 --> building
iam seeing the historic building 115.0 --> the historic building
this is a building 85.0 --> a building
i am going to the park 191.0 --> the park
iam seeing the building 75.0 --> the building
it is building 1386.0 --> building
i am going to the museum 67.0 --> the museum
i am going to the factory 118.0 --> the factory
iam seeing the house 91.0 --> the house
i am going to the hotel 230.0 

In [8]:
def replaceAmp(w):
    return w.replace('&', ' and ')
fastUpdate(replaceAmp)

def replaceNot(w):
    return w.replace("n't ", ' not ')
fastUpdate(replaceNot)

fastUpdate(lowercaseStripMultipleSpaces)

house & garden 71.0 --> house  and  garden
NEW PERFECT DESC 0 0
NEW PERFECT DESC 0 0
house  and  garden 71.0 --> house and garden
NEW PERFECT DESC 1 1.0


In [9]:
def handleApos(w):
    if "'" in w:
        return w.replace("children's", "child's").replace("s'", "'s")
    else:
        return w
fastUpdate(handleApos)

children's park 602.0 --> child's park
children's room 80.0 --> child's room
NEW PERFECT DESC 4 95.0


In [10]:
def handleHyphen(w):
    if '-' in w:
        if inWN(w.replace('-', '')):
            return w.replace('-', '')
        else:
            return w.replace('-', ' ')
    else:
        return w
fastUpdate(handleApos)
fastUpdate(lowercaseStripMultipleSpaces)

NEW PERFECT DESC 0 0
NEW PERFECT DESC 0 0


In [11]:
def replacePunctuation(w):
    s = [' ' if (c in punctuation) else c for c in w]
    return ''.join(s)
fastUpdate(replacePunctuation)

fastUpdate(lowercaseStripMultipleSpaces)

dentist's office 72.0 --> dentist s office
dinning room. 162.0 --> dinning room 
multi-storeyed building 109.0 --> multi storeyed building
bathroom. 50.0 --> bathroom 
child's park 602.0 --> child s park
NEW PERFECT DESC 27 71.0
dinning room  162.0 --> dinning room
bathroom  50.0 --> bathroom
NEW PERFECT DESC 170 393.0


In [12]:
# Try to remove additional space
def removeAddiSpace(w):
    s = w.split()
    if len(s) == 1:
        return w
    else:
        for i in xrange(len(s) - 1):
            tmp = wn.morphy(s2h(s[i]+s[i+1]))
            if tmp is not None:
                return ' '.join(s[:i] + [tmp] + s[i+2:])
        return w
fastUpdate(removeAddiSpace, warningThreshold=1000)
fastUpdate(removeAddiSpace)

guest house 15131.0 --> guesthouse
bath tub 1093.0 --> bathtub
super market 3049.0 --> supermarket
race track 1245.0 --> racetrack
sea shore 1765.0 --> seashore
show room 2153.0 --> showroom
book shop 1524.0 --> bookshop
under sea 1163.0 --> undersea
high way 1050.0 --> highway
play ground 39733.0 --> playground
play room 2260.0 --> playroom
rest room 1494.0 --> restroom
light house 3016.0 --> lighthouse
barber shop 2282.0 --> barbershop
water fall 1868.0 --> waterfall
farm house 1960.0 --> farmhouse
bath room 6268.0 --> bathroom
road side 2140.0 --> roadside
under water 1934.0 --> underwater
work shop 1221.0 --> workshop
guest room 2842.0 --> guestroom
wash basin 1197.0 --> washbasin
class room 8986.0 --> classroom
grass land 2413.0 --> grassland
bed room 26123.0 --> bedroom
play school 2745.0 --> playschool
air port 1362.0 --> airport
water falls 3665.0 --> waterfall
bull fight 1056.0 --> bullfight
wind mill 2384.0 --> windmill
store room 3613.0 --> storeroom
NEW PERFECT DESC 753 184

In [13]:
statWV(wordVec)
statWV(wordVecPerfect)

97121 775397.0
2754 4342045.0


In [14]:
# Modification distance. Credit to Peter Norvig
def editLetter(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in string.ascii_lowercase if b]
   inserts    = [a + c + b     for a, b in splits for c in string.ascii_lowercase]
   return set(deletes + transposes + replaces + inserts + [word])

def editLetter2(word):
    ret = set()
    for w in editLetter(word):
        ret.update(editLetter(w))
    return ret

def correctMisspelled1(w, jsdThreshold=0.2, cntThreshold=10):
    if phraseInWN(w):
        return w
    po = editLetter(w)
    po1 = set([x for x in po if phraseInWN(x)])
#     if len(po1) == 1:
#         return po1.pop()
    po2 = set([x for x in filter(None, map(toWN, po1))
           if x in wordVecPerfect and JSD(wordVec[w], wordVecPerfect[x]) <= jsdThreshold])
    if len(po2) == 1:
        return po2.pop()
    if len(po2) > 1:
        print '@1', w, list(po2)
        return w
    if np.sum(wordVec[w]) < cntThreshold:
        return w
    po = editLetter2(w)
    po1 = set([x for x in filter(None, map(toWN, po))
           if x in wordVecPerfect and JSD(wordVec[w], wordVecPerfect[x]) <= jsdThreshold])
    if len(po1) == 1:
        return po1.pop()
    if len(po1) > 1:
        print '@2', w, list(po1)
    return w
fastUpdate(correctMisspelled1, warningThreshold=1000)

fastUpdate(removeStopWords)
fastUpdate(removeAddiSpace)

@2 billiats ['billiard', 'billiards']
@1 soliders [u'solid', u'soldier']
@1 shoses ['shoes', u'shoe']
@1 saull ['skull', 'scull']
@1 airoplane ['airplane', 'aeroplane']
@2 sketting ['skating', u'scat']
@2 airopline ['airplane', 'aeroplane']
@1 skatting ['skating', u'scat']
@1 caf ['cab', 'cap']
@1 bons ['bones', 'bone', u'boon']
@2 barell ['barrels', 'barrel']
@1 shose ['shoe', 'shoes']
@1 fales [u'fall', 'falls']
@1 bolling ['rolling', 'bowling']
@1 sckulls [u'scull', u'skull']
@1 barrell ['barrels', 'barrel']
@1 teepe ['teepee', 'tepee']
@2 airoplan ['airplane', 'aeroplane']
@1 fals ['false', 'falls']
@1 barrells ['barrels', u'barrel']
@1 teppee ['teepee', 'tepee']
@1 shull ['skull', 'scull']
@1 shoces ['shoes', u'shoe']
@1 carousl ['carousal', 'carousel']
@1 jewelery ['jewelry', 'jewellery']
@1 modlling ['modeling', 'modelling']
@1 wresling [u'wrest', 'wrestling']
@2 carbur ['harbor', 'harbour']
bulinding 4871.0 --> building
resturant 6534.0 --> restaurant
mountion 4929.0 --> mounta

In [15]:
# Try to add space
def editSpace(word):
    return set([word[:i] + ' ' + word[i:] for i in range(1, len(word))])

def addSpace1(w, leastLength=3, jsdThreshold=0.2):
    if len(w) < leastLength * 2 or phraseInWN(w):
        return w
    po = editSpace(w)
    po = [x for x in po if min(map(len, x.split())) >= leastLength]
    po1 = set(filter(None, map(toWN, po)))
    if len(po1) == 1:
        return po1.pop()
    po2 = set([x for x in po1 if x in wordVecPerfect and JSD(wordVec[w], wordVecPerfect[x]) <= jsdThreshold])
    if len(po2) == 1:
        return po2.pop()
    if len(po2) > 1:
        print w, po2
    return w
fastUpdate(addSpace1)

fastUpdate(removeStopWords)
filterPerfectDesc()

paddyfield 61.0 --> paddy_field
themepark 97.0 --> theme_park
restarea 104.0 --> rest_area
wallpaintings 84.0 --> wall_painting
shoppingmall 177.0 --> shopping_mall
phonebooth 250.0 --> phone_booth
waitingroom 559.0 --> waiting_room
telephonebooth 225.0 --> telephone_booth
diningroom 95.0 --> dining_room
tajmahal 57.0 --> taj_mahal
coolingtower 51.0 --> cooling_tower
controlroom 104.0 --> control_room
treehouse 269.0 --> tree_house
drawingroom 55.0 --> drawing_room
sitout 287.0 --> sit_out
receptionroom 51.0 --> reception_room
officebuildings 96.0 --> office_building
washingmachine 104.0 --> washing_machine
watertank 95.0 --> water_tank
flowergarden 105.0 --> flower_garden
townhouse 64.0 --> town_house
livingroom 782.0 --> living_room
carrace 68.0 --> car_race
horserace 158.0 --> horse_race
swimmingpool 1537.0 --> swimming_pool
frontyard 425.0 --> front_yard
busstop 148.0 --> bus_stop
railwaystation 166.0 --> railway_station
gameroom 154.0 --> game_room
firepit 77.0 --> fire_pit
dining

In [16]:
statWV(wordVec)
statWV(wordVecPerfect)

95021 663241.0
2950 4454201.0


In [17]:
# Remove overly short words (<3 letters)
for desc in filter(lambda s: len(s) < 3, wordVecPerfect.keys()):
    del wordVecPerfect[desc]

In [18]:
# Remove single stop words and junks
stopWordsS2 = 'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,\
cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,\
i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,nor,of,off,often,on,\
only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,\
this,tis,to,too,twas,us,want,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,\
your,none,unknown,no'
stopWords2 = [s.strip() for s in stopWordsS2.split(',') if len(s.strip()) > 0]
for w in stopWords2:
    if w in wordVecPerfect:
        del wordVecPerfect[w]

In [19]:
statWV(wordVecPerfect)
# Saving file
headers = ['category_id', 'description', 'count']
with open('cat_desc_wn.csv', 'w') as fout:
    csvWriter = csv.writer(fout)
    csvWriter.writerow(headers)
    for word, vec in wordVecPerfect.iteritems():
        for c in xrange(numCat):
            v = int(round(vec[c]))
            if v == 0: continue
            csvWriter.writerow((str(c + 1), word, str(v)))

2917 4450878.0
