In [2]:
import pandas as pd
import numpy as np
import json
import csv
import requests
import haversine

import sys
import re
import time
# pynlpl is used here for n-gram frequency counting and Levenshtein distance calculations
import pynlpl
from pynlpl.statistics import FrequencyList

## Cleanse the data and remove all extraneous information

#### Cleansing steps:
Replace each of these characters with a space: Period | ( | ) | - | : (the code also treats # | " | , the same way)
Collapse runs of spaces into a single space
Lower case the text

### Expanding and Standardising

#### Abbreviations expanded:
rd: road
ngr: nagar
nr: near
apt: apartment
apts: apartment
opp: opposite
extn: extension

In [3]:
# Abbreviation -> expansion table. Keys include their surrounding delimiters
# (space / comma) so that only delimited tokens are replaced, never substrings
# of longer words.
expansions_dict = {
    ' rd,': ' road,', ' rd ': ' road ',
    ' apts,': ' apartment,', ' apts ': ' apartment ',
    ' apt,': ' apartment,', ' apt ': ' apartment ',
    ' appts,': ' apartment,', ' appts ': ' apartment ',
    'apartments': 'apartment',
    ' ngr,': ' nagar,', ' ngr ': ' nagar ',
    ',opp ': ',opposite ', ' opp ': ' opposite ',
    ',nr ': ',near ', ' nr ': ' near ',
    ' extn,': ' extension,', ' extn ': ' extension ',
    ' & ': ' and ', '&': ' and ',
    # drops the leading space so the suffix is glued to the preceding token
    ' th ': 'th ',
}
# Every key is escaped so it is always matched literally; without this, adding
# a key that contains a regex metacharacter (e.g. 'st.') would silently change
# what the pattern matches. Alternation order follows the dict's insertion order.
expansions_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in expansions_dict))

In [4]:
def expand_expansions(s, expansions_dict=expansions_dict):
    """Replace every abbreviation found in ``s`` with its expansion.

    Parameters
    ----------
    s : str
        Text to process.
    expansions_dict : dict, optional
        Mapping of literal substrings to their replacements. Defaults to the
        module-level table.

    Returns
    -------
    str
        ``s`` with every matched key replaced by its mapped value.
    """
    if expansions_dict is expand_expansions.__defaults__[0]:
        # Default table: reuse the pattern that was compiled once at module level.
        pattern = expansions_re
    else:
        # A caller-supplied table needs its own pattern; matching with the
        # module-level one would look up keys the custom table may not contain.
        if not expansions_dict:
            return s
        pattern = re.compile('(%s)' % '|'.join(re.escape(key) for key in expansions_dict))

    def replace(match):
        return expansions_dict[match.group(0)]

    return pattern.sub(replace, s)

In [5]:
# Whole-word Roman numeral. The pattern is compiled without re.IGNORECASE, so
# only UPPER-case numerals match; lower-cased text is never matched.
romanSearch = re.compile(r'\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\b')

def roman_to_int(inputRoman):
    """Convert a Roman numeral string to its integer value.

    Parameters
    ----------
    inputRoman : str
        Roman numeral in any letter case.

    Returns
    -------
    int
        The decoded value; ``0`` for an empty string. Positions whose symbol
        (or whose following symbol) is not a Roman digit contribute nothing.
    """
    inputRoman = inputRoman.upper()
    nums = {'M': 1000, 'D': 500, 'C': 100, 'L': 50, 'X': 10, 'V': 5, 'I': 1}
    result = 0
    for i, symbol in enumerate(inputRoman):
        try:
            value = nums[symbol]
            # Subtractive notation: a symbol followed by a larger one is negative.
            if i + 1 < len(inputRoman) and nums[inputRoman[i + 1]] > value:
                result -= value
            else:
                result += value
        except KeyError:
            # Best effort: skip positions involving a non-Roman character.
            # Only KeyError is caught so interrupts and real bugs still surface.
            continue
    return result


def roman_int_regex(inputAddr):
    """Return ``inputAddr`` with each ``romanSearch`` match replaced by its decimal value."""
    # add any other strings you don't want to replace
    keepAsIs = set(["LLC"])

    def _toDecimal(found):
        token = found.group(0)
        return token if token in keepAsIs else str(roman_to_int(token))

    return romanSearch.sub(_toDecimal, inputAddr)

#### Combining cleaning steps

In [6]:
def cleanAndExpand(inputDF):
    """Add a ``cleanAddress`` column derived from ``customerAddress``.

    Each address is lower-cased, the characters ``- # . ( : ) " ,`` are
    replaced by spaces, runs of spaces are collapsed, abbreviations are
    expanded and Roman numerals are passed through ``roman_int_regex``.

    Parameters
    ----------
    inputDF : pandas.DataFrame
        Must contain a string column ``customerAddress``.

    Returns
    -------
    pandas.DataFrame
        The same ``inputDF`` object, modified in place.
    """
    cleaned = []
    # Iterate over the column values directly: positional order is the same as
    # walking the index, but it also works when the index has duplicate labels
    # (where `.loc[ix, col]` returns a Series instead of a string).
    for rawAddress in inputDF['customerAddress']:
        address = re.sub('[-#.(:)",]', ' ', rawAddress.lower())
        address = re.sub(' +', ' ', address)
        address = expand_expansions(address)
        # NOTE(review): the text is already lower-cased here, while romanSearch
        # is compiled without re.IGNORECASE, so this step currently leaves the
        # text unchanged - confirm whether that is intended.
        cleaned.append(roman_int_regex(address))
    inputDF['cleanAddress'] = cleaned
    return inputDF
    

In [7]:
def cleanRoundData(inputDF):
    """Clean the addresses and add delivery coordinates rounded to 4 decimals.

    Calls ``cleanAndExpand`` on ``inputDF`` and then stores
    ``deliveredLat`` / ``deliveredLng`` rounded to 4 decimal places in
    ``del_latRounded4`` / ``del_lngRounded4``. Returns the resulting frame.
    """
    prepared = cleanAndExpand(inputDF)
    roundedColumns = (
        ('deliveredLat', 'del_latRounded4'),
        ('deliveredLng', 'del_lngRounded4'),
    )
    for sourceCol, roundedCol in roundedColumns:
        prepared[roundedCol] = np.round(prepared[sourceCol], 4)
    return prepared

#### Attempt to parallelise the code

In [8]:
import multiprocessing
from functools import reduce
import operator

In [9]:
def getBiGrams(inputAddress):
    """Return the word bigrams of ``inputAddress``.

    The address is split on ``,`` into phrases; within each stripped phrase,
    every pair of adjacent space-separated tokens is joined with one space.
    Bigrams never span two phrases. Phrases with a single token yield nothing.

    Parameters
    ----------
    inputAddress : str

    Returns
    -------
    list of str
    """
    biGrams = []
    for phrase in inputAddress.split(','):
        # Split once per phrase instead of once per generated bigram.
        words = phrase.strip().split(" ")
        biGrams.extend(first + " " + second for first, second in zip(words, words[1:]))
    return biGrams


In [10]:
def getTriGrams(inputAddress):
    """Return the word trigrams of ``inputAddress``.

    The address is split on ``,`` into phrases; within each stripped phrase,
    every run of three adjacent space-separated tokens is joined with single
    spaces. Trigrams never span two phrases. Phrases with fewer than three
    tokens yield nothing.

    Parameters
    ----------
    inputAddress : str

    Returns
    -------
    list of str
    """
    triGrams = []
    for phrase in inputAddress.split(','):
        # Split once per phrase instead of once per generated trigram.
        words = phrase.strip().split(" ")
        triGrams.extend(" ".join(words[i:i + 3]) for i in range(len(words) - 2))
    return triGrams


In [11]:
# for the given n-grams, get their frequencies
def getFreqList(biGrams):
    """Return a pynlpl ``FrequencyList`` to which the n-grams in ``biGrams`` were appended."""
    counts = FrequencyList()
    counts.append(biGrams)
    return counts

In [12]:
# get n-grams within 1 Levenshtein Distance
def getNearestNgramsDict(freqlist, maxDistance=1):
    """Sum, for every n-gram, the frequencies of all n-grams close to it.

    Parameters
    ----------
    freqlist : pynlpl.statistics.FrequencyList
        Frequencies of the n-grams; read through ``freqlist.dict()`` and
        ``freqlist[key]``.
    maxDistance : int, optional
        Largest Levenshtein distance at which two n-grams count as
        neighbours. Defaults to 1.

    Returns
    -------
    dict
        Maps each n-gram to the summed frequency of every n-gram (itself
        included when ``maxDistance >= 0``) within ``maxDistance`` of it.
    """
    nGramKeys = list(freqlist.dict().keys())
    # Distance from an n-gram to itself is 0, so its own count is the baseline.
    nearestnGramsDict = {
        key: (freqlist[key] if maxDistance >= 0 else 0) for key in nGramKeys
    }
    # Visit each unordered pair once and credit both sides, rather than
    # materialising the full n x n distance table.
    # Assumes pynlpl.statistics.levenshtein is the standard edit distance:
    # symmetric, and never smaller than the difference in string lengths.
    for i in range(len(nGramKeys)):
        first = nGramKeys[i]
        for j in range(i + 1, len(nGramKeys)):
            second = nGramKeys[j]
            if abs(len(first) - len(second)) > maxDistance:
                continue  # cannot be within maxDistance; skip the costly call
            if pynlpl.statistics.levenshtein(first, second) <= maxDistance:
                nearestnGramsDict[first] += freqlist[second]
                nearestnGramsDict[second] += freqlist[first]
    return nearestnGramsDict
    

In [13]:
# input is data file and indices of the rounded lat lngs for processing
# creates bigrams and counts including bigrams within 1 LD
def createNearestNGrams(kom_licious_cleanAddress, cutoffLocs):
    """Build per-location bigram frequencies and near-duplicate counts.

    Parameters
    ----------
    kom_licious_cleanAddress : pandas.DataFrame
        Needs the columns ``cleanAddress``, ``del_latRounded4`` and
        ``del_lngRounded4``.
    cutoffLocs : pandas.DataFrame
        One row per rounded location to process, with the columns
        ``del_latRounded4`` and ``del_lngRounded4``.

    Returns
    -------
    tuple
        ``(nearestNgramsListofLists, freqListofLists)``; entry ``i`` of each
        list belongs to the ``i``-th row of ``cutoffLocs``.

    Prints the elapsed wall-clock time in seconds.
    """
    stime = time.time()
    biGramListofLists = []
    # The context manager shuts the worker processes down when the block is
    # left, so repeated calls do not accumulate idle workers.
    with multiprocessing.Pool() as pool_bgoutput:
        for ix in cutoffLocs.index:
            sameLocation = (
                (kom_licious_cleanAddress.del_latRounded4 == cutoffLocs.loc[ix, 'del_latRounded4'])
                & (kom_licious_cleanAddress.del_lngRounded4 == cutoffLocs.loc[ix, 'del_lngRounded4'])
            )
            addresses = kom_licious_cleanAddress.loc[sameLocation, 'cleanAddress'].values
            bgoutput = pool_bgoutput.map(getBiGrams, addresses)
            # Flatten the per-address lists; unlike reduce() without an initial
            # value this also works when no address matched the location.
            biGramListofLists.append(
                [biGram for perAddress in bgoutput for biGram in perAddress]
            )
        freqListofLists = pool_bgoutput.map(getFreqList, biGramListofLists)
        nearestNgramsListofLists = pool_bgoutput.map(getNearestNgramsDict, freqListofLists)

    etime = time.time()
    print(etime - stime)
    return (nearestNgramsListofLists, freqListofLists)

In [14]:
# creates a file of bigrams and their counts. for debug essentially
def getBigramsFile(kom_licious_cleanAddress, cutoffLocs, nearestNgramsListofLists, freqListofLists, outputFile):
    """Write one CSV row per (rounded location, bigram) pair to ``outputFile``.

    Parameters
    ----------
    kom_licious_cleanAddress : pandas.DataFrame
        Needs the columns ``del_latRounded4`` and ``del_lngRounded4``.
    cutoffLocs : pandas.DataFrame
        Locations to report, with ``del_latRounded4`` / ``del_lngRounded4``.
    nearestNgramsListofLists : list of dict
        Entry ``i`` maps each bigram of the ``i``-th location to its count
        including nearby bigrams.
    freqListofLists : list
        Entry ``i`` gives the plain count of each bigram at the ``i``-th
        location (indexed by bigram).
    outputFile : str
        Path of the CSV file to create (overwritten if present).
    """
    fields = ['lat', 'lng', 'biGram', 'latLngOccurence', 'biGramCount',
              'nearbyBiGramCount', 'confidence%', 'index']
    # newline='' is what the csv module asks for on files it writes; without
    # it some platforms emit an extra blank line after every row.
    with open(outputFile, 'w', newline='') as outFile:
        writer = csv.DictWriter(outFile, fieldnames=fields)
        writer.writeheader()
        for i, ix in enumerate(cutoffLocs.index):
            # Values that are the same for every bigram of this location.
            lat = cutoffLocs.loc[ix, 'del_latRounded4']
            lng = cutoffLocs.loc[ix, 'del_lngRounded4']
            selIndex = kom_licious_cleanAddress[
                (kom_licious_cleanAddress.del_latRounded4 == lat)
                & (kom_licious_cleanAddress.del_lngRounded4 == lng)
            ].index
            latLngOccurence = len(selIndex)

            for key in nearestNgramsListofLists[i].keys():
                nearbyCount = nearestNgramsListofLists[i][key]
                writer.writerow({
                    'lat': lat,
                    'lng': lng,
                    'biGram': key,
                    'latLngOccurence': latLngOccurence,
                    'biGramCount': freqListofLists[i][key],
                    'nearbyBiGramCount': nearbyCount,
                    'index': ix,
                    'confidence%': 100 * nearbyCount / latLngOccurence,
                })

In [15]:
# given a bigram, it finds other bigrams within a distance threshold
# and computes confidence of that bigram
def getAggBiGram(biGramVal, maxDriftKm=0.15):
    """Pick the best location for ``biGramVal`` and aggregate its neighbours.

    Reads the module-level DataFrame ``testbiGrams`` (columns ``biGram``,
    ``lat``, ``lng``, ``biGramCount``, ``latLngOccurence``,
    ``nearbyBiGramCount``). Each row of the bigram is tried as a centre; rows
    of the same bigram within ``maxDriftKm`` of it are aggregated. The centre
    that is kept is the last one whose aggregate count and nearby count are
    both at least as large as those of the previously kept centre.

    Parameters
    ----------
    biGramVal : str
        The bigram to aggregate.
    maxDriftKm : float, optional
        Largest haversine distance between centre and neighbour, in km.
        Defaults to 0.15.

    Returns
    -------
    dict
        Keys ``del_lat``, ``del_lng``, ``latLngOccurence``,
        ``biGramOccurence``, ``biGramCount``, ``nearbyBiGramCount``,
        ``aggregateBiGramCount``, ``agg_lat``, ``agg_lng``, ``biGram`` and
        ``confidence%``; empty when ``testbiGrams`` has no row for the bigram.
    """
    tmp = {}
    thresh_nearby = 0
    thresh_aggregate = 0
    # Filter the frame once; the selection and the total are loop-invariant.
    matches = testbiGrams[testbiGrams['biGram'] == biGramVal]
    biGramOccurence = matches['biGramCount'].sum()
    for srcInd in matches.index:
        srcPoint = (matches.loc[srcInd, 'lat'], matches.loc[srcInd, 'lng'])
        srcNearby = matches.loc[srcInd, 'nearbyBiGramCount']
        cntr = 0
        llcntr = 0
        latList = []
        lngList = []
        # go through all entries found for the given bigram
        for destInd in matches.index:
            destLat = matches.loc[destInd, 'lat']
            destLng = matches.loc[destInd, 'lng']
            drift = haversine.haversine(srcPoint, (destLat, destLng))
            if drift <= maxDriftKm:  # collate all entries within the threshold
                latList.append(destLat)
                lngList.append(destLng)
                cntr += matches.loc[destInd, 'biGramCount']
                llcntr += matches.loc[destInd, 'latLngOccurence']
        # idea is to pick the best bigram-latLng combo in terms of frequency
        # note to self and Sagar: This bit can be changed to output all bigram-latLng combo
        # and make a choice at another step based on confidence and frequency count.
        if (cntr >= thresh_aggregate) and (srcNearby >= thresh_nearby):
            tmp['del_lat'] = srcPoint[0]
            tmp['del_lng'] = srcPoint[1]
            tmp['latLngOccurence'] = llcntr
            tmp['biGramOccurence'] = biGramOccurence
            tmp['biGramCount'] = matches.loc[srcInd, 'biGramCount']
            tmp['nearbyBiGramCount'] = srcNearby
            tmp['aggregateBiGramCount'] = cntr
            tmp['agg_lat'] = np.mean(latList)
            tmp['agg_lng'] = np.mean(lngList)
            tmp['biGram'] = biGramVal
            tmp['confidence%'] = 100 * cntr / biGramOccurence
            thresh_aggregate = cntr
            thresh_nearby = srcNearby
    return tmp
                

#### bigrams - example

In [16]:
import re
from unidecode import unidecode
# Characters removed when building the de-duplication key. Inside the class,
# `,-.` is a range from ',' to '.', which happens to cover exactly ',', '-'
# and '.'; `\n` and `\r` are already included in `\s`.
cleanPattern = re.compile(r'[\s,-.\'"/\\\n\r]')
# A '#' immediately before or immediately after a digit.
hashNumber = re.compile(r'#(?=\d)|(?<=\d)#')
# A single line-feed or carriage-return character.
newLine = re.compile(r'[\n\r]')

In [1]:
# The data has to be extracted first, similar to what was done in repeatRegionCreation.
# Ensure that the column names are correct: customerAddress, deliveredLat and deliveredLng are the 3 columns we need.
# Rename columns accordingly.

In [29]:
# Load the raw export.
# NOTE(review): `sourceDataFile` is not assigned in any visible cell; it has to
# be defined before this cell is run.
testFile = pd.read_csv(sourceDataFile)

In [30]:
# Drop repeated addresses so the n-grams they contain are not over-counted.
# Line breaks become ', ' first; the comparison key is then the lower-cased,
# punctuation- and whitespace-free, ASCII-transliterated address.
testFile['tmpAddress'] = [
    newLine.sub(', ', rawAddress) for rawAddress in testFile['Customer Address']
]
testFile['cleanedAddress'] = [
    unidecode(hashNumber.sub('', cleanPattern.sub('', oneLine.lower())))
    for oneLine in testFile['tmpAddress']
]
testFile = testFile.drop_duplicates(subset='cleanedAddress').reset_index().copy()


In [32]:
# Give the raw columns the names the cleaning helpers expect.
testFile.rename(
    columns={
        "Completed Lat": "deliveredLat",
        "Completed Lng": "deliveredLng",
        "Customer Address": "customerAddress",
    },
    inplace=True,
)

In [37]:
testFileCleaned = cleanRoundData(testFile)
# Number of cleaned addresses at each rounded (lat, lng) location.
testFileGrouped = (
    testFileCleaned
    .groupby(['del_latRounded4', 'del_lngRounded4'])['cleanAddress']
    .count()
    .reset_index()
    .copy()
)
# min number of addresses per rounded lat lng for next step
cutoffLocs = testFileGrouped[testFileGrouped['cleanAddress'] >= 1].copy()

In [40]:
# Compute the bigram frequencies (and near-duplicate counts) for every
# shortlisted location, then write them to a CSV file.
# NOTE(review): `bigramsFileOuputLocation` is not assigned in any visible cell.
nearestNGrams, freqList = createNearestNGrams(testFileCleaned, cutoffLocs)
getBigramsFile(testFileCleaned, cutoffLocs, nearestNGrams,freqList, bigramsFileOuputLocation)


In [41]:
# Reload the per-location bigram counts; getAggBiGram reads this global frame.
testbiGrams = pd.read_csv(bigramsFileOuputLocation)
# Total count of every bigram across all locations.
biGramTotalOccurence = (
    testbiGrams
    .groupby('biGram')['biGramCount']
    .sum()
    .reset_index()
    .copy()
)
# min frequency of the bigram for further processing
shortListBiGrams = biGramTotalOccurence[biGramTotalOccurence['biGramCount'] >= 10].copy()

In [42]:
# Aggregate every shortlisted bigram in parallel and write the result as CSV.
# The pool is used as a context manager so its workers are shut down once the
# block is left, instead of staying alive for the rest of the session.
with multiprocessing.Pool() as pool_bgoutput:
    stime = time.time()
    # newline='' is what the csv module asks for on files it writes.
    with open(bigramsAggregationFileOutputLocation, 'w', newline='') as inFile:
        fields = ['del_lat', 'del_lng', 'agg_lat', 'agg_lng',
                  'biGram', 'latLngOccurence', 'biGramOccurence',
                  'biGramCount', 'nearbyBiGramCount', 'aggregateBiGramCount', 'confidence%']
        writer = csv.DictWriter(inFile, fieldnames=fields)
        writer.writeheader()
        output = pool_bgoutput.map(getAggBiGram, list(shortListBiGrams['biGram'].values))
        writer.writerows(output)
    etime = time.time()
print(etime - stime)

0.5675580501556396


In [99]:
def aggregateBigrams(shortListBiGrams, outputFile):
    """Aggregate each shortlisted bigram in parallel and write the result as CSV.

    Parameters
    ----------
    shortListBiGrams : pandas.DataFrame
        Needs a ``biGram`` column; ``getAggBiGram`` is called once per value.
    outputFile : str
        Path of the CSV file to create (overwritten if present).

    Prints the elapsed wall-clock time in seconds (pool start-up excluded).
    """
    fields = ['del_lat', 'del_lng', 'agg_lat', 'agg_lng',
              'biGram', 'latLngOccurence', 'biGramOccurence',
              'biGramCount', 'nearbyBiGramCount', 'aggregateBiGramCount', 'confidence%']
    biGramValues = list(shortListBiGrams['biGram'].values)
    # The context manager shuts the worker processes down when the block is
    # left, so repeated calls do not accumulate idle workers.
    with multiprocessing.Pool() as pool_bgoutput:
        stime = time.time()
        # newline='' is what the csv module asks for on files it writes.
        with open(outputFile, 'w', newline='') as outFile:
            writer = csv.DictWriter(outFile, fieldnames=fields)
            writer.writeheader()
            writer.writerows(pool_bgoutput.map(getAggBiGram, biGramValues))
        etime = time.time()
    print(etime - stime)

In [137]:
# created a single function to combine all the above steps
def learn(gg3monthsNoRepeats, bigramsFile, aggregateBiGramsFile, learntFile):
    """Run the whole learning pipeline and write the learnt-regions file.

    Steps: clean and round the input, count bigrams per rounded location,
    write them to ``bigramsFile``, aggregate the frequent bigrams into
    ``aggregateBiGramsFile`` and finally write the confident ones to
    ``learntFile`` as ``biGram | del_lat | del_lng | confidence | count`` lines.

    Parameters
    ----------
    gg3monthsNoRepeats : pandas.DataFrame
        Needs ``customerAddress``, ``deliveredLat`` and ``deliveredLng``.
    bigramsFile : str
        Path of the intermediate per-location bigram CSV (overwritten).
    aggregateBiGramsFile : str
        Path of the intermediate aggregated bigram CSV (overwritten).
    learntFile : str
        Path of the final text file (overwritten).

    Side effect: rebinds the module-level ``testbiGrams``.
    """
    # getAggBiGram reads the module-level `testbiGrams`. Without this
    # declaration the frame loaded below would be a local variable and the
    # aggregation would run on whatever an earlier cell left in the global.
    # The rebinding happens before aggregateBigrams creates its worker pool.
    global testbiGrams

    gg3monthsNoRepeats = cleanRoundData(gg3monthsNoRepeats)
    gg3monthsNoRepeatsGrouped = gg3monthsNoRepeats.groupby(['del_latRounded4', 'del_lngRounded4'])['cleanAddress'].count().reset_index().copy()
    gg3monthsNoRepeatscutoffLocs = gg3monthsNoRepeatsGrouped[gg3monthsNoRepeatsGrouped.cleanAddress >= 1].copy()
    nearestNGrams, freqList = createNearestNGrams(gg3monthsNoRepeats, gg3monthsNoRepeatscutoffLocs)
    getBigramsFile(gg3monthsNoRepeats, gg3monthsNoRepeatscutoffLocs, nearestNGrams, freqList, bigramsFile)
    testbiGrams = pd.read_csv(bigramsFile)
    biGramTotalOccurence = testbiGrams.groupby('biGram')['biGramCount'].sum().reset_index().copy()
    # min frequency of the bigram for further processing
    shortListBiGrams = biGramTotalOccurence[biGramTotalOccurence.biGramCount >= 10].copy()
    # start aggregating bi-grams
    aggregateBigrams(shortListBiGrams, aggregateBiGramsFile)
    # write aggregate bigrams into standard format
    bigramfile = pd.read_csv(aggregateBiGramsFile)
    # The aggregation CSV names the column 'confidence%', which is not reachable
    # as an attribute; both filters are combined in one mask instead of chained
    # indexing.
    confident = bigramfile[
        (bigramfile['confidence%'] > 80) & (bigramfile['aggregateBiGramCount'] > 10)
    ]
    outputColumns = ['biGram', 'del_lat', 'del_lng', 'confidence%', 'aggregateBiGramCount']
    with open(learntFile, 'w') as outFile:
        for ix in confident.index:
            outFile.write(' | '.join(str(confident.loc[ix, col]) for col in outputColumns) + '\n')

In [109]:
# learntFile is the txt file in the format that backend consumes learnt regions
# NOTE(review): `gg3months` and `learntFileLocation` are not assigned in any
# visible cell; they have to be defined before this cell is run.
learn(gg3months, bigramsFileOuputLocation, bigramsAggregationFileOutputLocation, learntFileLocation)

228.108323097229
254.50580501556396


  from ipykernel import kernelapp as app
