In [15]:
import requests
import pandas as pd
from unidecode import unidecode
import json
from haversine import haversine
import numpy as np
from datetime import datetime
import re
from requests.auth import HTTPBasicAuth
import time
import sys, csv

In [16]:
def get_dist_mat(taskLats, taskLngs, vehLats, vehLngs):
    """Return the pairwise haversine distance matrix, in metres, between two point sets.

    Parameters
    ----------
    taskLats, taskLngs : array-like of float
        Latitudes and longitudes (decimal degrees) of the "task" points.
    vehLats, vehLngs : array-like of float
        Latitudes and longitudes (decimal degrees) of the "vehicle" points.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vehLats), len(taskLats))``; element ``[l, k]`` is the
        great-circle distance between vehicle point ``l`` and task point ``k``.
        The result is always 2-D, also when one of the inputs is empty.

    Raises
    ------
    ValueError
        If a latitude sequence and its longitude sequence differ in shape.
    """
    s_lat = np.deg2rad(np.asarray(taskLats, dtype=float))
    s_lng = np.deg2rad(np.asarray(taskLngs, dtype=float))
    e_lat = np.deg2rad(np.asarray(vehLats, dtype=float))
    e_lng = np.deg2rad(np.asarray(vehLngs, dtype=float))

    if s_lat.shape != s_lng.shape:
        raise ValueError("taskLats and taskLngs must have the same length")
    if e_lat.shape != e_lng.shape:
        raise ValueError("vehLats and vehLngs must have the same length")

    # Broadcasting: task points run along the columns, vehicle points along the rows.
    s_lat, s_lng = s_lat[np.newaxis, :], s_lng[np.newaxis, :]
    e_lat, e_lng = e_lat[:, np.newaxis], e_lng[:, np.newaxis]

    R = 6371.0088  # Earth radius in kilometres used by the haversine formula below

    d = np.sin((e_lat - s_lat)/2)**2 + np.cos(s_lat)*np.cos(e_lat) * np.sin((e_lng - s_lng)/2)**2

    # Rounding can push d marginally outside [0, 1] for (near-)antipodal points,
    # which would make arcsin return NaN; clamp before taking the root.
    d = np.clip(d, 0.0, 1.0)

    # Factor 1000 converts kilometres to metres.
    return 2 * 1000 * R * np.arcsin(np.sqrt(d))

In [None]:
# Run configuration: which client/team to extract and for which date range.
client_id = 'gourmetgarden'
start_date = '2020-08-01'
end_date = '2020-10-30'
team_id = 'bengaluru'

# File locations used by the example cells further down. They were previously only
# present as leftover kernel state, so a fresh "Restart & Run All" failed with NameError.
# data_dir must match the `path` argument given to extractTeamData.
data_dir = './gourmetgarden/'
# Same naming scheme that extractTeamData uses for the CSV it writes.
sourceDataFile = data_dir + 'latLngData_' + str(team_id) + '_' + str(client_id) + '_' + str(start_date) + '_' + str(end_date) + '.csv'
# Suggested default name for the learnt-address text file; adjust as needed.
outputTxtFile = data_dir + 'learntAddresses_' + str(team_id) + '_' + str(client_id) + '_' + str(start_date) + '_' + str(end_date) + '.txt'

In [4]:
# The address components are concatenated while extracting the data itself, because
# dtypes were changing later on: pincodes sometimes became floats and got a ".0" appended.
# rawAddress, city and pincode are concatenated here. If a client has other components
# (locality, state, etc.), please add them, keeping the order given below:
# 1. rawAddress 2. Locality 3. subLocality 4. City 5. State 6. Pincode

def _taskToRow(task):
    """Flatten one task payload into a CSV row dict.

    Returns None when the task status carries no completed location. Any missing or
    malformed field raises (AttributeError / IndexError / TypeError / ValueError); the
    caller treats that as "skip this task".
    """
    status = task.get("status")
    completedLocation = status.get("location")
    if completedLocation is None:
        return None

    # All address/geocode fields are read from the first location option of visits[1].
    # NOTE(review): visits[1] is presumably the drop/delivery visit - confirm against the API.
    visitOption = task.get("taskGraph").get("visits")[1].get("locationOptions")[0]
    latLng = visitOption.get("geometry").get("latLng")
    address = visitOption.get("locationAddress")
    geocoding = visitOption.get("geocodingMetadata")

    triggerTime = status.get("triggerTime")
    # The last five characters are dropped before parsing (presumably the ".000Z" suffix).
    # NOTE(review): date/hour are therefore taken from the timestamp as sent, without any
    # timezone conversion - confirm this is intended.
    completedAt = pd.to_datetime(triggerTime[:-5])

    return {
        "Created On": task.get("creationTime"),
        "Task Id": task.get("taskId"),
        "Team Name": task.get("carrierTeams")[0].get("teamId"),
        "Order Completion Time": triggerTime,
        "Geocoded Lat": latLng.get("lat"),
        "Geocoded Lng": latLng.get("lng"),
        "Completed Lat": completedLocation.get("lat"),
        "Completed Lng": completedLocation.get("lng"),
        "Customer Address": address.get("formattedAddress"),
        "Source": geocoding.get("provider"),
        "Pincode": address.get("pincode"),
        "City": address.get("city"),
        "countryCode": address.get("countryCode"),
        "confidence": geocoding.get("confidence"),
        "date": completedAt.date(),
        "hour": completedAt.hour,
        # Plain string concatenation: a missing or non-string component raises TypeError
        # and the task is skipped, so every written row has a complete concatAddress.
        "concatAddress": address.get("formattedAddress") + ' , ' + address.get("city") + ' , ' + address.get("pincode"),
    }


def extractTeamData(client_id, start_date, end_date, team_id, path):
    """Download the COMPLETED tasks of one team and write them to a CSV file.

    The task API is paged through with ``pageToken`` until no token is returned. Each task
    is flattened to one row with the columns listed in ``sheetColumns``. Tasks that have no
    completed location, or whose payload is missing a required field, are skipped and counted.

    Parameters
    ----------
    client_id : str
        Client id; used in the API URLs and in the output file name.
    start_date, end_date : str
        Date range, formatted like '2020-08-01'. They are inserted into the query as
        ``{start_date}T00:00:00.000+0530`` and ``{end_date}T23:59:59.000+0530``.
    team_id : str
        Carrier team whose tasks are fetched.
    path : str
        Prefix for the output file; it is concatenated as-is, so a directory needs a
        trailing slash.

    Returns
    -------
    str
        Path of the CSV file that was written.

    Raises
    ------
    requests.HTTPError
        If a task request returns a 4xx/5xx status.

    Notes
    -----
    Credentials are read from the environment variables LOCUS_API_USER, LOCUS_API_PASSWORD
    and LOCUS_WRITE_ACCESS_AUTH; the placeholders below are only fallbacks and must never be
    replaced with real secrets in the notebook.
    """
    import os  # local import: only needed for the credential lookup below

    auth = HTTPBasicAuth(os.environ.get("LOCUS_API_USER", "mara/personnel/xxxxxxx"),
                         os.environ.get("LOCUS_API_PASSWORD", "xxxxx-xxxxx"))
    writeUrl = f'https://api.locus-api.com/v1/write-access?clientId={client_id}&duration=10'
    authHeaders = {'Authorization': os.environ.get("LOCUS_WRITE_ACCESS_AUTH", 'Basic xxxxxxxxxxxxxxxxxxxx')}

    def requestWriteAccess():
        # Ask for write access and echo the HTTP status, as the notebook output relies on it.
        writeResp = requests.get(writeUrl, headers=authHeaders)
        print(writeResp.status_code)

    def fetchPage(url):
        # Return (tasks, pageToken) for one page; tasks is always a list.
        response = requests.get(url, auth=auth)
        response.raise_for_status()
        payload = response.json()
        return payload.get("tasks") or [], payload.get("pageToken")

    requestWriteAccess()
    time.sleep(15)  # pause after the access request (presumably to let it take effect)

    sheetColumns = ['Created On', 'Task Id', 'Team Name', 'Order Completion Time',
           'Geocoded Lat', 'Geocoded Lng', 'Completed Lat', 'Completed Lng',
           'Customer Address', 'Source', 'Pincode', 'City', 'countryCode',
           'confidence','date','hour','concatAddress']
    outFile = path + 'latLngData_' + str(team_id) + '_' + str(client_id) + '_' + str(start_date) + '_' + str(end_date) +'.csv'

    # The first-page and next-page URLs differ only by the optional pageToken parameter.
    tasksUrlPrefix = (
        f"https://locus-api.com/v1/client/{client_id}/"
        f"task?limit=2000&lowerTimestamp={start_date}T00:00:00.000%2B0530&timestamp="
        f"{end_date}T23:59:59.000%2B0530&taskStatus=COMPLETED&"
    )
    tasksUrlSuffix = f"carrierClient={client_id}&carrierTeam={team_id}"

    page_token = ""
    written = 0
    skipped = 0
    # newline='' is what the csv module expects; utf-8 keeps non-ASCII addresses readable
    # by pandas.read_csv regardless of the platform default encoding.
    with open(outFile, "w", newline="", encoding="utf-8") as output:
        writer = csv.DictWriter(output, fieldnames=sheetColumns)
        writer.writeheader()
        while page_token is not None:
            pageParam = f"pageToken={page_token}&" if page_token != "" else ""
            get_tasks_url = tasksUrlPrefix + pageParam + tasksUrlSuffix

            tasks, next_token = fetchPage(get_tasks_url)
            print(next_token)

            if tasks:
                # A missing formattedAddress on the first task is taken as the sign that
                # write access is not active: request it again and re-fetch the same page.
                firstAddress = tasks[0].get("taskGraph").get("visits")[1].get("locationOptions")[0].get(
                    "locationAddress").get("formattedAddress")
                if firstAddress is None:
                    requestWriteAccess()
                    tasks, next_token = fetchPage(get_tasks_url)
                else:
                    print('Already have write access')

            # An empty token would request the first page again forever; treat it as "done".
            page_token = next_token or None

            for task in tasks:
                try:
                    row = _taskToRow(task)
                    if row is None:
                        skipped += 1
                        continue
                    writer.writerow(row)
                    written += 1
                except (AttributeError, IndexError, KeyError, TypeError, ValueError):
                    # Incomplete or malformed task payload: best-effort export, skip it.
                    skipped += 1

    print(f'{written} rows written, {skipped} tasks skipped')
    return outFile

In [5]:
# Run the extraction for the configured client/team/date range; the CSV lands in ./gourmetgarden/.
extractTeamData(
    client_id=client_id,
    start_date=start_date,
    end_date=end_date,
    team_id=team_id,
    path='./gourmetgarden/',
)

200
2020-10-20T11:30:00.000Z/AAA36971
Already have write access
2020-10-10T11:30:00.000Z/AAA34570
Already have write access
2020-09-29T11:30:00.000Z/AAA32283
Already have write access
2020-09-17T11:30:00.000Z/AAA30123
Already have write access
2020-09-05T11:30:00.000Z/AAA27999
Already have write access
2020-08-23T11:30:00.000Z/AAA25794
Already have write access
2020-08-11T11:30:00.000Z/AAA23651
Already have write access
None
Already have write access


In [17]:
def prepareData(inputFile, minRepeatedInstances=3):
    """Load the extracted task CSV and find the addresses that occur repeatedly.

    Parameters
    ----------
    inputFile : str
        CSV file to read; it must contain the columns 'Task Id', 'Order Completion Time'
        and 'concatAddress'.
    minRepeatedInstances : int, default 3
        Minimum number of rows that must share a cleaned address for it to be reported.

    Returns
    -------
    tuple
        ``(duplicatedAddress, data)``: the list of cleaned addresses that occur at least
        ``minRepeatedInstances`` times (sorted, as produced by groupby), and the
        de-duplicated DataFrame with an added 'cleanedAddress' column. The frame also
        carries the 'index' column created by ``reset_index()``.
    """
    data = pd.read_csv(inputFile)
    # just to remove any possible task duplicates
    data = data.drop_duplicates(['Task Id', 'Order Completion Time']).reset_index()
    # NOTE: this fills every column, so missing numeric cells become '' as well.
    data = data.fillna('')
    # use the below line if address is not concatenated while extracting
    # data['concatAddress'] = [data.loc[ix,'Customer Address'] +', ' + str(data.loc[ix,'Pincode']) +', ' + data.loc[ix,'City'] for ix in data.index]
    # follow the country wise cleaning logic used in the backend for consistency
    # Characters stripped: whitespace , - . ' " / and backslash. The hyphen is escaped so it
    # cannot be read as a character range.
    cleanPattern = re.compile(r'[\s,\-.\'"/\\]')
    hashNumber = re.compile(r'#(?=\d)|(?<=\d)#') # remove hash if it is next to a number

    def cleanAddress(rawAddress):
        # lower-case -> strip separators -> drop '#' next to digits -> transliterate to ASCII
        return unidecode(hashNumber.sub('', cleanPattern.sub('', rawAddress.lower())))

    data['cleanedAddress'] = data['concatAddress'].map(cleanAddress)
    frequency = data.groupby('cleanedAddress')['Task Id'].count()
    # An empty cleaned address carries no address information, so it is never reported.
    repeated = frequency[(frequency >= minRepeatedInstances) & (frequency.index != '')]
    duplicatedAddress = list(repeated.index)
    return (duplicatedAddress, data)

In [18]:
def learnRepeatedAddresses(montData, duplicatedAddress, outputFileName, buffer=25, minRepeatedInstances=3):
    """Learn one lat/lng per repeated address and write the results to a text file.

    For every address in ``duplicatedAddress`` the completed locations of its rows are
    compared pairwise. The row with the most neighbours closer than ``buffer`` is selected
    (the row itself counts as a neighbour); if it has at least ``minRepeatedInstances``
    neighbours, the mean lat/lng of those neighbours is written as the learnt location.

    Parameters
    ----------
    montData : pandas.DataFrame
        Needs the columns 'cleanedAddress', 'concatAddress', 'Completed Lat', 'Completed Lng'.
    duplicatedAddress : iterable of str
        Cleaned addresses to process, in output order.
    outputFileName : str
        Text file to (over)write.
    buffer : float, default 25
        Neighbourhood radius in metres.
    minRepeatedInstances : int, default 3
        Minimum neighbour count for an address to be learnt.

    Output line format (fields separated by ' | '):
        concatAddress | learntLat | learntLng | 100.0 | neighbourCount | cleanedAddress

    Rows whose completed lat/lng is missing or non-numeric are ignored; addresses that are
    absent from ``montData`` or have no usable rows are skipped.
    """
    # Row positions per cleaned address, computed once instead of re-scanning the frame
    # for every address.
    positionsByAddress = montData.groupby('cleanedAddress', sort=False).indices

    with open(outputFileName, 'w', encoding='utf-8') as outputFile:
        for address in duplicatedAddress:
            positions = positionsByAddress.get(address)
            if positions is None:
                continue
            rows = montData.iloc[positions]

            # Coordinates may have been blanked to '' upstream; coerce and drop unusable rows.
            compLats = pd.to_numeric(rows['Completed Lat'], errors='coerce').to_numpy(dtype=float)
            compLngs = pd.to_numeric(rows['Completed Lng'], errors='coerce').to_numpy(dtype=float)
            usable = ~(np.isnan(compLats) | np.isnan(compLngs))
            compLats, compLngs = compLats[usable], compLngs[usable]
            if compLats.size == 0:
                continue

            dismat = get_dist_mat(compLats, compLngs, compLats, compLngs)
            sourceIndex, destIndex = np.where(dismat < buffer)
            sources, neighbourCounts = np.unique(sourceIndex, return_counts=True)
            if neighbourCounts.size == 0 or neighbourCounts.max() < minRepeatedInstances:
                continue

            # argmax is a position in `sources`; map it back to the actual row index.
            selectedIndex = sources[np.argmax(neighbourCounts)]
            cluster = destIndex[sourceIndex == selectedIndex]
            learntLat, learntLng = np.mean(np.column_stack((compLats[cluster], compLngs[cluster])), axis=0)
            concatAddress = rows['concatAddress'].unique()[0]
            outputFile.write(concatAddress + ' | ' +str(learntLat)+' | '+str(learntLng)+' | 100.0 | '+str(np.max(neighbourCounts)) +' | ' + address+'\n')

In [8]:
# Example: load the extracted CSV and collect the addresses that repeat.
duplicatedAddresses, updatedDF = prepareData(inputFile=sourceDataFile)

In [9]:
# Example: learn repeat regions using a buffer of 50.
# The result is written to a txt file which is used by the backend.
learnRepeatedAddresses(
    montData=updatedDF,
    duplicatedAddress=duplicatedAddresses,
    outputFileName=outputTxtFile,
    buffer=50,
)

In [11]:
# reading the txt file for analysis/easy reading
# The file is written with ' | ' between fields, so the separator is a regex that also
# consumes the padding spaces around each pipe; a plain sep='|' leaves stray spaces on
# the two address columns.
# Columns in written order: concatenated address, learnt lat, learnt lng, the constant
# 100.0, neighbour count, cleaned address.
output50 = pd.read_csv(outputTxtFile, sep=r'\s*\|\s*', engine='python',
                       names = ['rawAddress','lat','lng','conf','freq','concatAd'])