# Phase 1: Residential Building Consent Dataset

## INPUTS
1.	Geocoded individual consent data  
Various pieces of information (specified below under OUTPUTS) need to be added to each observation in these two datasets.  
-	Excel workbook containing building consents for 2000 to 2020 [filename: BCs_issued_by_AUP_TLADCs_2021DEC.xlsx]
2.	LINZ spine from phase 0

## OUTPUTS
csv file of building consents with the data fields 1-9 above. The matching process for assigning each consent to a LINZ parcel is described below. 
The following additional data fields are added. These are flags to designate the match to the LINZ dataset.

10.	Ranged Address Indicator  
a.	Indicator (1 or 0) for a LINZ parcel that is part of a ranged-address consent: `Ranged_Address_indicator`

In [None]:
#pragma nodebook off
#Use nodebook for better reproducibility https://github.com/uoa-eResearch/nodebook
%reload_ext nodebook.ipython
%nodebook disk phase1

In [None]:
# load libraries
import geopandas as gpd # vector data
import pandas as pd # tabular data, loading CSVs
import numpy as np # numeric data
from util import *
import matplotlib # plotting
import contextily as ctx # Used for contextual basemaps
from scipy.spatial import cKDTree
from matplotlib_scalebar.scalebar import ScaleBar # scalebar for plot
import matplotlib.pyplot as plt # plotting
from tqdm.auto import tqdm # progress bars
tqdm.pandas()
import requests
import requests_cache
requests_cache.install_cache('requests_cache')
import json
from shapely.geometry import Point, shape, LineString, MultiLineString, GeometryCollection, MultiPoint, Polygon  # creating points
plt.rcParams['figure.figsize'] = (20, 20)
# Show every column and up to 300 rows when a frame is displayed.
# The fully qualified "display.*" keys are used because set_option treats its
# argument as a pattern: the bare 'max_columns' / 'max_rows' forms also match
# the styler.render.* options in newer pandas releases and raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)

In [None]:
%%time
# Load the building-consent workbook into a plain DataFrame.
df = pd.read_excel("restricted/BCs_issued_by_AUP_TLADCs_2021DEC.xlsx")
# Turn it into a GeoDataFrame with one point per consent, built from the
# X_Coordinate / Y_Coordinate columns and tagged as EPSG:4326.
# NOTE(review): assumes X is longitude and Y is latitude in decimal degrees - confirm against the source data.
df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.X_Coordinate, df.Y_Coordinate), crs=4326)

In [None]:
df

In [None]:
# Flag consents whose first address line looks like a range: a digit, an optional
# word character, optional whitespace, then a hyphen (e.g. "12-14", "3a - 5").
# Missing ADDRESS_1 values are flagged False (na=False).
# The pattern is a raw string: "\d", "\w" and "\s" are regex escapes, not valid
# Python string escapes, and a non-raw literal triggers an invalid-escape warning.
df["Ranged_Address_indicator"] = df.ADDRESS_1.str.contains(r'\d\w?\s?-', regex=True, na=False)
df["Ranged_Address_indicator"].value_counts()

11.	Matching Type Indicators:  
a.	LINZ_MATCH_CODE  
b.	LINZ_2ND_MATCH_CODE  
PROCEDURE FOR MATCHING CONSENTS TO LINZ PARCELS  
MATCHING FOR NON-RANGED ADDRESSES:  
   Find the LINZ parcel of the geo-coordinate of the consent.  
a. Check whether the address of the consent and the address of the LINZ parcel match, by matching on number and first word. If so, stop and set LINZ_MATCH_CODE = 1. If not, go on to 2:

In [None]:
%%time
# get number and name of street (but not 'road', 'street', 'place' etc)
# this can be used to match addresses with building consents

def number_name_bc(x):
    """Build the "number name" matching key for one building-consent row.

    ADDRESS_1, ADDRESS_2 and ADDRESS_3 are converted to text, joined with single
    spaces and lower-cased; the first two space-separated tokens of that string
    (street number and first word of the street name) are returned joined by a
    space. Returns None when ADDRESS_1 is None.
    """
    # NOTE(review): only a literal None triggers this guard; a NaN ADDRESS_1 is
    # stringified as "nan" below and becomes part of the key.
    if x.ADDRESS_1 is None:
        return None
    parts = []
    for field in ("ADDRESS_1", "ADDRESS_2", "ADDRESS_3"):
        parts.append(str(x[field]))
    tokens = ' '.join(parts).lower().split(' ')
    # Splitting on a single space is deliberate: consecutive spaces yield empty tokens
    return ' '.join(tokens[:2])

def full_address_bc(x):
    """Build the full lower-cased address text for one building-consent row.

    ADDRESS_1..ADDRESS_3 are converted to text; parts whose text is exactly
    'nan' are dropped and the rest are joined with single spaces.
    Returns None when ADDRESS_1 is None.
    """
    if x.ADDRESS_1 is None:
        return None
    kept = []
    for i in (1, 2, 3):
        text = str(x[f'ADDRESS_{i}'])
        # skip address parts that are missing (they stringify to 'nan')
        if text != 'nan':
            kept.append(text)
    return ' '.join(kept).lower()

df['number_name'] = df.apply(number_name_bc, axis=1)
df['full_address'] = df.apply(full_address_bc, axis=1)

In [None]:
# Does any consent address contain a macronised vowel?
# na=False makes missing addresses count as "no"; with the builtin any() a NaN
# in the result would be truthy and report True even if no address has a macron.
df.full_address.str.contains("[āēīōū]", na=False).any()

In [None]:
parcels = gpd.read_file('input/NZ_Primary_Parcels_Nov_2016_filtered.gpkg').to_crs(2193)
parcels = parcels.set_index("id")
parcels

In [None]:
addresses = gpd.read_file('input/lds-nz-addresses-pilot-FGDB.zip!nz-addresses-pilot.gdb').to_crs(2193)
addresses

In [None]:
def number_name_addresses(x):
    """Return the lower-cased first two space-separated tokens of an address string."""
    leading_tokens = x.split(' ')[:2]
    key = ' '.join(leading_tokens)
    return key.lower()
addresses['address_number_name'] = addresses.full_address_ascii.apply(number_name_addresses)
addresses

In [None]:
%%time
parcels_addressed = gpd.sjoin(parcels, addresses)

In [None]:
%%time
# For every parcel id, collect the "number name" keys of all address points that
# were spatially joined to it (parcels_addressed) into a list.
# The chained assignment also binds the same Series to `parcel_address_number_name`.
# Parcels with no joined address get NaN in this column.
parcels["addresses"] = parcel_address_number_name = parcels_addressed.groupby('id')['address_number_name'].apply(list)
display(parcels["addresses"])
# Same grouping for the `full_address` column of the joined address points.
# NOTE(review): later cells index full_addresses with a position found in
# addresses, which presumes both lists are in the same order - confirm.
parcels["full_addresses"] = parcels_addressed.groupby('id')["full_address"].apply(list)
display(parcels["full_addresses"])

In [None]:
%%time
df = gpd.sjoin(df.to_crs(2193), parcels[["addresses","full_addresses","geometry"]], how="left")

In [None]:
print(sum(df.number_name.str.contains("mt")),
sum(df.number_name.str.contains("mount")),
sum(addresses.address_number_name.str.contains("mt")),
sum(addresses.address_number_name.str.contains("mount")))

In [None]:
df = df.rename(columns={"index_right": "LINZ_parcel_ID"})
df.LINZ_parcel_ID = df.LINZ_parcel_ID.astype("Int64")
df

In [None]:
assert df.index.is_unique

In [None]:
def check_match(row):
    """Step 1: return 1 if the row's number_name is contained in its addresses, else NaN."""
    return 1 if row.number_name in row.addresses else np.nan
match = df[~pd.isna(df.number_name) & ~pd.isna(df.addresses)].apply(check_match, axis=1)
df["LINZ_MATCH_CODE"] = match
df.LINZ_MATCH_CODE.value_counts(dropna=False)

Find all the LINZ parcels within r radius of the geo-coordinate of the consent  
a.       Search for a match of the consent address within the set of LINZ parcels within the radius r. If a match is found, stop and set LINZ_MATCH_CODE = 2. If there is no match, go on to 3:

In [None]:
parcel_tree = cKDTree(list(parcels.centroid.apply(lambda x: (x.x, x.y))))
parcel_tree

In [None]:
sample = df[pd.isna(df["LINZ_MATCH_CODE"]) & df.number_name.isin(addresses.address_number_name)].head(1)
sample

In [None]:
indices_in_range = parcel_tree.query_ball_point([sample.centroid.iloc[0].x, sample.centroid.iloc[0].y], 1250)
ax = sample.to_crs(epsg=3857).plot(color="red", alpha=1)
sample.buffer(1250).to_crs(epsg=3857).plot(ax=ax, alpha=.5, color="green")
potential_matches = parcels.iloc[indices_in_range]
potential_matches.to_crs(epsg=3857).plot(ax=ax, alpha=.5, edgecolor="black")
potential_matches[potential_matches.addresses.str.contains(sample.number_name.iloc[0], na=False, regex=False)].to_crs(epsg=3857).plot(ax=ax, color="red")
ctx.add_basemap(ax, source=ctx.providers.Esri.WorldImagery)

In [None]:
def find_match_in_range(row, radius=1250):
    """Step 2: look for the consent's number_name among parcels near its coordinate.

    Parameters
    ----------
    row : one consent row. Reads `geometry` (a point with .x / .y) and
        `number_name`; on success overwrites `LINZ_parcel_ID`, `addresses`,
        `full_addresses` and sets `LINZ_MATCH_CODE` to 2.
    radius : search distance passed to `parcel_tree.query_ball_point`, in the
        units of the coordinates stored in the tree. Defaults to 1250, the
        value that was previously hard-coded, so `apply(find_match_in_range,
        axis=1)` behaves as before.

    Returns
    -------
    The same row, modified in place when a match was found.
    """
    # The tree was built from the parcel centroids in parcel order, so the returned
    # positions index `parcels` positionally (iloc).
    indices_in_range = parcel_tree.query_ball_point([row.geometry.x, row.geometry.y], radius)
    potential_matches = parcels.iloc[indices_in_range]
    # `addresses` holds lists; parcels with no address list (NaN) are excluded by na=False.
    match = potential_matches[potential_matches.addresses.str.contains(row.number_name, na=False, regex=False)]
    if len(match):
        # NOTE(review): this takes the first hit in positional order, not the
        # nearest parcel - confirm that is intended.
        row.LINZ_parcel_ID = match.index[0]
        row.addresses = match.addresses.iloc[0]
        row.full_addresses = match.full_addresses.iloc[0]
        row.LINZ_MATCH_CODE = 2
    return row
matches = df[
    ~pd.isna(df.number_name) &
    pd.isna(df["LINZ_MATCH_CODE"]) &
    df.number_name.isin(addresses.address_number_name)].progress_apply(find_match_in_range, axis=1)

In [None]:
matches

In [None]:
df.update(matches)
df.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
df[pd.isna(df["LINZ_MATCH_CODE"]) & df.number_name.isin(addresses.address_number_name)]

In [None]:
%%time
# Reverse index: "number name" key -> list of parcel ids whose address list contains it.
# The loop variable is deliberately NOT called `addresses`: that name is the
# LINZ address GeoDataFrame used by earlier cells, and rebinding it to a plain
# list here breaks any re-run of those cells.
number_name_lookup = {}
for parcel_id, parcel_addresses in parcels.addresses[~pd.isna(parcels.addresses)].items():
    for address in parcel_addresses:
        number_name_lookup.setdefault(address, []).append(parcel_id)

In [None]:
%%time
def find_text_match(row):
    """Step 3: match on address text alone, ignoring distance limits.

    Looks up every parcel whose address list contains the row's `number_name`
    (via `number_name_lookup`) and picks the one closest to the consent point.
    On success the row's `LINZ_parcel_ID`, `addresses` and `full_addresses` are
    overwritten and `LINZ_MATCH_CODE` is set to 3. A row whose key is not in the
    lookup is returned unchanged.
    """
    # Default to an empty list so an unknown key means "no match" rather than len(None).
    matching_ids = number_name_lookup.get(row.number_name, [])
    if len(matching_ids):
        match = parcels.loc[matching_ids]
        closest_parcel_id = match.distance(row.geometry).idxmin()
        # A parcel id can appear more than once in the lookup list (two of its
        # addresses sharing the same key). Selecting with a list label and taking
        # the first row always yields one parcel record, never a Series of duplicates.
        closest = match.loc[[closest_parcel_id]].iloc[0]
        row.LINZ_parcel_ID = closest_parcel_id
        row.addresses = closest.addresses
        row.full_addresses = closest.full_addresses
        row.LINZ_MATCH_CODE = 3
    return row
matches = df[pd.isna(df["LINZ_MATCH_CODE"]) & df.number_name.isin(number_name_lookup.keys())].progress_apply(find_text_match, axis=1)
df.update(matches)
df.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
df.Ranged_Address_indicator = df.Ranged_Address_indicator.astype(bool)

In [None]:
# Still-unmatched, non-ranged consents whose key contains "/" (e.g. "2/16 smith").
# .copy() gives the following cells an independent frame to edit in place;
# without it they assign into a slice of df (SettingWithCopyWarning). The
# results are written back to df explicitly with df.update() further down.
maybe_subdivision = df[df.number_name.str.contains("/") & pd.isna(df.LINZ_MATCH_CODE) & ~df.Ranged_Address_indicator].copy()
maybe_subdivision

In [None]:
maybe_subdivision.number_name = maybe_subdivision.number_name.str.split("/").str[-1]

In [None]:
# Step 1 again, now on the keys with the unit prefix removed. check_match is the
# helper defined for the first matching pass earlier in the notebook; it is
# reused here instead of being redefined with an identical body.
match = maybe_subdivision[~pd.isna(maybe_subdivision.addresses)].apply(check_match, axis=1)
maybe_subdivision["LINZ_MATCH_CODE"] = match
match.value_counts(dropna=False)

In [None]:
matches = maybe_subdivision[
    pd.isna(match) &
    maybe_subdivision.number_name.isin(number_name_lookup.keys())].progress_apply(find_match_in_range, axis=1)

In [None]:
matches

In [None]:
maybe_subdivision.update(matches)
maybe_subdivision.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
matches = maybe_subdivision[pd.isna(maybe_subdivision["LINZ_MATCH_CODE"]) & maybe_subdivision.number_name.isin(number_name_lookup.keys())].progress_apply(find_text_match, axis=1)
maybe_subdivision.update(matches)
maybe_subdivision.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
maybe_subdivision

In [None]:
maybe_subdivision.LINZ_MATCH_CODE = maybe_subdivision.LINZ_MATCH_CODE.replace({1: 4, 2: 5, 3: 6})
maybe_subdivision.LINZ_MATCH_CODE.value_counts()

In [None]:
df.update(maybe_subdivision)

In [None]:
# Still-unmatched, non-ranged consents whose number ends in a letter ("12a smith").
# The ranged flag is compared with `== False` rather than negated with `~`.
# NOTE(review): presumably because the column may no longer be a pure bool
# dtype after the df.update() calls above - confirm before changing.
# .copy() gives the following cells an independent frame to edit in place
# instead of a slice of df; results are written back with df.update().
maybe_subdivision = df[
    df.number_name.str.contains("[0-9][a-zA-Z] ") &
    pd.isna(df.LINZ_MATCH_CODE) &
    (df.Ranged_Address_indicator == False)
].copy()
maybe_subdivision

In [None]:
def strip_char(s):
    """Remove every non-numeric character from the number part of a "number name" key.

    "12a smith" -> "12 smith". The first whitespace-separated token is treated
    as the number and everything after it as the name. Keys with a missing name
    (e.g. "12a ", produced when the source address had a double space) no longer
    raise ValueError; they yield the number followed by a space and an empty name.
    """
    tokens = s.split()
    number = tokens[0] if tokens else ""
    name = " ".join(tokens[1:])
    number = "".join(c for c in number if c.isnumeric())
    return number + " " + name
maybe_subdivision.number_name = maybe_subdivision.number_name.apply(strip_char)

In [None]:
# Step 1 again, now on the keys with the letter suffix removed. check_match is the
# helper defined for the first matching pass earlier in the notebook; it is
# reused here instead of being redefined with an identical body.
match = maybe_subdivision[~pd.isna(maybe_subdivision.addresses)].apply(check_match, axis=1)
maybe_subdivision["LINZ_MATCH_CODE"] = match
match.value_counts(dropna=False)

In [None]:
matches = maybe_subdivision[
    pd.isna(match) &
    maybe_subdivision.number_name.isin(number_name_lookup.keys())].progress_apply(find_match_in_range, axis=1)

In [None]:
maybe_subdivision.update(matches)
maybe_subdivision.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
matches = maybe_subdivision[pd.isna(maybe_subdivision["LINZ_MATCH_CODE"]) & maybe_subdivision.number_name.isin(number_name_lookup.keys())].progress_apply(find_text_match, axis=1)
maybe_subdivision.update(matches)
maybe_subdivision.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
maybe_subdivision

In [None]:
maybe_subdivision.LINZ_MATCH_CODE = maybe_subdivision.LINZ_MATCH_CODE.replace({1: 4, 2: 5, 3: 6})
maybe_subdivision.LINZ_MATCH_CODE.value_counts()

In [None]:
df.update(maybe_subdivision)

In [None]:
%%time
distances = df[~pd.isna(df.LINZ_parcel_ID)].distance(parcels.loc[df.LINZ_parcel_ID[~pd.isna(df.LINZ_parcel_ID)]], align=False)
distances

In [None]:
distances[distances>0].describe()

In [None]:
df["distance_to_parcel"] = distances

 Identify the LINZ parcel of the geo-coordinate of the consent. If the name of the road in the address of the LINZ parcel matches the road name of the address given in the consent dataset, set the parcel to be the LINZ parcel of the geo-coordinate of the consent. Set LINZ_MATCH_CODE = 7 and stop. If not, proceed to step 8:

In [None]:
df[["number_name", "addresses"]]

In [None]:
def check_street_match(row):
    """Step 7: return 7 when the consent's street word matches a street word of its parcel.

    The street word is the last token of `row.number_name`; it is compared with
    the tokens that follow the number in each entry of `row.addresses`.

    Compared with a substring test against the text form of the whole list this
    - does not let "park" match "parkview",
    - does not treat a key without a street word (e.g. just "12") as a street, and
    - returns NaN instead of raising when number_name or addresses is missing.

    Returns 7 on a match, otherwise NaN.
    """
    number_name = row.number_name
    if not isinstance(number_name, str):
        return np.nan
    tokens = number_name.split()
    if len(tokens) < 2:
        # no street word present, only a number (or nothing at all)
        return np.nan
    street = tokens[-1]
    addresses = row.addresses
    if isinstance(addresses, str):
        addresses = [addresses]
    if not isinstance(addresses, (list, tuple)):
        # NaN: no parcel / no address list for this consent
        return np.nan
    for address in addresses:
        # skip the leading number token of each parcel address key
        if isinstance(address, str) and street in address.split()[1:]:
            return 7
    return np.nan
# Apply the street-name test to every consent that is still unmatched.
street_matches = df[pd.isna(df.LINZ_MATCH_CODE)].progress_apply(check_street_match, axis=1)
display(street_matches.value_counts(dropna=False))
# Write back only the rows that matched. Assigning through df.loc targets df
# itself; updating the Series returned by df.LINZ_MATCH_CODE relies on that
# Series being a view, which is not guaranteed (pandas copy-on-write).
street_hits = street_matches.dropna()
df.loc[street_hits.index, "LINZ_MATCH_CODE"] = street_hits

Identify the LINZ parcel of the geo-coordinate of the consent and use this. Set LINZ_MATCH_CODE = 8 and stop. If there is no parcel under the geo-coordinate, go to step 9:

In [None]:
# Consents that sit on a parcel but matched nothing above get code 8.
# A single .loc assignment replaces the chained `df.col[mask] = value` form,
# which raises SettingWithCopyWarning and does not update df under copy-on-write.
df.loc[~pd.isna(df.LINZ_parcel_ID) & pd.isna(df.LINZ_MATCH_CODE), "LINZ_MATCH_CODE"] = 8

Set LINZ_MATCH_CODE = 9. This indicates no match even with just the consent long-lat.

In [None]:
df.LINZ_MATCH_CODE = df.LINZ_MATCH_CODE.fillna(9)
df.LINZ_MATCH_CODE.value_counts()

In [None]:
df.LINZ_MATCH_CODE[df.Ranged_Address_indicator].value_counts()

MATCHING FOR RANGED ADDRESSES:
For ranged addresses, follow the same approach for each individual address, using the same geo-coordinate for each address in step 1. However, we set r in step 2 to a larger number in order to do a wider search. If no matches are found at step 2, but at least one match in the ranged address is found at step 1 or step 2, go to step 5. This avoids matching parcels that are far away from the rest of the group. Only if there is no match within the ranged addresses at either step 1 or step 2 do we proceed to step 3. 

We include an additional flag for each ranged address that indicates the best (i.e. lowest) LINZ_MATCH_CODE for that range. E.g. for a ranged address with five addresses, if at least one of them had LINZ_MATCH_CODE = 2, and all others were LINZ_MATCH_CODE > 2, then the best LINZ_MATCH_CODE would be 2. This is called LINZ_2ND_MATCH_CODE below.
Each address (in the ranged address set) is assigned its own unique LINZ parcel. Each parcel is then assigned the information given in 1 through 11 below. Because the parcels share the same consent ID, we can tell that the different parcels pertain to the same consent, thereby avoiding double counting. 

In [None]:
ranged = df[df.Ranged_Address_indicator & (df.LINZ_MATCH_CODE > 2)]
ranged

In [None]:
def parse_int(value):
    """Return the integer formed by the digit characters of `value` ('57a' -> 57).

    Raises ValueError when `value` contains no digits.
    """
    return int(''.join(filter(lambda c: c.isdigit(), value)))

def char_range(c1, c2):
    """Generates the characters from `c1` to `c2`, inclusive."""
    for c in range(ord(c1), ord(c2)+1):
        yield chr(c)

def range_expand(r):
    """Expand a ranged street number into the individual numbers it covers.

    Supported forms:
    - '20-28'   -> ['20', '22', '24', '26', '28']  (step 2: one side of the road)
    - '23-20'   -> ['20', '22']                    (bounds may be given in either order)
    - '53a-c'   -> ['53a', '53b', '53c']           (letter range on one number)
    - '1-5/32a' -> ['1/32a', ..., '5/32a']         (range before the slash, step 1)
    - '2/16-18' -> ['2/16', '2/17', '2/18']        (range after the slash, step 1)
    - '27-' or '1-sep' -> ['27'] / ['1']           (open or unparseable upper bound)

    The part on the other side of the '/' keeps its original position, so the
    expanded values have the same "left/right" layout as the input. The
    open-bound and letter-range forms return the bare number without the '/' part.

    Raises ValueError when `r` does not contain exactly one '-' (after removing
    the '/' part) or has more than one '/'.
    """
    prefix = ''
    suffix = ''
    if '/' in r:
        a, b = r.split('/')
        if "-" in a:
            # range is on the left of the slash: "1-5/32a"
            r = a
            suffix = "/" + b
        else:
            # range is on the right of the slash: "2/16-18"; keep the left part in front
            r = b
            prefix = a + "/"
    r1, r2 = r.split('-')
    if r2 == "sep" or r2 == "":
        # NOTE(review): "sep" presumably comes from spreadsheet date mangling
        # (e.g. "1-Sep") - confirm; only the lower bound is usable.
        return [r1]
    if r2.isalpha():
        # Swap only the trailing letter; str.replace would touch every occurrence of it.
        return [r1[:-1] + char for char in char_range(r1[-1], r2)]
    r1 = parse_int(r1)
    r2 = parse_int(r2)
    # Every number is expanded when a '/' part is present; plain street ranges skip
    # the other side of the road.
    step = 1 if (prefix or suffix) else 2
    low, high = sorted((r1, r2))
    return [prefix + str(i) + suffix for i in range(low, high + 1, step)]

# Hand-made fixes for keys that cannot be parsed as "<range> <street>":
# each malformed number_name (left) is replaced by a usable one (right).
corrections = {
    "1/ 231-235": "231-235 hinemoa",
    "2/ 16-18": "16-18 montgomery",
    "2/ 2-8": "2-8 matipo",
    "r 35-41": "35-41 birkenhead",
    "r 31-35": "31-35 cresta",
    "27 -": "27-31 victoria",
    "1 &": "2-1/96 bernleigh",
    "55121-23 morrison": "21-23 morrison",
    "1954-60 great": "1954-1960 great",
    "3-9 ": "3-9 faldo",
    "22 -": "22-26 rawalpindi",
    "36 -": "36-38 border",
    "57a -": "57-61 woodward",
    "102 -": "102-104 may"
}
ranged["number_name"] = ranged["number_name"].replace(corrections)

# Dry run: report (rather than stop on) every key that still fails to split into
# exactly two tokens or expands to an implausible number of addresses.
for r in ranged["number_name"]:
    e = None
    try:
        number, name = r.split()
        range_expanded = range_expand(number)
        assert 0 < len(range_expanded) < 1000
    except Exception as e:
        print(f"Parsing of {r} failed: {e}")
        display(ranged[ranged["number_name"] == r])

In [None]:
for test_case in ['20-28', '1-5/32a', '53a-c']:
    print(f"{test_case}: {range_expand(test_case)}")

In [None]:
%%time
# Expand every ranged consent in `ranged` into one row per individual address and
# run the matching steps on each expanded row. The per-consent frames are
# collected in `expanded_ranged_addresses`.
expanded_ranged_addresses = []
for i, sample in tqdm(ranged.iterrows(), total=len(ranged)):
    # Expand this ranged address into its own dataframe, with each number as its own row
    # (raises ValueError if number_name is not exactly "<range> <street>")
    number, name = sample.number_name.split()
    rows = []
    for n in range_expand(number):
        row = sample.copy()
        row.number_name = n + " " + name
        # unique_id = consent OBS + expanded number; becomes the index of the expanded frame
        row["unique_id"] = f"{row.OBS}_{n}"
        rows.append(row)
    sample = pd.DataFrame(rows).set_index("unique_id")
    if not sample.index.is_unique:
        display(sample)
        # NOTE(review): a bare `raise` with no active exception fails with
        # "RuntimeError: No active exception to reraise"; it still aborts the
        # loop, but an explicit exception would carry a clearer message.
        raise

    # Step 1
    # NOTE(review): this def runs at cell level on every iteration, so it rebinds
    # the notebook-wide name check_match (body identical to the earlier definitions).
    def check_match(row):
        if row.number_name in row.addresses:
            return 1
        return np.nan
    match = sample[~pd.isna(sample.number_name) & ~pd.isna(sample.addresses)].apply(check_match, axis=1)
    # apply() on an empty selection does not return a usable Series, hence the explicit NaN fill
    if len(match):
        sample["LINZ_MATCH_CODE"] = match
    else:
        sample["LINZ_MATCH_CODE"] = np.nan

    # However, we set r in step 2 to a larger number in order to do a wider search.

    # NOTE(review): rebinds the notebook-wide name find_match_in_range; this version
    # searches with a radius of 2000 instead of 1250 and skips rows whose geometry
    # stringifies to "nan".
    def find_match_in_range(row):
        if str(row.geometry) == "nan":
            return row
        indices_in_range = parcel_tree.query_ball_point([row.geometry.x, row.geometry.y], 2000)
        potential_matches = parcels.iloc[indices_in_range]
        match = potential_matches[potential_matches.addresses.str.contains(row.number_name, na=False, regex=False)]
        if len(match):
            # first hit in positional order is used
            row.LINZ_parcel_ID = match.index[0]
            row.addresses = match.addresses.iloc[0]
            row.full_addresses = match.full_addresses.iloc[0]
            row.LINZ_MATCH_CODE = 2
        return row
    # Step 2 is only tried for the expanded rows that step 1 left unmatched
    matches = sample[
        ~pd.isna(sample.number_name) &
        pd.isna(sample["LINZ_MATCH_CODE"])].apply(find_match_in_range, axis=1)
    sample.update(matches)

    # If at least one address of the range matched at step 1 or step 2, the remaining
    # unmatched addresses of that range get LINZ_MATCH_CODE 10 (the else branch at the bottom).
    # This avoids matching parcels that are far away from the rest of the group
    if all(pd.isna(sample.LINZ_MATCH_CODE)): # all na here means no matches in step 1 or 2
        # Only if there is no match within the ranged addresses at either step 1 or step 2 do we proceed to step 3.
        matches = sample[pd.isna(sample["LINZ_MATCH_CODE"]) & sample.number_name.isin(number_name_lookup.keys())].apply(find_text_match, axis=1)
        sample.update(matches)
        # Retry with the part after "/" for keys like "2/16 smith"; results are recoded 1/2/3 -> 4/5/6
        maybe_subdivision = sample[sample.number_name.str.contains("/") & pd.isna(sample.LINZ_MATCH_CODE)]
        if len(maybe_subdivision):
            maybe_subdivision.number_name = maybe_subdivision.number_name.str.split("/").str[-1]
            # step 1
            match = maybe_subdivision[~pd.isna(maybe_subdivision.addresses)].apply(check_match, axis=1)
            if len(match):
                maybe_subdivision["LINZ_MATCH_CODE"] = match
            else:
                maybe_subdivision["LINZ_MATCH_CODE"] = np.nan
            # step 2
            matches = maybe_subdivision[
                pd.isna(match) &
                maybe_subdivision.number_name.isin(number_name_lookup.keys())
            ].apply(find_match_in_range, axis=1)
            maybe_subdivision.update(matches)
            # step 3
            matches = maybe_subdivision[
                pd.isna(maybe_subdivision["LINZ_MATCH_CODE"]) & maybe_subdivision.number_name.isin(number_name_lookup.keys())
            ].apply(find_text_match, axis=1)
            maybe_subdivision.update(matches)
            maybe_subdivision.LINZ_MATCH_CODE = maybe_subdivision.LINZ_MATCH_CODE.replace({1: 4, 2: 5, 3: 6})
            sample.update(maybe_subdivision)
        # Retry with the letter suffix stripped from the number for keys like "12a smith"; same recoding
        maybe_subdivision = sample[
            sample.number_name.str.contains("[0-9][a-zA-Z] ") &
            pd.isna(sample.LINZ_MATCH_CODE)
        ]
        if len(maybe_subdivision):
            maybe_subdivision.number_name = maybe_subdivision.number_name.apply(strip_char)
            # step 1
            match = maybe_subdivision[~pd.isna(maybe_subdivision.addresses)].apply(check_match, axis=1)
            if len(match):
                maybe_subdivision["LINZ_MATCH_CODE"] = match
            else:
                maybe_subdivision["LINZ_MATCH_CODE"] = np.nan
            # step 2
            matches = maybe_subdivision[
                pd.isna(match) &
                maybe_subdivision.number_name.isin(number_name_lookup.keys())
            ].apply(find_match_in_range, axis=1)
            maybe_subdivision.update(matches)
            # step 3
            matches = maybe_subdivision[
                pd.isna(maybe_subdivision["LINZ_MATCH_CODE"]) & maybe_subdivision.number_name.isin(number_name_lookup.keys())
            ].apply(find_text_match, axis=1)
            maybe_subdivision.update(matches)
            maybe_subdivision.LINZ_MATCH_CODE = maybe_subdivision.LINZ_MATCH_CODE.replace({1: 4, 2: 5, 3: 6})
            sample.update(maybe_subdivision)

        if all(pd.isna(sample.LINZ_MATCH_CODE)):
            # Nothing matched by text at all: fall back to street match (7),
            # parcel under the point (8), then no match (9)
            street_matches = sample[pd.isna(sample.LINZ_MATCH_CODE)].apply(check_street_match, axis=1)
            sample.LINZ_MATCH_CODE.update(street_matches)
            sample.loc[~pd.isna(sample.LINZ_parcel_ID) & pd.isna(sample.LINZ_MATCH_CODE), "LINZ_MATCH_CODE"] = 8
            sample.LINZ_MATCH_CODE = sample.LINZ_MATCH_CODE.fillna(9)
        else:
            # Some addresses of the range matched at steps 3-6: the rest are marked 10
            sample.LINZ_MATCH_CODE = sample.LINZ_MATCH_CODE.fillna(10)
    else:
        # Some addresses of the range matched at step 1 or 2: the rest are marked 10
        sample.LINZ_MATCH_CODE = sample.LINZ_MATCH_CODE.fillna(10)
    # We include an additional flag for each ranged address that indicates the best (i.e. lowest) LINZ_MATCH_CODE for that range.
    # E.g. for a ranged address with five addresses, if at least one of them had LINZ_MATCH_CODE = 2,
    # and all others were LINZ_MATCH_CODE > 2, then the best LINZ_MATCH_CODE would be 2. This is called LINZ_2ND_MATCH_CODE below.
    # Each address (in the ranged address set) is assigned its own unique LINZ parcel.
    # Each parcel is then assigned the information given in 1 through 11 below.
    # Because the parcels share the same consent ID, we can tell that the different parcels pertain to the same consent, thereby avoiding double counting.
    sample["LINZ_2ND_MATCH_CODE"] = sample.LINZ_MATCH_CODE.min()
    expanded_ranged_addresses.append(sample)

In [None]:
df = df.drop(ranged.index)
df = pd.concat([df] + expanded_ranged_addresses)
df

In [None]:
df.LINZ_MATCH_CODE[df.Ranged_Address_indicator].value_counts(dropna=False)

In [None]:
df.LINZ_2ND_MATCH_CODE[df.Ranged_Address_indicator].value_counts(dropna=False)

In [None]:
df.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
%%time
df = df.reset_index()
distances = df[~pd.isna(df.LINZ_parcel_ID)].distance(parcels.loc[df.LINZ_parcel_ID[~pd.isna(df.LINZ_parcel_ID)]], align=False)
display(distances)
display(distances[distances>0].describe())

In [None]:
df["distance_to_parcel"] = distances

In [None]:
df[(df.LINZ_MATCH_CODE == 3) & (df.distance_to_parcel > 10000)]

In [None]:
%%time
def fix_full_address(row):
    """Make the start of `full_address` agree with the (possibly corrected) `number_name`.

    The first two space-separated tokens of `row.full_address` are replaced by
    `row.number_name`, so expanded or corrected keys (e.g. one number of a
    range) carry through to the text that is geocoded later.

    Only the leading occurrence is replaced; the same text appearing again
    further along the address is left alone. Rows whose `full_address` or
    `number_name` is not a string (missing values) are returned unchanged
    instead of raising.
    """
    if not isinstance(row.full_address, str) or not isinstance(row.number_name, str):
        return row
    number_name = ' '.join(row.full_address.split(' ')[:2])
    # count=1: number_name is by construction the prefix of full_address
    row.full_address = row.full_address.replace(number_name, row.number_name, 1)
    return row
df = df.progress_apply(fix_full_address, axis=1)
df

In [None]:
session = requests.Session()
def geocode(address):
    """Geocode one address string with the Nominatim search endpoint.

    Returns the first search result as a pandas Series (one entry per field of
    the JSON object), or an empty Series when the service returns no result.

    The address is passed through `params` so that requests URL-encodes it;
    when it was concatenated into the URL, characters such as '&' or '#'
    in an address truncated or corrupted the query.
    """
    result = session.get(
        "https://api-proxy.auckland-cer.cloud.edu.au/nominatim/search.php",
        params={"q": address},
    ).json()
    if result:
        return pd.Series(result[0])
    else:
        # explicit dtype: a bare pd.Series() has a version-dependent default dtype and warns on older pandas
        return pd.Series(dtype=object)
bad_matches = df.full_address[df.LINZ_MATCH_CODE.isin([3,6,7,8,9,10])]
geocoding_results = bad_matches.progress_apply(geocode)
geocoding_results

In [None]:
geocoding_results = geocoding_results.dropna()
display(geocoding_results.category.value_counts())
display(geocoding_results.type.value_counts())

In [None]:
geocoding_results = gpd.GeoDataFrame(geocoding_results, geometry=gpd.points_from_xy(geocoding_results.lon, geocoding_results.lat), crs=4326).to_crs(2193)
geocoding_results

In [None]:
df["distance_from_geocoding_result"] = df.loc[geocoding_results.index].distance(geocoding_results)
df["distance_from_geocoding_result"].describe()

In [None]:
%%time
# redo join with updated coordinates
geocoding_results = gpd.sjoin(geocoding_results, parcels[["addresses","geometry"]], how="left")

In [None]:
geocoding_results = geocoding_results.rename(columns={"index_right": "LINZ_parcel_ID"})
geocoding_results.LINZ_parcel_ID = geocoding_results.LINZ_parcel_ID.astype("Int64")
geocoding_results

In [None]:
geocoding_results["number_name"] = df.number_name[geocoding_results.index]
match = geocoding_results[~pd.isna(geocoding_results.number_name) & ~pd.isna(geocoding_results.addresses)].apply(check_match, axis=1)
geocoding_results["LINZ_MATCH_CODE"] = match
geocoding_results.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
geocoding_results["LINZ_2ND_MATCH_CODE"] = df.LINZ_2ND_MATCH_CODE[geocoding_results.index]
def find_match_in_range(row):
    """Step 2 for re-geocoded rows: search nearby parcels for the row's number_name.

    The search radius is 1250 when the row has no LINZ_2ND_MATCH_CODE (NaN) and
    2000 otherwise. On a hit the row's parcel id and address list are replaced
    and LINZ_MATCH_CODE is set to 2. The (possibly modified) row is returned.
    """
    r = 1250 if np.isnan(row.LINZ_2ND_MATCH_CODE) else 2000
    centre = [row.geometry.x, row.geometry.y]
    nearby = parcels.iloc[parcel_tree.query_ball_point(centre, r)]
    hits = nearby[nearby.addresses.str.contains(row.number_name, na=False, regex=False)]
    if len(hits):
        row.LINZ_parcel_ID = hits.index[0]
        row.addresses = hits.addresses.iloc[0]
        # NOTE(review): if the row has no `full_addresses` entry, this sets a
        # plain attribute instead of a value in the row - verify against the caller.
        row.full_addresses = hits.full_addresses.iloc[0]
        row.LINZ_MATCH_CODE = 2
    return row
matches = geocoding_results[
    ~pd.isna(geocoding_results.number_name) &
    pd.isna(geocoding_results["LINZ_MATCH_CODE"]) &
    geocoding_results.number_name.isin(number_name_lookup.keys())].progress_apply(find_match_in_range, axis=1)

In [None]:
geocoding_results.update(matches)
geocoding_results.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
geocoding_results["original_LINZ_MATCH_CODE"] = df.LINZ_MATCH_CODE[geocoding_results.index]

In [None]:
geocoding_results

In [None]:
text_match = geocoding_results.original_LINZ_MATCH_CODE.isin([3,6]) & pd.isna(geocoding_results.LINZ_MATCH_CODE)
print(sum(text_match))
geocoding_results = geocoding_results[~text_match]

In [None]:
street_matches = geocoding_results[pd.isna(geocoding_results.LINZ_MATCH_CODE)].progress_apply(check_street_match, axis=1)
display(street_matches.value_counts(dropna=False))
geocoding_results.LINZ_MATCH_CODE.update(street_matches)

In [None]:
# Non-ranged rows (no LINZ_2ND_MATCH_CODE) that are still unmatched after re-geocoding:
# code 8 when the new point falls on a parcel, code 9 when it does not.
# Written with .loc because the chained `frame.col[mask] = value` form raises
# SettingWithCopyWarning and does not modify the frame under copy-on-write.
unmatched_non_ranged = (
    pd.isna(geocoding_results.LINZ_MATCH_CODE) &
    pd.isna(geocoding_results.LINZ_2ND_MATCH_CODE)
)
on_a_parcel = ~pd.isna(geocoding_results.LINZ_parcel_ID)
geocoding_results.loc[unmatched_non_ranged & on_a_parcel, "LINZ_MATCH_CODE"] = 8
geocoding_results.loc[unmatched_non_ranged & ~on_a_parcel, "LINZ_MATCH_CODE"] = 9

In [None]:
geocoding_results.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
geocoding_results["OBS"] = df.OBS[geocoding_results.index]
# Still-unmatched rows of a range whose best code is a step-1/2 match (1, 2, 4 or 5)
# are marked 10. This must run before the code-8 fallback below.
# .loc replaces the chained `frame.col[mask] = value` form, which raises
# SettingWithCopyWarning and does not modify the frame under copy-on-write.
geocoding_results.loc[
    pd.isna(geocoding_results.LINZ_MATCH_CODE) &
    geocoding_results.LINZ_2ND_MATCH_CODE.isin([1,2,4,5]),
    "LINZ_MATCH_CODE"
] = 10
# Whatever is still unmatched but sits on a parcel gets code 8.
geocoding_results.loc[
    pd.isna(geocoding_results.LINZ_MATCH_CODE) & ~pd.isna(geocoding_results.LINZ_parcel_ID),
    "LINZ_MATCH_CODE"
] = 8

In [None]:
# Rows that were originally subdivision matches (4, 5, 6) keep that family of codes:
# a new 1/2/3 is recoded to 4/5/6 (i.e. +3); other codes are left as they are.
# Assigned through .loc so geocoding_results itself is modified; updating the
# Series returned by attribute access relies on it being a view of the frame.
was_subdivision_match = geocoding_results.original_LINZ_MATCH_CODE.isin([4,5,6])
geocoding_results.loc[was_subdivision_match, "LINZ_MATCH_CODE"] = (
    geocoding_results.loc[was_subdivision_match, "LINZ_MATCH_CODE"].replace({1: 4, 2: 5, 3: 6})
)

In [None]:
made_it_worse = geocoding_results.LINZ_MATCH_CODE > geocoding_results.original_LINZ_MATCH_CODE
print(sum(made_it_worse))
geocoding_results = geocoding_results[~made_it_worse]

In [None]:
geocoding_results["change"] = geocoding_results.apply(lambda row: f"{row.original_LINZ_MATCH_CODE} -> {row.LINZ_MATCH_CODE}", axis=1)
geocoding_results["change"].value_counts()

In [None]:
display(geocoding_results.change[pd.isna(geocoding_results.LINZ_2ND_MATCH_CODE)].value_counts())
display(geocoding_results.change[~pd.isna(geocoding_results.LINZ_2ND_MATCH_CODE)].value_counts())

In [None]:
good_changes = geocoding_results.LINZ_MATCH_CODE < geocoding_results.original_LINZ_MATCH_CODE
print(sum(good_changes))
geocoding_results = geocoding_results[good_changes]
geocoding_results["change"].value_counts()

In [None]:
df.update(geocoding_results)
df

In [None]:
df.LINZ_MATCH_CODE.value_counts(dropna=False)

In [None]:
sum(df.LINZ_MATCH_CODE < df.LINZ_2ND_MATCH_CODE)

In [None]:
group = df[~pd.isna(df.LINZ_2ND_MATCH_CODE)].groupby("OBS")
group

In [None]:
# Recompute the best code of every ranged consent after re-geocoding, and mark
# the addresses that fell behind the rest of their group with code 10.
updates = []
for ix, sample in tqdm(group):
    minmatch = sample.LINZ_MATCH_CODE.min()
    # best code <= 2: everything above 2 becomes 10
    # best code 3..5: everything above 5 becomes 10
    # otherwise: codes are left as they are
    if minmatch <= 2:
        cutoff = 2
    elif minmatch <= 5:
        cutoff = 5
    else:
        cutoff = None
    if cutoff is not None:
        sample.loc[sample.LINZ_MATCH_CODE > cutoff, "LINZ_MATCH_CODE"] = 10
    sample["LINZ_2ND_MATCH_CODE"] = minmatch
    updates.append(sample)
updates = pd.concat(updates)
updates

In [None]:
df.update(updates)

In [None]:
assert sum(df.LINZ_MATCH_CODE < df.LINZ_2ND_MATCH_CODE) == 0

In [None]:
df.LINZ_MATCH_CODE.value_counts()

In [None]:
# Codes 9 and 10 mean "no parcel": blank the parcel id and both address lists.
# One .loc assignment replaces three chained `df.col[mask] = value` statements,
# which raise SettingWithCopyWarning and do not modify df under copy-on-write.
df.loc[df.LINZ_MATCH_CODE >= 9, ["LINZ_parcel_ID", "addresses", "full_addresses"]] = np.nan

In [None]:
distances = df[~pd.isna(df.LINZ_parcel_ID)].distance(parcels.loc[df.LINZ_parcel_ID[~pd.isna(df.LINZ_parcel_ID)]], align=False)
display(distances)
display(distances[distances>0].describe())

In [None]:
df["distance_to_parcel"] = distances

In [None]:
print(sum((df.LINZ_MATCH_CODE < 7) & (df.addresses.str.len()!=df.full_addresses.str.len())))

In [None]:
non_na_parcel_ids = df.LINZ_parcel_ID[~pd.isna(df.LINZ_parcel_ID)]
full_addrs = parcels.full_addresses[non_na_parcel_ids]
full_addrs.index = non_na_parcel_ids.index
df.full_addresses = full_addrs
print(sum((df.LINZ_MATCH_CODE < 7) & (df.addresses.str.len()!=df.full_addresses.str.len())))
display(df)

In [None]:
def get_full_addr(row):
    """Return the full LINZ address that corresponds to the row's matched number_name.

    The position of `row.number_name` within `row.addresses` is used to pick the
    entry at the same position in `row.full_addresses`.

    When the lookup fails (key not in the list, missing lists, lists of different
    length) the row is displayed for inspection and None is returned. Only
    ordinary errors are handled: a bare `except:` also swallowed
    KeyboardInterrupt, which made a long progress_apply impossible to interrupt.
    """
    try:
        return row.full_addresses[row.addresses.index(row.number_name)]
    except Exception:
        display(row)
        return None
df["matched_address"] = df[df.LINZ_MATCH_CODE < 7].progress_apply(get_full_addr, axis=1)
df

In [None]:
%%time
phase0 = pd.read_csv("output/parcels_phase0.csv", index_col="LINZ_parcel_ID", low_memory=False, lineterminator="\n")
phase0

In [None]:
len(df), len(df.LINZ_parcel_ID.unique()), len(phase0), len(phase0.index.unique())

In [None]:
%%time
df = df.drop(columns="geometry").merge(phase0, on="LINZ_parcel_ID", how="left")
df

In [None]:
df.columns

In [None]:
df.to_csv("restricted/BCs_issued_by_AUP_TLADCs_2021DEC_augmented.csv", index=False)