In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import json
import geocoder
import pandas as pd

# Survey responses. Later cells read the columns Id, Wave, Census.Born.L2 and
# REGC_2006 from this frame, and the migration cell walks neighbouring rows, so it
# presumably holds one row per (Id, Wave) sorted in that order — TODO confirm.
df = pd.read_csv("nick.nzavs.long.csv")
# Regional-council centroids; later cells use the Name, X and Y columns.
regc_centroids = pd.read_csv("regc_centroids.csv")

In [None]:
# Preview: first row per Id among the rows that have a level-2 birthplace code.
(
    df
    .dropna(subset=["Census.Born.L2"])
    .drop_duplicates(subset=["Id"])
)

In [None]:
# Maps the integer codes found in df["Census.Born.L2"] to birthplace labels.
# Entries are grouped by leading digit; the x0 code carries the group's umbrella label.
# NOTE(review): there is no 60 entry (61 is the only code in the 6x group), unlike
# every other group — confirm that 60 never occurs in the data, because the cells
# that index this dict raise KeyError on an unknown code.
l2_lookup = {
    0: "Supplementary codes",
    # 1x
    10: "Oceania and Antarctica",
    11: "Australia",
    12: "New Zealand",
    13: "Melanesia",
    14: "Micronesia",
    15: "Polynesia",
    16: "Antarctica",
    # 2x
    20: "North-West Europe",
    21: "United Kingdom",
    22: "Ireland",
    23: "Western Europe",
    24: "Northern Europe",
    # 3x
    30: "Southern and Eastern Europe",
    31: "Southern Europe",
    32: "South Eastern Europe",
    33: "Eastern Europe",
    # 4x
    40: "North Africa and the Middle East",
    41: "North Africa",
    42: "Middle East",
    # 5x
    50: "South-East Asia",
    51: "Mainland South-East Asia",
    52: "Maritime South-East Asia",
    # 6x
    61: "North-East Asia",
    # 7x
    70: "Southern and Central Asia",
    71: "Southern Asia",
    72: "Central Asia",
    # 8x
    80: "The Americas",
    81: "Northern America",
    82: "South America",
    83: "Central America",
    84: "Caribbean",
    # 9x
    90: "Sub-Saharan Africa",
    91: "Central and West Africa",
    92: "Southern and East Africa",
}

In [None]:
# Number of distinct Ids per birthplace label (first non-null row per Id is used).
(
    df
    .dropna(subset=["Census.Born.L2"])
    .drop_duplicates(subset=["Id"])
    ["Census.Born.L2"]
    .astype(int)
    .apply(lambda code: l2_lookup[code])
    .value_counts()
)

In [None]:
# Geocode every birthplace label that occurs in the data to a single lat/lng point.
#
# This cell used to call `gmaps.geocode(...)`, but no `gmaps` client is created
# anywhere in the notebook, so it raised NameError on a fresh kernel. It now goes
# through the `geocoder` package that the first cell already imports. Per the
# geocoder documentation its Google provider takes the API key from the
# GOOGLE_API_KEY environment variable, so no key is written into the notebook.
countries = (
    df["Census.Born.L2"]
    .dropna()
    .astype(int)
    .apply(lambda x: l2_lookup[x])
    .unique()
)
display(countries)

lookup = {}
for country in tqdm(countries):
    g = geocoder.google(country)
    print(country, g)
    # Labels that cannot be resolved are simply left out of `lookup`.
    if g.ok:
        # Stored as {"lat": ..., "lng": ...}, the same shape as the hard-coded
        # `lookup` literal used by the export cell.
        lookup[country] = {"lat": g.lat, "lng": g.lng}

In [None]:
# Show the current label -> {"lat", "lng"} mapping held in `lookup`.
display(lookup)

In [None]:
# Fixed label -> coordinates table; assigning it here replaces whatever `lookup`
# held before, so the cells below do not depend on a live geocoding run.
# NOTE(review): these values look like pasted output of the geocoding cell — confirm.
# Labels from l2_lookup that are absent here (e.g. "Sub-Saharan Africa") have no
# coordinates available.
lookup = {
    "New Zealand": {"lat": -40.900557, "lng": 174.885971},
    "Ireland": {"lat": 53.1423672, "lng": -7.692053599999999},
    "United Kingdom": {"lat": 55.378051, "lng": -3.435973},
    "Western Europe": {"lat": 46.2021848, "lng": 1.2643875},
    "Polynesia": {"lat": -16.8395019, "lng": -148.3716902},
    "Maritime South-East Asia": {"lat": 0.1065949, "lng": 99.82776299999999},
    "Australia": {"lat": -25.274398, "lng": 133.775136},
    "North-East Asia": {"lat": 34.047863, "lng": 100.6196553},
    "Northern America": {"lat": 37.09024, "lng": -95.712891},
    "South Eastern Europe": {"lat": 41.8101472, "lng": 21.0937311},
    "South America": {"lat": -8.783195, "lng": -55.491477},
    "Southern and East Africa": {"lat": 0.3504167, "lng": 32.5970189},
    "Southern Asia": {"lat": 25.03764, "lng": 76.4563087},
    "Middle East": {"lat": 29.2985278, "lng": 42.5509603},
    "Southern Europe": {"lat": 41.2745004, "lng": -1.2121322},
    "Central and West Africa": {"lat": 13.531665, "lng": -2.4604145},
    "Northern Europe": {"lat": 62.27864750000001, "lng": 12.3401709},
    "Eastern Europe": {"lat": 52.0055148, "lng": 37.95874939999999},
    "Mainland South-East Asia": {"lat": 14.7145173, "lng": 102.0718281},
    "Central Asia": {"lat": 45.4506875, "lng": 68.8319005},
    "Melanesia": {"lat": -8.190185699999999, "lng": 152.8264684},
    "North Africa": {"lat": 26.0197776, "lng": 32.277834},
    "South-East Asia": {"lat": -2.2179704, "lng": 115.66283},
    "Central America": {"lat": 12.7690126, "lng": -85.60236429999999},
    "Caribbean": {"lat": 21.4691137, "lng": -78.6568942},
    "The Americas": {"lat": 54.5259614, "lng": -105.2551187},
    "North Africa and the Middle East": {"lat": 29.2985278, "lng": 42.5509603},
    "Micronesia": {"lat": 7.425554, "lng": 150.550812},
}

In [None]:
# Write birth_countries.json: one record per birthplace label with its share of
# distinct Ids (as a "12.3456%" string) and its coordinates.
countries = (
    df.dropna(subset=["Census.Born.L2"])
    .drop_duplicates(subset=["Id"])["Census.Born.L2"]
    .astype(int)
    .apply(lambda x: l2_lookup[x])
    .value_counts()
)

# The denominator is the same for every label, so compute it once instead of
# re-summing the whole Series on each loop iteration.
total_participants = countries.sum()

output = [
    {
        "country": country,
        "pct": str(round(count / total_participants * 100, 4)) + "%",
        # Labels with no entry in `lookup` are exported with latlong = null.
        "latlong": lookup.get(country),
    }
    for country, count in countries.items()
]
display(output)
with open("birth_countries.json", "w") as f:
    json.dump(output, f)

In [None]:
# List the available column names of the survey frame.
df.columns

In [None]:
# Region code -> region name. The centroid table names each region as
# "<name> Region", which is how the two are joined below.
avsregc_lookup = {
    1: "Auckland",
    2: "Bay of Plenty",
    3: "Canterbury",
    4: "Gisborne",
    5: "Hawke's Bay",
    6: "Manawatu-Wanganui",
    7: "Marlborough",
    8: "Nelson",
    9: "Northland",
    10: "Otago",
    11: "Southland",
    12: "Taranaki",
    13: "Tasman",
    14: "Waikato",
    15: "Wellington",
    16: "West Coast"
}

# Build {code: {"name", "lat", "lng"}} from the centroid table and write it to
# regions.json (json.dump turns the integer keys into strings).
regions = {}
for k, v in avsregc_lookup.items():
    centroid_name = v + " Region"
    spatial = regc_centroids[regc_centroids.Name == centroid_name]
    if spatial.empty:
        # Same exception type as the bare .iloc[0] would raise, but it says
        # which region name failed to match.
        raise IndexError(f"regc_centroids has no row named {centroid_name!r}")
    regions[k] = {
        "name": v,
        # Y is used as latitude and X as longitude. Cast to built-in float so the
        # payload does not depend on numpy scalar types when printed or serialized.
        "lat": float(spatial.Y.iloc[0]),
        "lng": float(spatial.X.iloc[0]),
    }
print(regions)
with open("regions.json", "w") as f:
    json.dump(regions, f)

In [None]:
# Count region-to-region transitions between consecutive waves of the same Id.
#
# Result shape: yearMigrations[str(earlier wave)]["<origin>><destination>"] = count.
# A pair of neighbouring rows is counted when both rows have the same Id and the
# second row's Wave is exactly the first row's Wave + 1. Pairs where origin and
# destination are the same region are counted as well.
# Assumes rows of the same Id are adjacent and ordered by Wave — TODO confirm.
#
# Neighbouring rows are paired by position on plain arrays. Label-based access
# (`df.Id[i]`) is far slower per element and only pairs the right rows when the
# frame still has its default 0..n-1 index.
yearMigrations = {}
person_ids = df["Id"].to_numpy()
wave_values = df["Wave"].to_numpy()
regc_codes = df["REGC_2006"].to_numpy()

for i in tqdm(range(1, len(df))):
    same_person = person_ids[i] == person_ids[i - 1]
    next_wave = wave_values[i] == wave_values[i - 1] + 1
    if not (same_person and next_wave):
        continue
    try:
        originId = int(regc_codes[i - 1])
        destinationId = int(regc_codes[i])
    except ValueError:
        # A missing region code (NaN) cannot be converted; skip this pair.
        continue
    year = str(wave_values[i - 1])
    migrationId = f"{originId}>{destinationId}"
    yearCounts = yearMigrations.setdefault(year, {})
    yearCounts[migrationId] = yearCounts.get(migrationId, 0) + 1


In [None]:
# Replace each year's transition counts with their share of that year's total,
# formatted as a "12.3456%" string. The conversion happens in place.
for year, migrations in yearMigrations.items():
    # Because the counts are overwritten with strings, running this cell a second
    # time used to fail in sum() with a TypeError. Years that already hold
    # formatted strings are left untouched, so a re-run is a no-op.
    if any(isinstance(value, str) for value in migrations.values()):
        continue
    yearTotal = sum(migrations.values())
    # Only existing keys are reassigned, so the dict does not change size while
    # it is being iterated.
    for migrationId, count in migrations.items():
        migrations[migrationId] = str(round(count / yearTotal * 100, 4)) + "%"

In [None]:
# Write the current contents of yearMigrations to migrations.json.
with open("migrations.json", "w") as f:
    json.dump(yearMigrations, f)