In [2]:
import pandas as pd
import numpy as np
from scipy.stats import truncnorm
import re
import random
import geopandas as gpd
from numba import jit, njit
from shapely import MultiPoint, Point
from shapely.ops import nearest_points

In [3]:
# Paths of the three input CSVs read further down (dwellings, jobs, persons).
dwellings_file = './Output/dd_2006_snapped_unlinked.csv'
jobs_file = './Output/jj_2006_snapped_unlinked.csv'
persons_file = './Output/pp_2006_unlinked.csv'

In [4]:
# Job-type labels. A label's position in this list is the integer code stored in the jobs' type
# column (see convert_job_type_to_int / convert_int_to_type); -1 / '?' stand for "unknown".
job_type_indexes = ["Agri","Mnft","Util","Cons","Retl","Trns","Finc","Rlst","Admn","Serv"]

In [5]:
# Zone shapefile; its CTUID column is used below to keep only jobs located in a known zone.
zones = gpd.read_file('../GeoFiles/Montreal_CT_w_csd_2006.shp')

In [7]:
# Quick look at the zones table dimensions (rows, columns).
zones.shape

(913, 5)

In [55]:
# --- Dwellings: keep the column index for name lookups, work on the raw ndarray ---
dwellings = pd.read_csv(dwellings_file)
d_cols = dwellings.columns
dwellings: np.ndarray = dwellings.values


def d(x):
    """Return the integer position of dwelling column `x`."""
    return d_cols.get_loc(x)


# --- Persons: same idea, but stored as float32 ---
persons = pd.read_csv(persons_file)
# nats = persons['nationality']
p_cols = persons.columns
persons: np.ndarray = persons[p_cols].values.astype(np.float32)


def p(x):
    """Return the integer position of person column `x`."""
    return p_cols.get_loc(x)


# --- Households: only the column layout is taken from this (empty) frame ---
households_df = pd.DataFrame(columns=['id', 'dwelling', 'hhSize', 'autos'], dtype=int)
h_cols = households_df.columns


def h(x):
    """Return the integer position of household column `x`."""
    return h_cols.get_loc(x)


# --- Jobs: drop jobs whose zone is not among the zones' CTUID values (compared as strings) ---
jobs = pd.read_csv(jobs_file)
known_zone = jobs['zone'].astype(str).isin(zones['CTUID'].astype(str))
jobs = jobs[known_zone]
j_cols = jobs.columns
jobs = jobs.values



In [56]:
# def convert_job_int_to_type(job):
#     typ = int(float(job[3]))
#     if typ != -1:
#         job[3] = job_type_indexes[typ]
#     else:
#         job[3] = '?'
#     return job

In [57]:
def convert_int_to_type(job):
    """Return the job-type label for a job row.

    `job[3]` holds the type code (possibly as a float or numeric string); it is
    truncated to an int and looked up in `job_type_indexes`. The code -1 maps to '?'.
    """
    code = int(job[3])  # stored value may be a float
    return '?' if code == -1 else job_type_indexes[code]

In [58]:
def convert_job_type_to_int(job):
    """Replace the type label in `job[3]` with its integer code, in place.

    The code is the label's position in `job_type_indexes`; anything that is not a
    known label becomes -1. The same (mutated) row object is returned.
    """
    label = job[3]
    job[3] = job_type_indexes.index(label) if label in job_type_indexes else -1
    return job

In [59]:
# Replace every job's type label (column 3) with its integer code.
# The returned array is only displayed, not assigned: later cells rely on
# convert_job_type_to_int having edited the rows of `jobs` in place.
# NOTE(review): that depends on apply_along_axis handing the function views of `jobs` — confirm.
np.apply_along_axis(convert_job_type_to_int, 1, jobs)

array([[0, 24660061, -1, ..., 193044.04741146124, 32400, 28800],
       [1, 24660061, -1, ..., 193044.04741146124, 32400, 28800],
       [2, 24660061, -1, ..., 193044.04741146124, 32400, 28800],
       ...,
       [1699442, 24590189, -1, ..., 210073.2706329787, 32400, 28800],
       [1699443, 24590189, -1, ..., 210084.9938897257, 32400, 28800],
       [1699444, 24590189, -1, ..., 210073.2706329787, 32400, 28800]],
      dtype=object)

In [75]:
# Cast the whole jobs table from object dtype to int32 (the compiled helpers below need a numeric array).
# NOTE(review): columns holding fractional values (see the floats in the output above) lose their decimals here.
jobs = jobs.astype(np.int32)

In [82]:
@jit(nopython=True)
def _find_partner_in(people, n, person):
    """Compiled scan over `people`, starting at row `n`, for a partner candidate.

    A candidate has column 4 == 1 (married), column 1 == -1 (no household yet) and a
    column-2 value within 10 of `person`'s (presumably age — confirm against the CSV).
    Returns the row index, or None when no row qualifies.
    """
    for i in range(n, people.shape[0]):
        if people[i][4] == 1 and people[i][1] == -1 and \
                abs(person[2] - people[i][2]) < 10:
            return i  # row index, not the person id
    return None


def findPartner(n, person):
    """Return the row index of the first eligible partner at or after row `n`, or None.

    The array is handed to the compiled helper as an argument on every call.
    Numba treats module-level variables as compile-time constants, so a jitted
    function that reads the global `persons` directly keeps scanning the snapshot
    taken when it was compiled and never sees households assigned afterwards.
    """
    return _find_partner_in(persons, n, person)

In [83]:
@jit(nopython=True)
def _find_roommate_in(people, n, person):
    """Compiled scan over `people`, starting at row `n`, for a roommate candidate.

    A candidate has column 4 == 0 (single), column 1 == -1 (no household yet) and a
    column-2 value within 10 of `person`'s (presumably age — confirm against the CSV).
    Returns the row index, or None when no row qualifies.
    """
    for i in range(n, people.shape[0]):
        if people[i][4] == 0 and people[i][1] == -1 and \
                abs(person[2] - people[i][2]) < 10:
            return i  # row index, not the person id
    return None


def findRoommate(n, person):
    """Return the row index of the first eligible roommate at or after row `n`, or None.

    The array is handed to the compiled helper as an argument on every call.
    Numba treats module-level variables as compile-time constants, so a jitted
    function that reads the global `persons` directly keeps scanning the snapshot
    taken when it was compiled and would keep returning already-housed people.
    """
    return _find_roommate_in(persons, n, person)

In [84]:
@jit(nopython=True)
def _first_lonely_child_in(people, n):
    """Compiled scan from row `n` for a child (column 4 == 2) without a household (column 1 == -1).

    Returns the value of column 0 of that row as an int, or None when there is none.
    """
    for i in range(n, people.shape[0]):
        if people[i][1] == -1 and people[i][4] == 2:
            return int(people[i][0])
    return None


def firstLonelyChild(n):
    """Return the id (column 0) of the first unhoused child at or after row `n`, or None.

    Callers in this notebook use the returned id as a row index, so they assume
    id == row position in `persons` — TODO confirm for the input file.

    The live `persons` array is passed to the compiled helper on every call because
    numba freezes module-level variables at compile time; reading the global inside
    the jitted code would ignore households assigned after compilation.
    """
    return _first_lonely_child_in(persons, n)

In [85]:
@jit(nopython=True)
def _next_unhoused_person_in(people, n):
    """Compiled scan from row `n` for a non-child (column 4 != 2) without a household (column 1 == -1).

    Returns the value of column 0 of that row as an int, or None when there is none.
    """
    for i in range(n, people.shape[0]):
        if people[i][1] == -1 and people[i][4] != 2:
            return int(people[i][0])
    return None


def nextUnhousedPerson(n):
    """Return the id (column 0) of the first unhoused non-child at or after row `n`, or None.

    Callers in this notebook use the returned id as a row index, so they assume
    id == row position in `persons` — TODO confirm for the input file.

    The live `persons` array is passed to the compiled helper on every call because
    numba freezes module-level variables at compile time; reading the global inside
    the jitted code would ignore households assigned after compilation.
    """
    return _next_unhoused_person_in(persons, n)

In [86]:
# Needs a numeric (not object-dtype) jobs array so numba can compile the scan.
@jit(nopython=True)
def _next_open_job_in(job_rows, n):
    """Compiled scan from row `n` for a job whose column 2 (personId) is still -1.

    Returns the value of column 0 of that row as an int, or None when every job is taken.
    """
    for i in range(n, job_rows.shape[0]):
        if job_rows[i][2] == -1:
            return int(job_rows[i][0])
    return None


def getNextJob(n):
    """Return the id (column 0) of the first unassigned job at or after row `n`, or None.

    The live `jobs` array is passed to the compiled helper on every call. Numba treats
    module-level variables as compile-time constants, so a jitted function reading the
    global `jobs` directly would keep seeing the jobs as they were at compile time and
    could hand out the same job again and again.
    """
    return _next_open_job_in(jobs, n)

<h1>GIVE PERSONS JOBS</h1>

In [87]:
# Give every person who can work (column 5 == 1) and has no workplace yet (column 7 == -1)
# the next open job (jobs column 2 == -1), both taken in row order.
#
# Everything is done with row *positions*. The previous loop used the job id returned by
# getNextJob as a row index into `jobs`, which is only valid while id == row position;
# that no longer holds once the zone filter applied at load time drops rows.
people_needing_job = np.flatnonzero((persons[:, 5] == 1) & (persons[:, 7] == -1))
open_job_rows = np.flatnonzero(jobs[:, 2] == -1)

# If there are fewer open jobs than candidates, the remaining people stay unemployed.
num_jobs_assigned = min(people_needing_job.size, open_job_rows.size)
matched_people = people_needing_job[:num_jobs_assigned]
matched_jobs = open_job_rows[:num_jobs_assigned]

persons[matched_people, 7] = jobs[matched_jobs, 0]   # person.workplace <- job id
jobs[matched_jobs, 2] = persons[matched_people, 0]   # job.personId    <- person id

# Kept for compatibility with cells that may read these bookkeeping names.
lastJob = int(jobs[matched_jobs[-1], 0]) if num_jobs_assigned else 0
last_i = max(persons.shape[0] - 1, 0)
    

In [90]:
# Sanity check: how many persons can hold a job (column 5 == 1), and how many of
# those now have a workplace (column 7 != -1). The two numbers should match.
can_work = persons[:, 5] == 1
has_workplace = persons[:, 7] != -1
able_to_have_job = int(np.count_nonzero(can_work))
with_jobs = int(np.count_nonzero(can_work & has_workplace))

print(with_jobs)
print(able_to_have_job)

1232656
1232656


In [92]:
# Rebuild a typed DataFrame from the int32 jobs array for export.
outputJobs = pd.DataFrame(columns=j_cols, data=jobs)
outputJobs = outputJobs.astype({'id': 'int32', 'zone': 'str', 'type': 'str', 'personId': 'int32', 'startTime': 'int32', 'duration': 'int32'})
# Turn the integer type codes back into labels. A plain list is used instead of
# np.apply_along_axis: that function allocates its output with the dtype of the FIRST
# result, so if the first job is '?' (a 1-character string) every later label would be
# silently truncated to one character.
outputJobs['type'] = [convert_int_to_type(job_row) for job_row in jobs]


In [94]:

# Write the jobs table, now linked to persons via personId.
outputJobs.to_csv('./Output/Final/jj_2006_snapped_linked_new.csv', index=False)

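HOUSEHOLD SYNTHESIS

For each dwelling:
- create a household, which is either
  - a single person with 0-1 kids (the code below also adds 0-2 roommates),
  - OR a married couple with 0-2 kids (note: the code below draws `random.randint(0,3)`, which allows up to 3).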

In [95]:
# Preallocate one household row (id, dwelling, hhSize, autos) per dwelling. np.empty leaves the
# contents uninitialised, so only the rows filled in by the synthesis loop below are meaningful.
households = np.empty([dwellings.shape[0],4])

In [96]:
# Walk over the dwellings and build one household per dwelling around the next unhoused adult.
#   * married head: look for a partner, then 0-3 children (0-1 if no partner is found, in
#     which case the head is re-labelled single);
#   * single head: 0-1 child and 0-2 roommates.
# Household size and the upper bound for `autos` count only the members that were actually
# found and assigned; the previous version added the *requested* numbers even when the
# search came back empty, so hhSize could exceed the persons carrying that hhId.
# NOTE(review): the notes above this cell say "0-2 kids" for couples while randint(0,3)
# allows 3 — left as is, confirm which one is intended.
householdNumber = 0
personNum = 0
lastChildIndex = 0
err = 1 # exhausted dwellings, extra people
totalPersons = persons.shape[0]

# Column positions are loop-invariant, so resolve them once.
hhid_col = p('hhId')
relationship_col = p('relationship')
dwelling_id_col = d('id')
dwelling_hhid_col = d('hhId')

for i, dwelling in enumerate(dwellings): # for each dwelling
    hhId = householdNumber
    if personNum >= totalPersons:
        err = 2
        break
    if persons[personNum][1] == -1 and persons[personNum][4] != 2: # current person is unhoused and not a child
        person = persons[personNum]
    else:
        # skip ahead to the next unhoused adult to save time
        next_person = nextUnhousedPerson(personNum)
        if next_person is None:
            err = 2 # exhausted people, extra dwellings
            break
        person = persons[next_person]
        personNum = int(person[0])

    persons[personNum][hhid_col] = hhId
    numAdults = 1   # the household head
    numKids = 0     # children actually placed in this household

    if person[relationship_col] == 1: # married
        partner = findPartner(int(personNum), person)
        if partner is None: # can't find partner for person
            persons[personNum][relationship_col] = 0 # set person to single
            kidsWanted = random.randint(0, 1)
        else:
            persons[int(partner)][1] = hhId
            numAdults += 1
            kidsWanted = random.randint(0, 3)

        for _ in range(kidsWanted):
            child_ind = firstLonelyChild(lastChildIndex)
            if child_ind is None:
                break # no unhoused children left; further searches would fail too
            persons[child_ind][1] = hhId
            lastChildIndex = child_ind
            numKids += 1

    elif person[relationship_col] == 0: # single
        kidsWanted = random.randint(0, 1)
        roommatesWanted = random.randint(0, 2)
        if kidsWanted > 0:
            child_ind = firstLonelyChild(lastChildIndex)
            if child_ind is not None:
                persons[child_ind][1] = hhId
                lastChildIndex = child_ind
                numKids += 1

        for _ in range(roommatesWanted):
            roommate_ind = findRoommate(personNum + 1, person)
            if roommate_ind is None:
                break # nobody suitable left
            persons[roommate_ind][1] = hhId
            numAdults += 1

    hhSize = numAdults + numKids

    households[householdNumber][0] = hhId
    households[householdNumber][1] = dwelling[dwelling_id_col]
    households[householdNumber][2] = hhSize
    households[householdNumber][3] = random.randint(0, numAdults) # at most one car per adult
    householdNumber += 1
    personNum += 1
    dwellings[i][dwelling_hhid_col] = hhId

if err == 1:
    print("Exhausted Dwellings, extra people with no dwelling")
    print('At person', personNum)
elif err == 2:
    print("All poeple housed, extra dwellings available")


All poeple housed, extra dwellings available


In [97]:
# Keep only the rows that were actually filled (one per created household).
households = households[:householdNumber]

In [98]:
# Write the households table (all four columns as integers).
pd.DataFrame(data=households, columns=h_cols).astype(int).to_csv('./Output/Final/hh_2006.csv', index=False)

In [99]:
# Labels for the numeric dwelling-type code stored in column 2; any other value maps to ''.
dwelling_type_labels = {0: 'SFD', 1: 'SFA', 2: 'MF234', 3: 'MF5plus', 4: 'MH'}
dwelling_type_conversions = [dwelling_type_labels.get(row[2], '') for row in dwellings]

In [100]:

# Rebuild a typed dwellings frame, swap the numeric type code for its label, and export it.
dwelling_int_columns = ['id', 'zone', 'hhId', 'bedrooms', 'quality', 'monthlyCost',
                        'restriction', 'yearBuilt', 'floor', 'building', 'usage']
out_dwel = (
    pd.DataFrame(data=dwellings, columns=d_cols)
    .astype({column: 'int32' for column in dwelling_int_columns})
    .assign(type=dwelling_type_conversions)
)
out_dwel.to_csv('./Output/Final/dd_2006_snapped_linked.csv', index=False)

GENERATE NATIONALITIES

In [101]:
# Raw "<name> <count>" table (names may contain spaces; the separator is a space or a tab).
tmp = """Canadian 1,670,655
French 870,245
Italian	279,800
Irish 239,460
English 138,320
Haitian 132,255
Scottish 124,130
Chinese	108,775
First Nations 101,915
Québécois 92,115
German 86,025
Algerian 84,585
Moroccan 77,450
Spanish	68,600
Greek 66,395
Lebanese 68,765
Polish 64,895
Portuguese 56,405
Russian	49,275
East Indian	48,485
Romanian 47,980
Vietnamese 38,660
Filipino 35,685
Ukrainian 35,050
Belgian	31,840"""

# Normalise separators and drop the thousands commas, one entry per line.
countries = tmp.replace('\t', ' ').replace(',', '').split('\n')

# Parse every line once: the name is everything before the single separator that
# precedes the first digit, the count is everything from the first digit on.
name_counts = []
for entry in countries:
    first_digit = re.search('[0-9]', entry).start()
    name_counts.append((entry[:first_digit-1], int(entry[first_digit:])))

total_nationalities_pop = sum(count for _, count in name_counts)

# Share of each nationality in the listed total; used as sampling weights below.
nationalities = {name: count/total_nationalities_pop for name, count in name_counts}

print(nationalities)
# Draw one nationality per person, with replacement, proportionally to the shares.
nationalities_list = random.choices(list(nationalities.keys()), weights=list(nationalities.values()),k=persons.shape[0])

{'Canadian': 0.3617886574998944, 'French': 0.18845588720950504, 'Italian': 0.06059208296654334, 'Irish': 0.05185625513641339, 'English': 0.029953884617341938, 'Haitian': 0.028640478673124335, 'Scottish': 0.026880969473327465, 'Chinese': 0.023555767779434425, 'First Nations': 0.022070200627359773, 'Québécois': 0.0199479618386817, 'German': 0.018629142020003184, 'Algerian': 0.018317302851054568, 'Moroccan': 0.01677218307990987, 'Spanish': 0.014855671520746508, 'Greek': 0.014378167793293942, 'Lebanese': 0.014891403092188537, 'Polish': 0.014053335325639136, 'Portuguese': 0.012214783558712927, 'Russian': 0.010670746562460412, 'East Indian': 0.010499668129495546, 'Romanian': 0.010390307865385095, 'Vietnamese': 0.00837201546635656, 'Filipino': 0.00772776440550786, 'Ukrainian': 0.007590251994200658, 'Belgian': 0.006895110513419371}


In [102]:
# Translate each person's numeric relationship code (column 4) into its label.
# The previous loop iterated over `persons` but tested the stale global `person`
# left over from the household-synthesis cell, so every row received the same label.
relationship_labels = {0: 'SINGLE', 1: 'MARRIED', 2: 'CHILD'}
# float() turns the float32 cell into a plain number for the lookup; any other code maps to ''.
personRelationshipConverts = [relationship_labels.get(float(row[4]), '') for row in persons]

In [103]:
# Rebuild the persons frame, attach the sampled nationality and the relationship label,
# cast the remaining columns to their export dtypes, and write the result.
outputPersons = (
    pd.DataFrame(data=persons, columns=p_cols)
    .assign(nationality=nationalities_list, relationship=personRelationshipConverts)
    .astype({'id': 'int32', 'hhId': 'int32', 'age': 'int32', 'gender': 'int32', 'relationship': 'object', 'occupation': 'int32', 'occupation_type': 'int32', 'workplace': 'int32', 'income': 'int32', 'schoolplace': 'int32'})
)
outputPersons.to_csv('./Output/Final/pp_2006_linked_jobs.csv', index=False)