#### Objective:- Clean and save needed attributes and create table for generating migration points.

In [1]:
import pandas as pd
import geopandas as gpd
import ast,os,random
pd.set_option('display.float_format','{:.1f}'.format)
import warnings
warnings.filterwarnings('ignore')
import cudf, cupy as cp
import numpy as np
import time
import math
import pickle
# pd.set_option('display.max_colwidth', -1)

#### Load data

In [2]:
df = pd.read_csv('data/mapped_blocks_full.csv',encoding='unicode_escape',usecols=['ID20','STATE','COUNTY','P20','eq_P10'])

In [3]:
df.P20.sum()

334735155

In [4]:
df.COUNTY.replace({r'[^\x00-\x7F]+':''},regex=True,inplace=True)
df.COUNTY.replace({r'([A-Z][a-z]+)([A-Z]+)':r'\1'},regex=True,inplace=True)

In [6]:
len(df)

8174955

In [7]:
df['block_diff'] = df['P20'] - df['eq_P10']
df['block_diff'] = df['block_diff'].round()
df['block_net'] = df['block_diff'].apply(lambda x: 1 if x>0 else ( -1 if x<0 else 0))
df.head()

Unnamed: 0,ID20,STATE,COUNTY,P20,eq_P10,block_diff,block_net
0,10010201001000,1,Autauga County,21,30.5,-10.0,-1
1,10010201001001,1,Autauga County,34,30.5,4.0,1
2,10010201001002,1,Autauga County,29,51.8,-23.0,-1
3,10010201001003,1,Autauga County,17,13.3,4.0,1
4,10010201001004,1,Autauga County,0,0.0,0.0,0


In [8]:
df['eq_P10'] = df['eq_P10'].round()
df['error'] = (df['P20']-df['eq_P10']) - df['block_diff']
df.head()

Unnamed: 0,ID20,STATE,COUNTY,P20,eq_P10,block_diff,block_net,error
0,10010201001000,1,Autauga County,21,30.0,-10.0,-1,1.0
1,10010201001001,1,Autauga County,34,30.0,4.0,1,0.0
2,10010201001002,1,Autauga County,29,52.0,-23.0,-1,0.0
3,10010201001003,1,Autauga County,17,13.0,4.0,1,0.0
4,10010201001004,1,Autauga County,0,0.0,0.0,0,0.0


In [9]:
df['eq_P10'] = df['eq_P10'] + df['error']
df[(df['P20']-df['eq_P10'])!=(df['block_diff'])]

Unnamed: 0,ID20,STATE,COUNTY,P20,eq_P10,block_diff,block_net,error


In [14]:
df[['ID20','COUNTY','P20','eq_P10','block_diff','block_net']].to_parquet('data/total_attr_gen_df.parquet') #save attributes to be added later

#### Attach county

In [2]:
df = pd.read_parquet('data/total_attr_gen_df.parquet')
df.head()

Unnamed: 0,ID20,COUNTY,P20,eq_P10,block_diff,block_net
0,10010201001000,Autauga County,21,31.0,-10.0,-1
1,10010201001001,Autauga County,34,30.0,4.0,1
2,10010201001002,Autauga County,29,52.0,-23.0,-1
3,10010201001003,Autauga County,17,13.0,4.0,1
4,10010201001004,Autauga County,0,0.0,0.0,0


In [3]:
def calculate_points(row):
    net = row[-1]
    p20 = row[0]
    p10 = row[1]
    if net < 0:
        return p20 + p10
    else: return p20

In [4]:
df['points'] = df[['P20','eq_P10','block_net']].apply(calculate_points,axis=1)

In [5]:
df

Unnamed: 0,ID20,COUNTY,P20,eq_P10,block_diff,block_net,points
0,10010201001000,Autauga County,21,31.0,-10.0,-1,52.0
1,10010201001001,Autauga County,34,30.0,4.0,1,34.0
2,10010201001002,Autauga County,29,52.0,-23.0,-1,81.0
3,10010201001003,Autauga County,17,13.0,4.0,1,17.0
4,10010201001004,Autauga County,0,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...
8174950,721537506022011,Yauco Municipio,27,6.0,21.0,1,27.0
8174951,721537506022012,Yauco Municipio,43,63.0,-20.0,-1,106.0
8174952,721537506022013,Yauco Municipio,195,341.0,-146.0,-1,536.0
8174953,721537506022014,Yauco Municipio,0,0.0,0.0,0,0.0


In [6]:
county2id = pickle.load(open('county2id.pkl','rb'))

In [10]:
df.COUNTY.value_counts().head(10)

Jefferson County      96055
Los Angeles County    91626
Cook County           85108
Washington County     75565
Montgomery County     66524
Maricopa County       61427
Franklin County       60891
Orange County         60830
Jackson County        60381
Wayne County          59249
Name: COUNTY, dtype: int64

In [22]:
df = df[df.points!=0].reset_index(drop=True)

In [23]:
df[df['COUNTY'] == 'Maricopa County'].points.sum()

6200461

In [14]:
df['points'] = df['points'].astype('int32')

In [20]:
counties = df[['COUNTY','points']].apply(lambda row: [county2id[row[0]]]*row[1],axis=1)

In [22]:
gcounties = cudf.from_pandas(counties)

In [25]:
counties_list = gcounties.explode().reset_index(drop=True)

In [27]:
pickle.dump(counties_list,open('county_list.pkl','wb'))

In [28]:
len(counties_list)

504475979

#### Continue making dataset for population gen

In [52]:
print(len(df))

8174955


In [53]:
df =df[df.points!=0]

In [54]:
print(len(df))

6265163


In [55]:
gen_df = df[['ID20','STATE','points']]

In [56]:
gen_df.to_csv('data/total_population_gen_df.csv')

In [61]:
len(gen_df)

6265163