In [1]:
# iPython script for selecting a subsample of zones from the rJourney
# households file. The complete file contains almost 5,000 zones, and
# inter-zone calculations are expensive. Reducing zone count drastically
# speeds up the program.

import pandas as pd
import numpy as np # required for np.random.choice

In [2]:
# This is the big 114M households file. Put it in the same folder as this script.
# NOTE: this is a 5GB file and requires at least twice as much available RAM on the
# host machine in order to process properly.
us_synpop_sorted = pd.read_csv('us_synpop_hh3_sorted.dat', sep="\t")

In [3]:
len(us_synpop_sorted)

114736858

In [4]:
# Check
us_synpop_sorted.head()

Unnamed: 0,hhid,hhtract,hhnuma,hhsize,hhworkers,hhnwkers,hhhaskids,hhderage,hhincome,hhexpfac
0,1125131,1073010500,0,2,0,1,1,46,7600,1
1,1125132,1073010500,0,2,0,1,1,35,14000,1
2,1125133,1073010500,0,2,1,0,1,35,2030,1
3,1125134,1073010500,0,2,1,0,1,27,11000,1
4,1125135,1073010500,0,2,1,0,1,54,12000,1


In [5]:
# Zones rolled up by how frequently they appear in the data
zone_counts = us_synpop_sorted['hhnuma'].value_counts().sort_index()

In [6]:
# Zones normalized to add up to 1
zone_probabilities = zone_counts/len(us_synpop_sorted)

In [7]:
# select a given number of sample zones, using zone counts as probability weights
num_zones = 200
sample_zones = np.random.choice(zone_counts.index, size=num_zones, p=zone_probabilities)
np.sort(sample_zones)

array([   7,   12,   41,   62,   64,   75,   78,   83,   85,  114,  166,
        175,  178,  187,  237,  250,  280,  281,  321,  332,  352,  356,
        358,  393,  397,  415,  424,  425,  436,  455,  499,  500,  501,
        505,  543,  548,  550,  552,  553,  567,  568,  581,  584,  592,
        600,  615,  630,  650,  655,  666,  711,  715,  743,  754,  755,
        763,  786,  829,  852,  858,  879,  891,  931,  933,  951,  957,
        965,  966,  967,  974, 1022, 1026, 1043, 1047, 1057, 1057, 1066,
       1068, 1080, 1082, 1094, 1096, 1105, 1106, 1125, 1135, 1146, 1169,
       1183, 1197, 1211, 1216, 1222, 1237, 1239, 1246, 1249, 1256, 1265,
       1268, 1270, 1272, 1298, 1303, 1308, 1329, 1332, 1332, 1341, 1353,
       1365, 1373, 1389, 1393, 1422, 1425, 1470, 1479, 1561, 1562, 1590,
       1595, 1596, 1614, 1660, 1730, 1731, 1810, 1938, 1942, 1945, 1961,
       2026, 2086, 2105, 2193, 2282, 2339, 2419, 2435, 2509, 2589, 2677,
       2694, 2730, 2825, 2938, 2968, 3038, 3054, 30

In [8]:
# Select all the households that are in the sample zones
us_synpop_sample = us_synpop_sorted.loc[us_synpop_sorted['hhnuma'].isin(sample_zones)]

In [9]:
len(us_synpop_sample)

10227956

In [11]:
us_synpop_sample.head()

Unnamed: 0,hhid,hhtract,hhnuma,hhsize,hhworkers,hhnwkers,hhhaskids,hhderage,hhincome,hhexpfac
302245,1843800,1097003800,7,2,0,1,1,32,0,1
302246,1843801,1097003800,7,2,0,1,1,55,12100,1
302247,1843802,1097003800,7,2,0,1,1,26,1400,1
302248,1843803,1097003800,7,2,0,1,1,40,0,1
302249,1843804,1097003800,7,2,0,1,1,47,3500,1


In [10]:
# Save the sample to a csv
us_synpop_sample.to_csv('us_synpop_hh3_200_zone_sample.dat', header=True, index=False, sep="\t")

In [None]:
# Some state code data in case we ever want to select zones from particular states.
# The first two digits of the hhtract value are the state code.
state_codes = pd.DataFrame([
['Louisiana','LA',22],
['Alabama','AL',1],
['Arizona','AZ',4],
['Arkansas','AR',5],
['California','CA',6],
['Colorado','CO',8],
['Connecticut','CT',9],
['Delaware','DE',10],
['Florida','FL',12],
['Georgia','GA',13],
['Hawaii','HI',15],
['Idaho','ID',16],
['Illinois','IL',17],
['Indiana','IN',18],
['Iowa','IA',19],
['Kansas','KS',20],
['Kentucky','KY',21],
['Maine','ME',23],
['Maryland','MD',24],
['Massachusetts','MA',25],
['Michigan','MI',26],
['Minnesota','MN',27],
['Mississippi','MS',28],
['Missouri','MO',29],
['Montana','MT',30],
['Nebraska','NE',31],
['Nevada','NV',32],
['New Hampshire','NH',33],
['New Jersey','NJ',34],
['New Mexico','NM',35],
['New York','NY',36],
['North Carolina','NC',37],
['North Dakota','ND',38],
['Ohio','OH',39],
['Oklahoma','OK',40],
['Oregon','OR',41],
['Pennsylvania','PA',42],
['Rhode Island','RI',44],
['South Carolina','SC',45],
['South Dakota','SD',46],
['Tennessee','TN',47],
['Texas','TX',48],
['Utah','UT',49],
['Vermont','VT',50],
['Virginia','VA',51],
['Washington','WA',53],
['West Virginia','WV',54],
['Wisconsin','WI',55],
['Wyoming','WY',56],
['Alaska','AK',2]],
index=None, columns=['Name','Alpha code','Numeric code'])

In [None]:
state_codes.sort_values('Numeric code')