In [1]:
import time
import numpy as np
import pandas as pd
#from dask import dataframe as dd

In [2]:
# Load the column schema for the raw data file. Later cells use its 'name' column
# as the CSV column names and its 'dtype' column to choose each column's dtype.
rd_schema = pd.read_json('../data/raw/pu2018_schema.json')

In [3]:
# Tally how many schema rows carry each type label (before conversion to pandas dtype names).
rd_schema['dtype'].value_counts()

integer    4880
float        37
string       25
Name: dtype, dtype: int64

In [4]:
# Translate the schema's type labels into pandas dtype names that read_csv accepts.
# 'Int64' and 'Float64' (capitalised) are pandas' nullable extension dtypes.
SCHEMA_TO_PANDAS_DTYPE = {'integer': 'Int64', 'string': 'object', 'float': 'Float64'}

# Values that are already pandas dtype names are passed through unchanged, so
# re-running this cell is a no-op instead of rewriting every row to 'ERROR'.
CONVERTED_DTYPES = set(SCHEMA_TO_PANDAS_DTYPE.values())

rd_schema['dtype'] = [
    SCHEMA_TO_PANDAS_DTYPE.get(label, label if label in CONVERTED_DTYPES else 'ERROR')
    if isinstance(label, str)
    else 'ERROR'  # non-string entries are flagged the same way as unknown labels
    for label in rd_schema['dtype']
]

In [5]:
# Re-tally after the conversion; any 'ERROR' entry here means an unrecognised schema label.
rd_schema['dtype'].value_counts()

Int64      4880
Float64      37
object       25
Name: dtype, dtype: int64

In [6]:
# Columns to keep when reading the raw file, grouped by purpose.
use_cols = (
    # Common case-identification variables
    ['SSUID', 'PNUM', 'MONTHCODE', 'ERESIDENCEID', 'ERELRPE', 'SPANEL', 'SWAVE']
    # The base weight and the monthly in-survey-universe indicator
    + ['WPFINWGT', 'RIN_UNIV']
    # Common demographics variables, including age at time of interview (TAGE)
    # and monthly age during the reference period (TAGE_EHC)
    + ['ESEX', 'TAGE', 'TAGE_EHC', 'ERACE', 'EORIGIN', 'EEDUC']
    # Additional variables for analysis
    + ['TPTOTINC', 'RTANF_MNYN']
)


In [9]:
# Read the pipe-delimited raw file with pandas, keeping only the columns in use_cols,
# and report how long the read took. The original author recorded about 69 seconds
# on their machine.
start = time.time()
column_dtypes = dict(zip(rd_schema['name'], rd_schema['dtype']))
df_data = pd.read_csv(
    "../data/raw/pu2018.csv",
    sep='|',
    header=0,  # row 0 is the file's own header; it is replaced by `names` below
    names=rd_schema['name'],
    usecols=use_cols,
    dtype=column_dtypes,
)
end = time.time()
print(f'Read with Pandas: {end - start} seconds')

Read with Pandas: 69.01866102218628 seconds


In [None]:
# --Alternative Method-- Load the dataset with Dask instead of pandas.
# Per the original author this is much faster, but the resulting object is not a pandas df.
# The code below is disabled: it is wrapped in a string literal, and the `dd` import in the
# first cell is commented out, so both must be restored before it can run.
# Note that `usecols` is commented out here, so this call does not restrict the read to use_cols,
# and the result is bound to `df` rather than `df_data`.
'''
start = time.time()
df = dd.read_csv("../data/raw/pu2018.csv",
                      names=rd_schema['name'],
                      dtype=dict([(i,v) for i,v in zip(rd_schema.name, rd_schema.dtype)]),
                      sep='|',
                      header=0,
                      #usecols=use_cols,
                      
                     )
end = time.time()

print('Read with dask: ', (end - start), 'seconds')
'''

In [14]:
# Summarise only the object-dtype columns (count / unique / top / freq);
# in the recorded output these are SSUID and ERESIDENCEID.
df_data.describe(include=[object])

Unnamed: 0,SSUID,ERESIDENCEID
count,763186,763186
unique,26215,10
top,88128566228518,100001
freq,168,685427


In [13]:
# Default summary statistics; the recorded output covers the 15 non-object columns.
# EEDUC and TPTOTINC show lower counts than the other columns there, i.e. they contain missing values.
df_data.describe()

Unnamed: 0,SPANEL,SWAVE,PNUM,ERELRPE,ESEX,EORIGIN,ERACE,EEDUC,MONTHCODE,WPFINWGT,RTANF_MNYN,RIN_UNIV,TAGE,TAGE_EHC,TPTOTINC
count,763186.0,763186.0,763186.0,763186.0,763186.0,763186.0,763186.0,625896.0,763186.0,763186.0,763186.0,763186.0,763186.0,763186.0,620018.0
mean,2018.0,1.0,102.149105,4.253721,1.516031,1.811624,1.364357,40.333218,6.51054,5034.392912,1.99391,1.004958,40.394934,39.599683,3575.846667
std,0.0,0.0,1.358387,3.47232,0.499743,0.391012,0.775861,2.864025,3.452085,1589.148885,0.077802,0.070239,23.419359,23.428759,6822.266449
min,2018.0,1.0,101.0,1.0,1.0,1.0,1.0,31.0,1.0,0.0,1.0,1.0,0.0,0.0,-126633.0
25%,2018.0,1.0,101.0,1.0,1.0,2.0,1.0,39.0,4.0,3826.676848,2.0,1.0,20.0,19.0,722.0
50%,2018.0,1.0,102.0,3.0,2.0,2.0,1.0,41.0,7.0,4832.714481,2.0,1.0,40.0,39.0,2112.0
75%,2018.0,1.0,103.0,7.0,2.0,2.0,1.0,43.0,10.0,6040.97744,2.0,1.0,60.0,59.0,4336.0
max,2018.0,1.0,114.0,18.0,2.0,2.0,4.0,46.0,12.0,24164.915109,2.0,2.0,87.0,87.0,518825.0
