In [12]:
import pandas as pd
import numpy as np

## 01. Retrieve data from postgres

In [13]:
from config import *

In [14]:
from urllib.parse import quote
from sqlalchemy import create_engine

# Percent-encode the credentials before placing them in the URL: characters such
# as '@', ':' or '/' in a raw user name or password would otherwise be read as
# URL delimiters and break the connection string. Plain alphanumeric credentials
# are left unchanged by the encoding.
# `param_dic` is expected to come from the star-import of the local `config` module.
db_user = quote(str(param_dic["user"]), safe="")
db_password = quote(str(param_dic["password"]), safe="")

engine = create_engine(f'postgresql://{db_user}:{db_password}@localhost:5432/{param_dic["database"]}')
# Open one connection that the query cell below reuses.
connection = engine.connect()

In [15]:
# Pull every row and column of the `unemp_raw` table over the open connection.
raw_df = pd.read_sql(sql='select * from unemp_raw', con=connection)
# Bare last expression so the notebook renders the frame.
raw_df

Unnamed: 0,PWCMPWGT,PWSSWGT,PREMPNOT,PESEX,PEEDUCA,PTDTRACE,month
0,17347552,17713809,4,1,38,2,jan
1,16756084,16864805,4,2,40,2,jan
2,21463402,20481802,4,1,40,1,jan
3,30966041,30137016,1,1,39,1,jan
4,17590812,17309391,1,2,43,1,jan
...,...,...,...,...,...,...,...
665457,3608122,3554878,1,1,39,1,may
665458,-1,-1,-1,-1,-1,-1,may
665459,-1,-1,-1,-1,-1,-1,may
665460,-1,-1,-1,-1,-1,-1,may


In [16]:
# Give the source column codes readable names. The rename is keyed on the
# source column name rather than on position, so a change in the column order
# returned by `select *` cannot silently attach a label to the wrong data.
# `month` already has its final name and is left as is.
raw_df = raw_df.rename(columns={
    'PWCMPWGT': 'weight_cf',
    'PWSSWGT': 'weight',
    'PREMPNOT': 'emp',
    'PESEX': 'sex',
    'PEEDUCA': 'educa',
    'PTDTRACE': 'race',
})
raw_df.head()

Unnamed: 0,weight_cf,weight,emp,sex,educa,race,month
0,17347552,17713809,4,1,38,2,jan
1,16756084,16864805,4,2,40,2,jan
2,21463402,20481802,4,1,40,1,jan
3,30966041,30137016,1,1,39,1,jan
4,17590812,17309391,1,2,43,1,jan


## 02. Calculate non-seasonally adjusted unemployment rate

In [17]:
# Keep only the rows with a strictly positive `weight_cf`; this removes rows
# such as the ones filled with -1 in the preview of the raw table.
raw_df = raw_df.loc[raw_df['weight_cf'].gt(0)]

Here is how the weights work: for example, if a person has a weight of 1,500, then that person theoretically represents 1,500 persons in the population.

In practice, the stored weights are scaled up by a factor of 10,000.

I scale them back down and convert the units to thousands, i.e. I divide the summed weights by 10,000 × 1,000 = 10,000,000.

In [18]:
# Preview the first ten rows that survived the weight filter.
raw_df.head(n=10)

Unnamed: 0,weight_cf,weight,emp,sex,educa,race,month
0,17347552,17713809,4,1,38,2,jan
1,16756084,16864805,4,2,40,2,jan
2,21463402,20481802,4,1,40,1,jan
3,30966041,30137016,1,1,39,1,jan
4,17590812,17309391,1,2,43,1,jan
5,22252360,21931976,1,1,39,2,jan
6,20126642,19718618,1,2,39,2,jan
8,22147484,22450914,4,2,39,1,jan
9,28851877,33004731,4,2,39,2,jan
10,17379723,17492490,4,2,39,2,jan


http://www.dlt.ri.gov/lmi/laus/us/usunadj.htm

The page linked above lists the non-seasonally adjusted unemployment rates; they serve as the reference for the figures computed below.

In [19]:
# Render every float in the displayed output with five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)

In [21]:
# Split the respondents by the `emp` status code: rows coded 1 are treated as
# employed and rows coded 2 as unemployed; all other codes are left out.
emp = raw_df[raw_df['emp'].eq(1)]
unemp = raw_df[raw_df['emp'].eq(2)]

In [22]:
# Sum the `weight_cf` weights per month for each group.
tot_emp = emp.groupby('month')['weight_cf'].sum()
tot_unemp = unemp.groupby('month')['weight_cf'].sum()
# Unemployment rate in percent: unemployed / (employed + unemployed), per month.
tot_unemp * 100 / (tot_emp + tot_unemp)

month
apr   14.44144
feb    3.78583
jan    3.97775
mar    4.53429
may   12.98540
Name: weight_cf, dtype: float64

In [23]:
# Undo the 10,000x weight scaling and express the monthly employed totals in
# thousands (10,000 * 1,000 = 10,000,000). Author's note: the result matches
# the data from DLT exactly.
print(tot_emp / 10_000_000)

month
apr   133325.80812
feb   158017.40388
jan   156993.73231
mar   155167.19227
may   137461.04772
Name: weight_cf, dtype: float64
