In [18]:
import numpy as np
import pandas as pd

# Null values check

In [19]:
def spot_nulls(df):
    if (df.isna().sum() > 0).any():
        print('Null values present in the following columns:')
        return df.columns[df.isnull().any()].tolist() # returns the column names that have null vals

In [20]:
mgra13 = pd.read_csv('data/mgra13_based_input2019_01.csv')
spot_nulls(mgra13)

In [21]:
mgra15 = pd.read_csv('data/mgra15_based_input2019.csv')
spot_nulls(mgra15)

Null values present in the following columns:


['totint',
 'duden',
 'empden',
 'popden',
 'retempden',
 'totintbin',
 'empdenbin',
 'dudenbin',
 'parkactive',
 'openspaceparkpreserve',
 'beachactive',
 'hotelroomtotal',
 'milestocoast',
 'acres',
 'effective_acres',
 'land_acres']

But, none of the cols are important so not flagging

# Internal Consistency Check

##### Checking if values in mgra equal to values in luz, jurisdiction, and region

In [30]:
def compare_totals(mgra_df, luz_df, jur_df, reg_df):

    #some data formatting
    mgra_tots = mgra_df.drop(['mgra', 'year', 'luz_id'], axis=1).sum()
    luz_tots = luz_df.drop('luz_id', axis=1).sum()
    jur_tots = jur_df.drop(['jurisdiction', 'year', 'luz_id'], axis=1).sum()
    reg_tots = reg_df.drop(['year', 'luz_id'], axis=1).sum()
    

    # comparing mgra and luz
    try:
        if (~np.isclose(mgra_tots, luz_tots)).any(): #if values in any cols between the two df differ
            print(f'columns between mgra and luz where values differ: {mgra_tots[~np.isclose(mgra_tots, luz_tots)].index}')    
    except ValueError:
        print(f'Columns between mgra and luz do not match')    

    
    # comparing mgra and jurisdiction
    try:
        if (~np.isclose(mgra_tots, jur_tots)).any(): #if values in any cols between the two df differ
            print(f'columns between mgra and jurisdiction where values differ: {mgra_tots[~np.isclose(mgra_tots, jur_tots)].index}')        
    except ValueError:
        print(f'Columns between mgra and jurisdiction do not match') 


    # comparing mgra and region
    try:
        if (~np.isclose(mgra_tots, reg_tots)).any(): #if values in any cols between the two df differ
            print(f'columns between mgra and region where values differ: {mgra_tots[~np.isclose(mgra_tots, reg_tots)].index}')          
    except ValueError:
        print(f'Columns between mgra and region do not match')

MGRA13 check

In [23]:
mgra13_mgra = pd.read_csv('data/mgra13_update_mgra_ind_QA.csv')
mgra13_mgra.head()

Unnamed: 0,mgra,year,taz,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
0,1,2019,3331,19,19,0,0,18,18,0,...,0,7,2,16,0,0,3,23,25,26
1,2,2019,3331,35,35,0,0,34,34,0,...,0,0,0,14,1,0,7,54,38,38
2,3,2019,3358,52,52,0,0,52,52,0,...,1,4,0,30,0,0,3,93,64,67
3,4,2019,3358,30,30,0,0,30,30,0,...,0,1,0,16,0,0,0,50,34,33
4,5,2019,3358,28,28,0,0,28,28,0,...,0,0,3,17,0,0,1,47,32,36


In [24]:
mgra13_luz = pd.read_csv('data/mgra13_update_luz_ind_QA.csv')
mgra13_luz.head()

Unnamed: 0,luz_id,taz,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,hh_mh,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
0,1,176334,10272,3707,5462,1103,8461,3153,4325,983,...,166,968,533,7524,42,89,612,12363,11162,11135
1,2,120961,11144,6448,4626,70,10639,6130,4441,68,...,118,1753,1396,13842,59,163,1024,12775,15696,15434
2,3,79319,21119,13581,5596,1942,19905,12757,5317,1831,...,233,4662,3864,22883,110,314,2132,24585,29872,28911
3,4,57118,11048,9936,1112,0,10564,9494,1070,0,...,146,2784,1155,9484,73,163,1115,16242,15751,15411
4,5,48787,7147,4992,2155,0,6969,4874,2095,0,...,120,1362,546,7419,34,106,692,9669,10028,9920


In [25]:
mgra13_reg = pd.read_csv('data/mgra13_update_region_ind_QA.csv')
mgra13_reg

Unnamed: 0,year,taz,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,hh_mh,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
0,2019,53370775,1204818,723587,439194,42037,1153376,694690,418569,40117,...,15076,353911,158292,1135321,7451,14609,111360,1537299,1652200,1681119


In [29]:
compare_totals(mgra13_mgra, mgra13_luz, mgra13_reg)

columns between mgra and luz where values differ: Index(['hhs', 'vacancy_rate'], dtype='object')
columns between mgra and region where values differ: Index(['hhs', 'vacancy_rate'], dtype='object')


Checking vacancy rate:

In [62]:
(mgra13_mgra['vacancy'].sum() - mgra13_mgra['unoccupiable'].sum()) * 100 / mgra13_mgra['units'].sum()

1.4812195700927444

In [65]:
(mgra13_luz['vacancy'].sum() - mgra13_luz['unoccupiable'].sum()) * 100 / mgra13_luz['units'].sum()

1.4812195700927444

In [67]:
(mgra13_reg['vacancy'].sum() - mgra13_reg['unoccupiable'].sum()) * 100 / mgra13_reg['units'].sum()

1.4812195700927444

Checking the household size descrepancy:

In [68]:
mgra13_mgra['hhp'].sum() / mgra13_mgra['hh'].sum()

2.800927884748772

In [69]:
mgra13_luz['hhp'].sum() / mgra13_luz['hh'].sum()

2.800927884748772

In [70]:
mgra13_reg['hhp'].sum() / mgra13_reg['hh'].sum()

2.800927884748772

MGRA 15 check

In [31]:
mgra15_mgra = pd.read_csv('data/mgra15_mgra_ind_QA.csv')
mgra15_mgra.head()

Unnamed: 0,mgra,year,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,hh_mh,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
0,1,2019,176.0,110.0,66.0,0.0,172.0,107.0,65.0,0.0,...,0.0,4.0,2.0,7.0,0.0,1.0,2.0,32.0,26.0,22.0
1,2,2019,56.0,0.0,56.0,0.0,52.0,0.0,52.0,0.0,...,0.0,5.0,3.0,4.0,0.0,0.0,5.0,56.0,39.0,34.0
2,3,2019,200.0,23.0,177.0,0.0,195.0,23.0,172.0,0.0,...,0.0,8.0,0.0,25.0,0.0,0.0,3.0,99.0,72.0,63.0
3,4,2019,4.0,4.0,0.0,0.0,2.0,2.0,0.0,0.0,...,0.0,4.0,0.0,15.0,1.0,1.0,0.0,55.0,34.0,42.0
4,5,2019,43.0,43.0,0.0,0.0,39.0,39.0,0.0,0.0,...,0.0,0.0,0.0,26.0,0.0,0.0,1.0,46.0,29.0,44.0


In [32]:
mgra15_luz = pd.read_csv('data/mgra15_luz_ind_QA.csv')
mgra15_luz.head()

Unnamed: 0,luz_id,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,hh_mh,i1,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
0,1,10272.0,3707.0,5462.0,1103.0,8461.0,3153.0,4325.0,983.0,987.0,...,215.0,3755.0,2461.0,14843.0,82.0,185.0,1600.0,20029.0,21655.0,21515.0
1,2,11144.0,6448.0,4626.0,70.0,10638.0,6129.0,4441.0,68.0,1068.0,...,75.0,1812.0,671.0,6165.0,43.0,74.0,545.0,9672.0,9677.0,9380.0
2,3,21119.0,13581.0,5596.0,1942.0,19906.0,12757.0,5318.0,1831.0,1577.0,...,81.0,488.0,197.0,3364.0,15.0,34.0,214.0,4983.0,4771.0,4605.0
3,4,11048.0,9937.0,1111.0,0.0,10566.0,9496.0,1070.0,0.0,787.0,...,126.0,3223.0,705.0,7502.0,59.0,112.0,911.0,15060.0,13951.0,13747.0
4,5,7147.0,4992.0,2155.0,0.0,6969.0,4874.0,2095.0,0.0,585.0,...,9.0,145.0,30.0,787.0,4.0,12.0,62.0,1916.0,1491.0,1474.0


In [33]:
mgra15_jur = pd.read_csv('data/mgra15_jur_ind_QA.csv')
mgra15_jur.head()

Unnamed: 0,jurisdiction,year,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,hh_mh,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
0,Carlsbad,2019,25582.0,17043.0,7064.0,1475.0,24620.0,16344.0,6881.0,1395.0,...,506.0,9821.0,1433.0,21655.0,307.0,516.0,3749.0,88172.0,63946.0,62213.0
1,Chula Vista,2019,58269.0,35639.0,21353.0,1277.0,56122.0,34534.0,20358.0,1230.0,...,916.0,35002.0,12390.0,168624.0,420.0,1092.0,8344.0,51681.0,140262.0,138207.0
2,Coronado,2019,13263.0,6564.0,6340.0,359.0,12884.0,6357.0,6171.0,356.0,...,184.0,1052.0,1398.0,4372.0,38.0,102.0,713.0,16922.0,10378.0,14403.0
3,Del Mar,2019,5890.0,4194.0,1696.0,0.0,5686.0,4067.0,1619.0,0.0,...,18.0,278.0,36.0,792.0,18.0,22.0,191.0,4910.0,3205.0,3060.0
4,El Cajon,2019,37200.0,19393.0,16643.0,1164.0,35556.0,18665.0,15755.0,1136.0,...,543.0,5334.0,6916.0,36957.0,281.0,623.0,4556.0,70802.0,63727.0,62285.0


In [34]:
mgra15_reg = pd.read_csv('data/mgra15_region_ind_QA.csv')
mgra15_reg

Unnamed: 0,year,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,hh_mh,i1,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
0,2019,1204471.0,723514.0,438921.0,42036.0,1153032.0,694623.0,418294.0,40115.0,102683.0,...,15092.0,354704.0,158563.0,1137700.0,7475.0,14621.0,111553.0,1540604.0,1655636.0,1684676.0


In [35]:
compare_totals(mgra15_mgra, mgra15_luz, mgra15_jur, mgra15_reg)

columns between mgra and luz where values differ: Index(['hhs', 'vacancy_rate'], dtype='object')
columns between mgra and jurisdiction where values differ: Index(['hs', 'hs_sf', 'hs_mf', 'hs_mh', 'hh', 'hh_sf', 'hh_mf', 'hh_mh', 'i1',
       'i2',
       ...
       'American Indian', 'Asian', 'Black', 'Hispanic', 'Other',
       'Pacific Islander', 'Two or More', 'White', 'Female', 'Male'],
      dtype='object', length=113)
columns between mgra and region where values differ: Index(['hhs', 'vacancy_rate'], dtype='object')


Checking vacany rate:

In [71]:
(mgra15_mgra['vacancy'].sum() - mgra15_mgra['unoccupiable'].sum()) * 100 / mgra15_mgra['units'].sum()

0.9916850511861542

In [72]:
(mgra15_luz['vacancy'].sum() - mgra15_luz['unoccupiable'].sum()) * 100 / mgra15_luz['units'].sum()

0.9916850511861542

In [73]:
(mgra15_jur['vacancy'].sum() - mgra15_reg['unoccupiable'].sum()) * 100 / mgra15_jur['units'].sum()

1.1868046007181565

In [74]:
(mgra15_reg['vacancy'].sum() - mgra15_reg['unoccupiable'].sum()) * 100 / mgra15_reg['units'].sum()

0.9916850511861542

In [40]:
mgra15_reg['vacancy_rate'] # let Calvin know

0   NaN
Name: vacancy_rate, dtype: float64

Checking the household size descrepancy:

In [75]:
mgra15_mgra['hhp'].sum() / mgra15_mgra['hh'].sum()

2.801763524342776

In [76]:
mgra15_luz['hhp'].sum() / mgra15_luz['hh'].sum()

2.801763524342776

In [79]:
mgra15_jur['hhp'].sum() / mgra15_jur['hh'].sum()

2.8172200154171434

In [78]:
mgra15_reg['hhp'].sum() / mgra15_reg['hh'].sum()

2.801763524342776

#### Checking vals between input file and mgra ind_QA file

For MGRA13:

In [45]:
not_in_input_file = list(set(mgra13_mgra.columns) - set(mgra13.columns))
len(not_in_input_file)

37

In [46]:
exists_in_both = set(mgra13).intersection(mgra13_mgra)
len(exists_in_both)

104

In [47]:
mgra13_mgra_filtered = mgra13_mgra.filter(exists_in_both)
mgra13_mgra_filtered.head()

Unnamed: 0,acres,i5,emp_trans,dudenbin,emp_pvt_hh,emp_mfg_prod,upscaleroom,budgetroom,duden,emp_state_local_gov_white,...,hs_mh,emp_cap_accts,i9,othercollegeenroll,mparkcost,emp_const_bldg_prod,emp_retail,emp_const_bldg_office,i6,hs_sf
0,16.615444,1,0,0,0,0,0,0,0,0,...,0,0,3,0,0,0,0,0,5,19
1,19.519185,3,0,0,0,0,0,0,0,0,...,0,0,6,0,0,0,0,0,2,35
2,27.845124,2,0,0,0,0,0,0,0,0,...,0,0,4,0,1,0,0,0,9,52
3,7.976178,0,0,0,0,0,0,0,0,0,...,0,0,9,0,1,0,0,0,2,30
4,7.072502,1,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,4,28


In [48]:
mgra13_mgra_filtered_sum = mgra13_mgra_filtered.sum()
mgra13_mgra_filtered_sum

acres                    2.727204e+06
i5                       1.114470e+05
emp_trans                4.221900e+04
dudenbin                 0.000000e+00
emp_pvt_hh               0.000000e+00
                             ...     
emp_const_bldg_prod      6.211700e+04
emp_retail               1.582460e+05
emp_const_bldg_office    2.105500e+04
i6                       1.601150e+05
hs_sf                    7.235870e+05
Length: 104, dtype: float64

In [49]:
mgra13_sum = mgra13.sum()
mgra13_sum

mgra               2.645575e+08
taz                5.337078e+07
hs                 1.204818e+06
hs_sf              7.235870e+05
hs_mf              4.391940e+05
                       ...     
district27         2.211000e+05
milestocoast       2.453073e+05
acres              2.727204e+06
effective_acres    1.251249e+06
land_acres         2.698589e+06
Length: 104, dtype: float64

In [50]:
for col in exists_in_both:
    if mgra13_sum[col] == mgra13_mgra_filtered_sum[col]:
        continue
    else:
        print(col) 

For MGRA15:

In [51]:
mgra15.head()

Unnamed: 0,mgra,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,hh_mh,i1,...,midpriceroom,upscaleroom,hotelroomtotal,luz_id,truckregiontype,district27,milestocoast,acres,effective_acres,land_acres
0,1,176,110,66,0,172,107,65,0,29,...,0,0,,87,1,27,,,,
1,2,56,0,56,0,52,0,52,0,9,...,0,0,,103,1,27,,,,
2,3,200,23,177,0,195,23,172,0,14,...,0,0,,142,1,27,,,,
3,4,4,4,0,0,2,2,0,0,2,...,0,0,,221,1,27,,,,
4,5,43,43,0,0,39,39,0,0,1,...,0,0,,221,1,27,,,,


In [52]:
mgra15_mgra.head()

Unnamed: 0,mgra,year,hs,hs_sf,hs_mf,hs_mh,hh,hh_sf,hh_mf,hh_mh,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
0,1,2019,176.0,110.0,66.0,0.0,172.0,107.0,65.0,0.0,...,0.0,4.0,2.0,7.0,0.0,1.0,2.0,32.0,26.0,22.0
1,2,2019,56.0,0.0,56.0,0.0,52.0,0.0,52.0,0.0,...,0.0,5.0,3.0,4.0,0.0,0.0,5.0,56.0,39.0,34.0
2,3,2019,200.0,23.0,177.0,0.0,195.0,23.0,172.0,0.0,...,0.0,8.0,0.0,25.0,0.0,0.0,3.0,99.0,72.0,63.0
3,4,2019,4.0,4.0,0.0,0.0,2.0,2.0,0.0,0.0,...,0.0,4.0,0.0,15.0,1.0,1.0,0.0,55.0,34.0,42.0
4,5,2019,43.0,43.0,0.0,0.0,39.0,39.0,0.0,0.0,...,0.0,0.0,0.0,26.0,0.0,0.0,1.0,46.0,29.0,44.0


In [53]:
not_in_input_file = list(set(mgra15_mgra.columns) - set(mgra15.columns))
len(not_in_input_file)

37

In [54]:
exists_in_both = set(mgra15).intersection(mgra15_mgra)
len(exists_in_both)

104

In [56]:
mgra15_mgra_filtered = mgra15_mgra.filter(exists_in_both)
mgra15_mgra_filtered.head()

Unnamed: 0,acres,i5,emp_trans,dudenbin,emp_pvt_hh,emp_mfg_prod,upscaleroom,budgetroom,duden,emp_state_local_gov_white,...,hs_mh,emp_cap_accts,i9,othercollegeenroll,mparkcost,emp_const_bldg_prod,emp_retail,emp_const_bldg_office,i6,hs_sf
0,0.0,21.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,21.0,110.0
1,0.0,7.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
2,0.0,25.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,23.0,23.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,4.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,6.0,43.0


In [57]:
mgra15_mgra_filtered_sum = mgra15_mgra_filtered.sum()
mgra15_mgra_filtered_sum

acres                         0.0
i5                       111444.0
emp_trans                 42193.0
dudenbin                      0.0
emp_pvt_hh                    0.0
                           ...   
emp_const_bldg_prod       62091.0
emp_retail               158284.0
emp_const_bldg_office     21041.0
i6                       160048.0
hs_sf                    723514.0
Length: 104, dtype: float64

In [58]:
mgra15_sum = mgra15.sum()
mgra15_sum

mgra               295767681.0
hs                   1204471.0
hs_sf                 723514.0
hs_mf                 438921.0
hs_mh                  42036.0
                      ...     
district27            222419.0
milestocoast               0.0
acres                      0.0
effective_acres            0.0
land_acres                 0.0
Length: 104, dtype: float64

In [60]:
for col in exists_in_both:
    if mgra15_sum[col] == mgra15_mgra_filtered_sum[col]:
        continue
    else:
        print(col) 

Both files have same values

# Summary of findings that need to be flagged:

- hhs differs for jur for MGRA15
- vacancy rate differs for jur for MGRA15
- jur vals don't match up with mgra for MGRA15
    - Not sure about MGRA13 because there isn't a jur file for it yet