# Population Analysis Script

## Libraries

In [1]:
import pandas as pd
import numpy as np 
import pyodbc

## Download the households

In [2]:
households = pd.read_csv('C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/households_2019_01.csv')

In [3]:
persons = pd.read_csv('C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/persons_2019_01.csv')

In [4]:
mgra_ind = pd.read_csv('C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/mgra_ind.csv')

In [5]:
cpa_ind = pd.read_csv('C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/cpa_ind.csv')

In [6]:
jur_ind = pd.read_csv('C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/jur_ind.csv')

In [7]:
region_ind = pd.read_csv('C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/region_ind.csv')

# Grace QC Request

## Comparing DS ID 44 and 43
Here I will be using data from the demographic warehouse to compare total values between DSIDs. Looking at the region level. 

### Household Type 1 (Households)

In [6]:
# DSID 43
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT [yr_id]
      ,[population]
  FROM [demographic_warehouse].[fact].[population]
  Where datasource_id = 43 AND housing_type_id = 1'''


type_1_df_43 = pd.read_sql_query(qry, conn)
type_1_df_43 = type_1_df_43.groupby('yr_id').sum()
type_1_df_43

Unnamed: 0_level_0,population
yr_id,Unnamed: 1_level_1
2010,2993348
2011,3023755
2012,3058317
2013,3098470
2014,3127530
2015,3156961
2016,3177681
2017,3200591
2018,3215544
2019,3230515


In [7]:
# DSID 44
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT [yr_id]
      ,[population]
  FROM [demographic_warehouse].[fact].[population]
  Where datasource_id = 44 AND housing_type_id = 1'''


type_1_df_44 = pd.read_sql_query(qry, conn)
type_1_df_44 = type_1_df_44.groupby('yr_id').sum()
type_1_df_44

Unnamed: 0_level_0,population
yr_id,Unnamed: 1_level_1
2010,2993348
2011,3023755
2012,3058317
2013,3098470
2014,3127530
2015,3156961
2016,3177681
2017,3200591
2018,3215545
2019,3230523


In [10]:
# Checking differences
population_1_cleaned = type_1_df_43.merge(type_1_df_44, how='left', left_index=True, right_index=True, suffixes=['_43', '_44'])
population_1_cleaned["Diff"] = population_1_cleaned['population_43'] - population_1_cleaned['population_44']
population_1_cleaned[population_1_cleaned["Diff"] != 0]

Unnamed: 0_level_0,population_43,population_44,Diff
yr_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,3215544,3215545,-1
2019,3230515,3230523,-8
2020,3230936,3230945,-9


### Household type 2,3,4 (Group Quarter)

In [12]:
# DSID 43
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT [yr_id]
      ,[population]
  FROM [demographic_warehouse].[fact].[population]
  Where datasource_id = 43 AND housing_type_id IN (2,3,4)'''


gq_df_43 = pd.read_sql_query(qry, conn)
gq_df_43 = gq_df_43.groupby('yr_id').sum()
gq_df_43

Unnamed: 0_level_0,population
yr_id,Unnamed: 1_level_1
2010,101966
2011,101900
2012,103492
2013,101431
2014,105233
2015,108078
2016,107470
2017,106299
2018,110773
2019,109789


In [13]:
# DSID 43
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT [yr_id]
      ,[population]
  FROM [demographic_warehouse].[fact].[population]
  Where datasource_id = 44 AND housing_type_id IN (2,3,4)'''


gq_df_44 = pd.read_sql_query(qry, conn)
gq_df_44 = gq_df_44.groupby('yr_id').sum()
gq_df_44

Unnamed: 0_level_0,population
yr_id,Unnamed: 1_level_1
2010,101966
2011,101900
2012,103492
2013,101431
2014,105233
2015,108078
2016,107470
2017,106299
2018,110773
2019,109789


In [15]:
# Checking differences
gq_cleaned = gq_df_43.merge(gq_df_44, how='left', left_index=True, right_index=True, suffixes=['_43', '_44'])
gq_cleaned["Diff"] = gq_cleaned['population_43'] - gq_cleaned['population_44']
gq_cleaned[gq_cleaned["Diff"] != 0]

Unnamed: 0_level_0,population_43,population_44,Diff
yr_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


### Part 3: Check 1
Compare REGION level population by age group between [estimates ].[est_2020_06]. [dw_age] and [estimates ].[est_2020_05]. [dw_age] and [demographic_warehouse].[fact].[age] (WHERE DS ID=44)

In [None]:
# DSID 43
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT [yr_id]
      ,[population]
  FROM [demographic_warehouse].[fact].[population]
  Where datasource_id = 43 AND housing_type_id = 1'''


type_1_df_43 = pd.read_sql_query(qry, conn)
type_1_df_43 = type_1_df_43.groupby('yr_id').sum()
type_1_df_43

### Age

In [44]:
# DSID 43
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT
      [yr_id]
      ,SUM([population]) AS 'Population'
      ,C.name
  FROM [demographic_warehouse].[fact].[age] AS A
LEFT JOIN 
	(SELECT [age_group_id]
      ,[name]
  FROM [demographic_warehouse].[dim].[age_group]) C
  ON (A.age_group_id = C.age_group_id)
  WHERE A.datasource_id = 43
  GROUP BY A.yr_id, C.name'''


age_groups_43 = pd.read_sql_query(qry, conn)
age_groups_43

Unnamed: 0,yr_id,Population,name
0,2011,209095,35 to 39
1,2014,210707,35 to 39
2,2017,224897,35 to 39
3,2020,232244,35 to 39
4,2010,128000,15 to 17
...,...,...,...
215,2014,216798,5 to 9
216,2017,228181,5 to 9
217,2010,180305,55 to 59
218,2012,66248,60 and 61


In [45]:
# DSID 44
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT
      [yr_id]
      ,SUM([population]) AS 'Population'
      ,C.name
  FROM [demographic_warehouse].[fact].[age] AS A
LEFT JOIN 
	(SELECT [age_group_id]
      ,[name]
  FROM [demographic_warehouse].[dim].[age_group]) C
  ON (A.age_group_id = C.age_group_id)
  WHERE A.datasource_id = 44
  GROUP BY A.yr_id, C.name'''


age_groups_44 = pd.read_sql_query(qry, conn)
age_groups_44

Unnamed: 0,yr_id,Population,name
0,2011,64763,75 to 79
1,2014,69690,75 to 79
2,2017,78080,75 to 79
3,2010,128000,15 to 17
4,2013,125993,65 to 69
...,...,...,...
215,2010,77313,70 to 74
216,2013,88669,70 to 74
217,2012,65707,75 to 79
218,2015,72596,75 to 79


In [53]:
# Checking differences
age_cleaned = age_groups_43.merge(age_groups_44, on=['yr_id', 'name'], how='left', suffixes=['_43', '_44'])
age_cleaned = age_cleaned.groupby(['yr_id', 'name']).sum()
age_cleaned["Diff"] = abs(age_cleaned['Population_43'] - age_cleaned['Population_44'])
age_cleaned[age_cleaned["Diff"] > 0]

In [57]:
age_cleaned[age_cleaned["Diff"] > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Population_43,Population_44,Diff
yr_id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,55 to 59,210361,210362,1
2019,10 to 14,220579,220580,1
2019,15 to 17,132501,132502,1
2019,20 to 24,263896,263897,1
2019,30 to 34,216584,216586,2
2019,35 to 39,233120,233121,1
2019,5 to 9,230599,230600,1
2019,55 to 59,210307,210308,1
2020,10 to 14,223916,223917,1
2020,15 to 17,133978,133979,1


In [55]:
age_cleaned["Diff"] = abs(age_cleaned['Population_43'] - age_cleaned['Population_44'])

### Ethnicity

In [58]:
# DSID 43
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT
      [yr_id]
      ,SUM([population]) AS 'Population'
      ,C.short_name AS name
  FROM [demographic_warehouse].[fact].[ethnicity] AS A
LEFT JOIN 
	(SELECT [ethnicity_id]
      ,[short_name]
  FROM [demographic_warehouse].[dim].[ethnicity]) C
  ON (A.ethnicity_id = C.ethnicity_id)
  WHERE A.datasource_id = 43
  GROUP BY A.yr_id, C.short_name'''


eth_groups_43 = pd.read_sql_query(qry, conn)
eth_groups_43

Unnamed: 0,yr_id,Population,name
0,2018,15018,American Indian
1,2013,1048016,Hispanic
2,2016,1100161,Hispanic
3,2019,1137700,Hispanic
4,2020,112958,Two or More
...,...,...,...
83,2012,149893,Black
84,2015,154818,Black
85,2014,153279,Black
86,2017,156562,Black


In [59]:
# DSID 44
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT
      [yr_id]
      ,SUM([population]) AS 'Population'
      ,C.short_name AS name
  FROM [demographic_warehouse].[fact].[ethnicity] AS A
LEFT JOIN 
	(SELECT [ethnicity_id]
      ,[short_name]
  FROM [demographic_warehouse].[dim].[ethnicity]) C
  ON (A.ethnicity_id = C.ethnicity_id)
  WHERE A.datasource_id = 44
  GROUP BY A.yr_id, C.short_name'''


eth_groups_44 = pd.read_sql_query(qry, conn)
eth_groups_44

Unnamed: 0,yr_id,Population,name
0,2014,14716,American Indian
1,2017,14972,American Indian
2,2020,15160,American Indian
3,2020,112918,Two or More
4,2012,149893,Black
...,...,...,...
83,2018,7439,Other
84,2010,1500048,White
85,2011,1007319,Hispanic
86,2014,1067176,Hispanic


In [60]:
# Checking differences
ethnicity_cleaned = eth_groups_43.merge(eth_groups_44, on=['yr_id', 'name'], how='left', suffixes=['_43', '_44'])
ethnicity_cleaned = ethnicity_cleaned.groupby(['yr_id', 'name']).sum()
ethnicity_cleaned["Diff"] = abs(ethnicity_cleaned['Population_43'] - ethnicity_cleaned['Population_44'])
ethnicity_cleaned[ethnicity_cleaned["Diff"] > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Population_43,Population_44,Diff
yr_id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,American Indian,14967,14972,5
2017,Asian,351307,351299,8
2017,Black,156562,156571,9
2017,Other,7382,7371,11
2017,Two or More,108295,108311,16
2017,White,1539218,1539207,11
2018,American Indian,15018,15027,9
2018,Asian,353447,353430,17
2018,Black,157571,157574,3
2018,Other,7461,7439,22


### Household Income

In [61]:
# DSID 43
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT [yr_id]
      ,SUM([households]) AS 'Number of Households'
	  ,C.name
  FROM [demographic_warehouse].[fact].[household_income] A
  LEFT JOIN 
	(SELECT [income_group_id]
      ,[name]
  FROM [demographic_warehouse].[dim].[income_group]) C
  ON (A.[income_group_id] = C.[income_group_id])
  WHERE A.datasource_id = 43
  GROUP BY A.yr_id, C.name'''


HHI_groups_43 = pd.read_sql_query(qry, conn)
HHI_groups_43

Unnamed: 0,yr_id,Number of Households,name
0,2015,126271,"$45,000 to $59,999"
1,2017,128844,"$45,000 to $59,999"
2,2013,94698,"$100,000 to $124,999"
3,2012,134432,"Less than $15,000"
4,2019,128470,"$45,000 to $59,999"
...,...,...,...
105,2016,110768,"$60,000 to $74,999"
106,2016,115619,"Less than $15,000"
107,2015,104614,"$100,000 to $124,999"
108,2018,103619,"$200,000 or more"


In [62]:
# DSID 43
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT [yr_id]
      ,SUM([households]) AS 'Number of Households'
	  ,C.name
  FROM [demographic_warehouse].[fact].[household_income] A
  LEFT JOIN 
	(SELECT [income_group_id]
      ,[name]
  FROM [demographic_warehouse].[dim].[income_group]) C
  ON (A.[income_group_id] = C.[income_group_id])
  WHERE A.datasource_id = 44
  GROUP BY A.yr_id, C.name'''


HHI_groups_44 = pd.read_sql_query(qry, conn)
HHI_groups_44

Unnamed: 0,yr_id,Number of Households,name
0,2020,125023,"$45,000 to $59,999"
1,2020,122823,"$100,000 to $124,999"
2,2017,66105,"$125,000 to $149,999"
3,2014,99016,"$100,000 to $124,999"
4,2016,133206,"$45,000 to $59,999"
...,...,...,...
105,2016,144916,"$30,000 to $44,999"
106,2011,117657,"Less than $15,000"
107,2011,63130,"$125,000 to $149,999"
108,2012,60916,"$200,000 or more"


In [63]:
# Checking differences
HHI_cleaned = HHI_groups_43.merge(HHI_groups_44, on=['yr_id', 'name'], how='left', suffixes=['_43', '_44'])
HHI_cleaned = HHI_cleaned.groupby(['yr_id', 'name']).sum()
HHI_cleaned["Diff"] = abs(HHI_cleaned['Number of Households_43'] - HHI_cleaned['Number of Households_44'])
HHI_cleaned[HHI_cleaned["Diff"] > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Households_43,Number of Households_44,Diff
yr_id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,"$15,000 to $29,999",149058,149059,1
2010,"$30,000 to $44,999",153012,153011,1
2011,"$15,000 to $29,999",149511,149510,1
2011,"$150,000 to $199,999",64156,64157,1
2011,"$200,000 or more",62051,62050,1
2011,"$75,000 to $99,999",142888,142889,1
2012,"$100,000 to $124,999",97463,97464,1
2012,"$125,000 to $149,999",59503,59502,1
2014,"$200,000 or more",72728,72729,1
2014,"$45,000 to $59,999",130557,130555,2


# Checking Totals
1. MGRA/Jurisdiction Comparison 
2. MGRA/Region Comparison 
3. Jurisdiction/Region Comparison 

In [8]:
# MGRA/Jurisdiction Comparison 
# I will have to use MGRA_ID, CSV doesn't have MGRA ID 
# PSI will check: this data has mgra ID information in the demographic warehouse database 
mgra_sums = pd.DataFrame(mgra_ind.drop(['mgra', 'year', 'taz'], axis=1).sum(), columns = ['mgra_csv'])
jur_sums = pd.DataFrame(jur_ind.drop(['jurisdiction', 'year', 'taz'], axis=1).sum(), columns = ['jur_csv'])
mgra_jur_sums_comparison = mgra_sums.merge(jur_sums, how='left', left_index=True, right_index=True)
mgra_jur_sums_comparison['Diff'] = abs(mgra_jur_sums_comparison['mgra_csv'] - mgra_jur_sums_comparison['jur_csv'])
mgra_jur_sums_comparison[mgra_jur_sums_comparison['Diff']> .001]

Unnamed: 0,mgra_csv,jur_csv,Diff
hs,1204818.0,1310578.0,105760.0
hs_Single_Family,723587.0,804228.0,80641.0
hs_Multiple_Family,439194.0,458514.0,19320.0
hs_Mobile_Homes,42037.0,47836.0,5799.0
Household Population (hh),1153376.0,1255542.0,102166.0
...,...,...,...
Pacific Islander,14626.0,16173.0,1547.0
Two or More,111558.0,122889.0,11331.0
White,1540559.0,1696056.0,155497.0
Female,1655631.0,1813807.0,158176.0


In [9]:
# MGRA/Region Comparison 
mgra_sums = pd.DataFrame(mgra_ind.drop(['mgra', 'year', 'taz'], axis=1).sum(), columns = ['mgra_csv'])
region_sum = region_ind.drop(['year', 'taz'], axis=1).T
region_sum.columns = ['reg_csv']
mgra_reg_sums_comparison = mgra_sums.merge(region_sum, how='left', left_index=True, right_index=True)
mgra_reg_sums_comparison['Diff'] = abs(mgra_reg_sums_comparison['mgra_csv'] - mgra_reg_sums_comparison['reg_csv'])
mgra_reg_sums_comparison[mgra_reg_sums_comparison['Diff'] > 0.001] #Few were much smaller than .001 counting these as being practicly zero

Unnamed: 0,mgra_csv,reg_csv,Diff


In [10]:
# Jurisdiction/Region Comparison 
jur_sums = pd.DataFrame(jur_ind.drop(['jurisdiction', 'year', 'taz'], axis=1).sum(), columns = ['jur_csv'])
region_sums = region_ind.drop(['year', 'taz'], axis=1).T
region_sums.columns = ['reg_csv']
jur_reg_sums_comparison = jur_sums.merge(region_sums, how='left', left_index=True, right_index=True)
jur_reg_sums_comparison['Diff'] = abs(jur_reg_sums_comparison['jur_csv'] - jur_reg_sums_comparison['reg_csv'])
jur_reg_sums_comparison[jur_reg_sums_comparison['Diff'] > 0.001] #Few were much smaller than .001 counting these as being practicly zero

Unnamed: 0,jur_csv,reg_csv,Diff
hs,1310578.0,1204818.0,105760.0
hs_Single_Family,804228.0,723587.0,80641.0
hs_Multiple_Family,458514.0,439194.0,19320.0
hs_Mobile_Homes,47836.0,42037.0,5799.0
Household Population (hh),1255542.0,1153376.0,102166.0
...,...,...,...
Pacific Islander,16173.0,14626.0,1547.0
Two or More,122889.0,111558.0,11331.0
White,1696056.0,1540559.0,155497.0
Female,1813807.0,1655631.0,158176.0


# Check for Nulls
Check for null values in the columns that are in the test plan. 

In [11]:
null_check = pd.DataFrame(mgra_ind.isna().sum(), columns=['Number of Null Values'])
null_check[null_check['Number of Null Values']>0]

Unnamed: 0,Number of Null Values


# Household Income Category Totals Equals Household Totals 
Confirm that the total number of households for all the income catgeories is equal to the number of households HH

In [12]:
# Subsetting for the income groups I am interested in 
mgra_income_df = mgra_ind[['Household Population (hh)', 'Less than $15,000', '$15,000 to $29,999','$30,000 to $44,999','$45,000 to $59,999','$60,000 to $74,999','$75,000 to $99,999','$100,000 to $124,999','$125,000 to $149,999','$150,000 to $199,999','$200,000 or more']]

# Sum up the values
mgra_income_df['Category Total'] = mgra_income_df['Less than $15,000'] + mgra_income_df['$15,000 to $29,999'] + mgra_income_df['$30,000 to $44,999'] + mgra_income_df['$45,000 to $59,999'] + mgra_income_df['$60,000 to $74,999'] + mgra_income_df['$75,000 to $99,999'] + mgra_income_df['$100,000 to $124,999'] + mgra_income_df['$125,000 to $149,999'] + mgra_income_df['$150,000 to $199,999'] + mgra_income_df['$200,000 or more']

mgra_income_df['Flag'] = mgra_income_df['Household Population (hh)'] == mgra_income_df['Category Total']

# Checking to see where there are differences 
mgra_income_df[mgra_income_df['Flag'] == False]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mgra_income_df['Category Total'] = mgra_income_df['Less than $15,000'] + mgra_income_df['$15,000 to $29,999'] + mgra_income_df['$30,000 to $44,999'] + mgra_income_df['$45,000 to $59,999'] + mgra_income_df['$60,000 to $74,999'] + mgra_income_df['$75,000 to $99,999'] + mgra_income_df['$100,000 to $124,999'] + mgra_income_df['$125,000 to $149,999'] + mgra_income_df['$150,000 to $199,999'] + mgra_income_df['$200,000 or more']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mgra_income_df['Flag'] = mgra_income_df['Hou

Unnamed: 0,Household Population (hh),"Less than $15,000","$15,000 to $29,999","$30,000 to $44,999","$45,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","$100,000 to $124,999","$125,000 to $149,999","$150,000 to $199,999","$200,000 or more",Category Total,Flag


## Count of unique hhids by MGRA 
This is county number of persons 

In [13]:
# This should be identical in persons householdsset 
mgra_count = households[['mgra','hhid']].groupby(['mgra']).count()
mgra_count

Unnamed: 0_level_0,hhid
mgra,Unnamed: 1_level_1
1,18
2,34
3,52
4,30
5,28
...,...
22995,37
22996,103
22998,87
23000,126


# Number of Household Comparison 

In [14]:
# HHID Count appended to MGRA
new_df = mgra_ind.merge(mgra_count, how='left', on='mgra')
new_df

Unnamed: 0,mgra,year,taz,hs,hs_Single_Family,hs_Multiple_Family,hs_Mobile_Homes,Household Population (hh),hh_Single_Family,hh_Multiple_Family,...,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male,hhid
0,1,2019,3331,19,19,0,0,18,18,0,...,4,2,3,0,1,3,31,22,22,18.0
1,2,2019,3331,35,35,0,0,34,34,0,...,2,3,7,0,1,2,63,39,39,34.0
2,3,2019,3358,52,52,0,0,52,52,0,...,4,0,31,0,1,4,99,66,73,52.0
3,4,2019,3358,30,30,0,0,30,30,0,...,0,0,18,0,2,0,50,31,40,30.0
4,5,2019,3358,28,28,0,0,28,28,0,...,5,0,15,0,0,0,51,31,40,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22997,22998,2019,1290,90,90,0,0,87,87,0,...,43,5,70,0,2,1,120,132,109,87.0
22998,22999,2019,1290,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
22999,23000,2019,1290,131,131,0,0,126,126,0,...,28,9,73,0,0,10,194,150,169,126.0
23000,23001,2019,1254,83,83,0,0,81,81,0,...,27,3,49,0,1,7,121,106,102,81.0


In [15]:
household_check_df = new_df[['Household Population (hh)', 'gq_civ', 'Group Quarters - Military (gq_mil)', 'hhid']]

household_check_df['Household Sums'] = household_check_df['Household Population (hh)'] + household_check_df['gq_civ'] + household_check_df['Group Quarters - Military (gq_mil)']

household_check_df['Flag'] = household_check_df['Household Sums'] == household_check_df['hhid']

household_check_df[household_check_df['Flag'] == False]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  household_check_df['Household Sums'] = household_check_df['Household Population (hh)'] + household_check_df['gq_civ'] + household_check_df['Group Quarters - Military (gq_mil)']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  household_check_df['Flag'] = household_check_df['Household Sums'] == household_check_df['hhid']


Unnamed: 0,Household Population (hh),gq_civ,Group Quarters - Military (gq_mil),hhid,Household Sums,Flag
81,0,0,0,,0,False
94,0,0,0,,0,False
112,0,0,0,,0,False
120,34,6,0,34.0,40,False
131,69,68,0,69.0,137,False
...,...,...,...,...,...,...
22977,0,0,0,,0,False
22978,73,5,0,73.0,78,False
22996,0,6,0,,6,False
22998,0,0,0,,0,False


# Work from 7/9/22 with Purva

In [26]:
households_mgra = list(mgra_count.index)

csv_mgras = list(mgra_ind[mgra_ind['Household Population (hh)'] == 0]['mgra'])


all_mgras = list(mgra_ind['mgra'])

# Goal: Find MGRAs that are showing up in the csv as having no 'household population (hh)' but are

In [35]:
no_HHID_mgra = list(set(all_mgras) - set(households_mgra)) # Households that don't have HHIDs
print(len(no_HHID_mgra))


no_HHID_with_HHS = list(set(no_HHID_mgra) - set(csv_mgras))
print(len(no_HHID_with_HHS))
no_HHID_with_HHS

5158
0


[]

In [45]:
mgra_with_no_hhid = list(set(csv_mgras) - set(no_HHID_mgra)) # MGRA household number of zero subtracted by mgras with no ID
mgra_with_no_hhid
#Confirm these don't exists in the households.csv

[6276,
 5897,
 6281,
 19214,
 3344,
 10266,
 2726,
 6055,
 2090,
 2095,
 1072,
 2096,
 22705,
 1076,
 1205,
 22840,
 3005,
 1213,
 22593,
 3017,
 4170,
 4172,
 4173,
 7653]

In [48]:
for mgra in mgra_with_no_hhid:
    print(mgra in households_mgra)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [52]:
output = mgra_ind[mgra_ind.mgra.isin(list(set(csv_mgras) - set(no_HHID_mgra)))]
output

Unnamed: 0,mgra,year,taz,hs,hs_Single_Family,hs_Multiple_Family,hs_Mobile_Homes,Household Population (hh),hh_Single_Family,hh_Multiple_Family,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
1071,1072,2019,3176,0,0,0,0,0,0,0,...,0,115,10,50,2,1,20,245,252,191
1075,1076,2019,3098,0,0,0,0,0,0,0,...,1,110,12,78,0,1,28,348,272,306
1204,1205,2019,3111,0,0,0,0,0,0,0,...,0,30,18,50,0,0,13,181,152,140
1212,1213,2019,3142,0,0,0,0,0,0,0,...,1,249,47,292,7,4,82,953,953,682
2089,2090,2019,4314,0,0,0,0,0,0,0,...,6,34,58,64,0,3,22,258,57,388
2094,2095,2019,4330,0,0,0,0,0,0,0,...,43,273,665,748,15,28,180,2574,536,3990
2095,2096,2019,4341,0,0,0,0,0,0,0,...,20,131,327,371,7,8,87,1303,242,2012
2725,2726,2019,3776,0,0,0,0,0,0,0,...,2,26,42,55,3,1,13,172,35,279
3004,3005,2019,3789,0,0,0,0,0,0,0,...,0,5,8,6,0,0,2,26,4,43
3016,3017,2019,3563,0,0,0,0,0,0,0,...,42,171,328,759,15,23,149,2777,239,4025


In [57]:
nothing = mgra_ind[(mgra_ind['Household Population (hh)'] == 0) & (mgra_ind['gq_civ'] == 0) & (mgra_ind['Group Quarters - Military (gq_mil)'] == 0)]
nothing

Unnamed: 0,mgra,year,taz,hs,hs_Single_Family,hs_Multiple_Family,hs_Mobile_Homes,Household Population (hh),hh_Single_Family,hh_Multiple_Family,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
81,82,2019,3431,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94,95,2019,3371,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112,113,2019,3470,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
157,158,2019,3547,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
163,164,2019,3515,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22967,22968,2019,1212,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22971,22972,2019,1212,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22977,22978,2019,1161,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22998,22999,2019,1290,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
len(list(set(no_HHID_mgra) - set(list(nothing['mgra']))))

266

In [61]:
output = mgra_ind[mgra_ind.mgra.isin(list(set(no_HHID_mgra) - set(list(nothing['mgra']))))]
output

Unnamed: 0,mgra,year,taz,hs,hs_Single_Family,hs_Multiple_Family,hs_Mobile_Homes,Household Population (hh),hh_Single_Family,hh_Multiple_Family,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
192,193,2019,3325,0,0,0,0,0,0,0,...,0,5,8,25,0,0,3,39,41,39
218,219,2019,3325,0,0,0,0,0,0,0,...,0,5,10,28,2,1,1,54,34,67
403,404,2019,3311,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
419,420,2019,3392,0,0,0,0,0,0,0,...,0,1,3,2,0,0,0,1,1,6
506,507,2019,3396,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22884,22885,2019,4358,0,0,0,0,0,0,0,...,1,8,10,29,0,0,0,41,63,26
22906,22907,2019,961,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,3
22914,22915,2019,1002,0,0,0,0,0,0,0,...,0,0,0,2,0,0,1,0,2,1
22925,22926,2019,1040,0,0,0,0,0,0,0,...,0,0,0,8,0,0,0,1,1,8


In [55]:
a = list(mgra_ind.columns)

In [44]:
mgra_ind[mgra_ind.mgra.isin(list(set(csv_mgras) - set(no_HHID_mgra)))].to_csv('C:/Users/cra/OneDrive - San Diego Association of Governments/QA_Repository/2022/2022-47 Base Year Forecast Output QC/output.csv')

In [64]:
new_df.to_csv('C:/Users/cra/OneDrive - San Diego Association of Governments/QA_Repository/2022/2022-47 Base Year Forecast Output QC/new_df.csv')

In [40]:
mgra_ind[mgra_ind['mgra'] == 6276]

Unnamed: 0,mgra,year,taz,hs,hs_Single_Family,hs_Multiple_Family,hs_Mobile_Homes,Household Population (hh),hh_Single_Family,hh_Multiple_Family,...,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White,Female,Male
6275,6276,2019,2206,0,0,0,0,0,0,0,...,55,192,446,775,9,30,145,2857,405,4104


## Count of unique hinccat1 by mgra 
This is count of number of houseold income types 

In [69]:
# Unpiviting and concatonating what is needed
pivot = households[['mgra', 'hinccat1']].pivot(columns=['hinccat1'])

# This is a multicolumn householdsframe, so I am removing that top column and only dealing with the households underneath
pivot = pivot['mgra']

pivot[pivot > 0] = 1 # The households in its current form has the hinccat1 value in the column then the MGRA in the row. I don't care about the specific MGRA here so I set all values greater than zero to one, this is so summing is easier. 

unique_hinccat1 = pd.concat([households[['mgra']], pivot], axis = 1) # Appending this new pivot to the MGRAs on the initial download
unique_hinccat1 = unique_hinccat1.groupby('mgra').sum() # Now summing up the unique values in each 
unique_hinccat1

Unnamed: 0_level_0,1,2,3,4,5
mgra,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.0,2.0,1.0,5.0,8.0
2,2.0,4.0,3.0,5.0,20.0
3,1.0,12.0,4.0,6.0,29.0
4,6.0,1.0,7.0,5.0,11.0
5,3.0,6.0,6.0,3.0,10.0
...,...,...,...,...,...
22995,2.0,5.0,6.0,4.0,20.0
22996,15.0,23.0,16.0,8.0,41.0
22998,15.0,16.0,11.0,12.0,33.0
23000,12.0,15.0,30.0,18.0,51.0


In [65]:
# Income Group IDs - How do I know which these pertain to the five points above 
import pyodbc

# Connect to DDAMWSQL16 to access staging table and create separate dataframes for each year
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')

qry = '''SELECT [income_group_id]
      ,[income_group]
      ,[name]
      ,[constant_dollars_year]
      ,[lower_bound]
      ,[upper_bound]
      ,[categorization]
  FROM [demographic_warehouse].[dim].[income_group]'''


income_groups = pd.read_sql_query(qry, conn)
subset_income = income_groups.iloc[30:,]
subset_income

Unnamed: 0,income_group_id,income_group,name,constant_dollars_year,lower_bound,upper_bound,categorization
30,31,1,< $30k,2010,0,29999,5
31,32,2,$30-60k,2010,30000,59999,5
32,33,3,$60-100k,2010,60000,99999,5
33,34,4,$100-150k,2010,100000,149999,5
34,35,5,$150k+,2010,150000,349999,5


In [70]:
unique_hinccat1.columns = list(subset_income['name'])

In [71]:
unique_hinccat1

Unnamed: 0_level_0,< $30k,$30-60k,$60-100k,$100-150k,$150k+
mgra,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.0,2.0,1.0,5.0,8.0
2,2.0,4.0,3.0,5.0,20.0
3,1.0,12.0,4.0,6.0,29.0
4,6.0,1.0,7.0,5.0,11.0
5,3.0,6.0,6.0,3.0,10.0
...,...,...,...,...,...
22995,2.0,5.0,6.0,4.0,20.0
22996,15.0,23.0,16.0,8.0,41.0
22998,15.0,16.0,11.0,12.0,33.0
23000,12.0,15.0,30.0,18.0,51.0


In [74]:
mgra_ind[['Less than $15,000',
'$15,000 to $29,999',
'$30,000 to $44,999',
'$45,000 to $59,999',
'$60,000 to $74,999',
'$75,000 to $99,999',
'$100,000 to $124,999',
'$125,000 to $149,999',
'$150,000 to $199,999',
'$200,000 or more']]

Unnamed: 0,"Less than $15,000","$15,000 to $29,999","$30,000 to $44,999","$45,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","$100,000 to $124,999","$125,000 to $149,999","$150,000 to $199,999","$200,000 or more"
0,0,2,2,0,0,1,3,2,2,6
1,1,1,3,1,1,2,3,2,5,15
2,0,1,7,5,2,2,3,3,14,15
3,4,2,0,1,4,3,3,2,2,9
4,1,2,3,3,1,5,3,0,3,7
...,...,...,...,...,...,...,...,...,...,...
22997,6,9,11,5,6,5,5,7,15,18
22998,0,0,0,0,0,0,0,0,0,0
22999,5,7,11,4,10,20,9,9,17,34
23000,8,2,11,6,4,13,9,4,9,15


In [76]:
# Add all of these on there 
# mgra_rebuilt = pd.DataFrame()
# mgra_rebuilt['< $30k']

# mgra_rebuilt

# Compare Income Groups Between Households CSV and MGRA File
Calculate the number of houeholds under each income category (hhinccat1) by mgra from the housholds.csv file and compare these values by mgra from the mgra.csv file  

# Count of hhid by untittype
This is by group quarter 

- should compare with csv 
0: non-group-quarter
1: Group quarter

- Do this inside CSV 

In [6]:
# Unpiviting and concatonating what is needed
pivot = households[['hhid', 'unittype']].pivot(columns=['unittype'])

# This is a multicolumn householdsframe, so I am removing that top column and only dealing with the households underneath
pivot = pivot['hhid']

pivot[pivot > 0] = 1 # The households in its current form has the hinccat1 value in the column then the MGRA in the row. I don't care about the specific MGRA here so I set all values greater than zero to one, this is so summing is easier. 

unique_hhid_by_unittype = pd.concat([households[['mgra']], pivot], axis = 1) # Appending this new pivot to the MGRAs on the initial download
unique_hhid_by_unittype = unique_hhid_by_unittype.groupby('mgra').sum() # Now summing up the unique values in each 
unique_hhid_by_unittype

Unnamed: 0_level_0,0,1
mgra,Unnamed: 1_level_1,Unnamed: 2_level_1
1,18.0,0.0
2,34.0,0.0
3,52.0,0.0
4,30.0,0.0
5,28.0,0.0
...,...,...
22995,37.0,0.0
22996,103.0,0.0
22998,87.0,0.0
23000,126.0,0.0


## Sum of persons by MGRA 
Number of people in each MGRA

In [7]:
sum_person_by_mgra = households[['mgra', 'persons']].groupby('mgra').sum()
sum_person_by_mgra

Unnamed: 0_level_0,persons
mgra,Unnamed: 1_level_1
1,44
2,78
3,139
4,71
5,71
...,...
22995,85
22996,283
22998,241
23000,319
