# Number of Households in Assembly Districts 
#### Purpose:
This notebook shows the process of transforming and manipulating Claritas census block group data to create a table with displaying the total households, percentage of households by group, percentage of households by district, and district indexes.

#### Data Source:
Data for households by group comes from Claritas December 2020 Data and is located in the csv file:
- `J:\DataScience\DSEconProdDessem\Poli_Geog\San_Diego_Region_by_BG_2020_Claritas_HH_Ct_by_SANDAG_Groups_Dec2020.csv`

Data for block groups and assembly districts is located on the SQL server `sql2014b8`.

#### Transformations being performed:
Household data from Claritas was read and merged with Assembly District crosswalk data to obtain household data by Assembly Districts. A series of transposes and summations was done to obtain the **total households**, **percentage of households by group**, **percentage of households by district**, and the **district indexes**. 

#### Location of Outputs:
The outputs are to be placed on the path `J:\DataScience\DSEconProdDessem\Poli_Geog\State Assembly Districts\HH_by_Groups_&_Districts\HH_by_Groups_and_Assembly_Districts.csv`

#### Author: 
Jeffrey Chu (jeffrey.chu@sandag.org)

#### Data Created 
3/28/2023

# Importing required modules

In [17]:
import numpy as np
import pandas as pd
import pyodbc

# Path variable for reading files and connection to SQL server
PATH = 'J:\DataScience\DSEconProdDessem\Poli_Geog\\'
connection = pyodbc.connect('Driver={SQL Server};'
                          'Server=sql2014b8.sandag.org;' 
                          'Trusted_Connection=yes;')

# Reading and merging data

In [18]:
hh = pd.read_csv(PATH + r'San_Diego_Region_by_BG_2020_Claritas_HH_Ct_by_SANDAG_Groups_Dec2020.csv')
hh

Unnamed: 0,CTBLOCKGROUP,CT,BLOCKGROUP,Total,Grp_1,Grp_2,Grp_3,Grp_4,Grp_5,Grp_6,Grp_7,Grp_8,Grp_9,Grp_10,Grp_11,Unnamed: 15
0,1001,100,1,635,0,0,0,1,0,525,2,0,0,107,0,
1,1002,100,2,747,0,0,210,8,0,519,10,0,0,0,0,
2,2011,201,1,1111,0,103,268,156,0,354,230,0,0,0,0,
3,2021,202,1,734,0,160,100,178,0,149,147,0,0,0,0,
4,2022,202,2,437,0,19,83,141,0,64,130,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1789,220001,22000,1,480,480,0,0,0,0,0,0,0,0,0,0,
1790,220002,22000,2,819,819,0,0,0,0,0,0,0,0,0,0,
1791,221001,22100,1,1208,0,0,159,265,0,466,12,32,0,274,0,
1792,221002,22100,2,1326,17,47,18,189,95,87,24,104,466,279,0,


In [19]:
# Merge census block groups with districts
# based on largest intersection between census block groups and districts
bg_dist = pd.read_sql_query('''SELECT bg.[OBJECTID]
      ,[CTBLOCKGROUP]
      ,[CT]
      ,[BLOCKGROUP]
      ,[DISTRICT]
      ,ad.Shape.STIntersection(bg.Shape).STArea() as area
  FROM [GeoDepot].[gis].[CENSUSBLOCKGROUPS2010] as bg
  JOIN [GeoDepot].[gis].[STATEASSEMBLYDISTRICTS] as ad
  ON bg.Shape.STIntersects(ad.Shape) = 1''',connection)
bg_dist = bg_dist.sort_values('area', ascending = False).reset_index(drop=True)

# Drop duplicate CTBLOCKGROUPs - take only the highest area intersections
bg_dist = bg_dist.drop_duplicates('CTBLOCKGROUP')[['CTBLOCKGROUP', 'DISTRICT']]
bg_dist = bg_dist.sort_values('CTBLOCKGROUP').reset_index(drop=True)
bg_dist



Unnamed: 0,CTBLOCKGROUP,DISTRICT
0,1001,78
1,1002,78
2,2011,78
3,2021,78
4,2022,78
...,...,...
1789,220001,80
1790,220002,80
1791,221001,77
1792,221002,77


In [20]:
result = hh.merge(bg_dist, how='inner', left_on='CTBLOCKGROUP', right_on='CTBLOCKGROUP')
result.sort_values('DISTRICT').reset_index(drop=True)
result

Unnamed: 0,CTBLOCKGROUP,CT,BLOCKGROUP,Total,Grp_1,Grp_2,Grp_3,Grp_4,Grp_5,Grp_6,Grp_7,Grp_8,Grp_9,Grp_10,Grp_11,Unnamed: 15,DISTRICT
0,1001,100,1,635,0,0,0,1,0,525,2,0,0,107,0,,78
1,1002,100,2,747,0,0,210,8,0,519,10,0,0,0,0,,78
2,2011,201,1,1111,0,103,268,156,0,354,230,0,0,0,0,,78
3,2021,202,1,734,0,160,100,178,0,149,147,0,0,0,0,,78
4,2022,202,2,437,0,19,83,141,0,64,130,0,0,0,0,,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1789,220001,22000,1,480,480,0,0,0,0,0,0,0,0,0,0,,80
1790,220002,22000,2,819,819,0,0,0,0,0,0,0,0,0,0,,80
1791,221001,22100,1,1208,0,0,159,265,0,466,12,32,0,274,0,,77
1792,221002,22100,2,1326,17,47,18,189,95,87,24,104,466,279,0,,77


# Total Households

In [21]:
df = result[['DISTRICT',
        'Grp_1',
        'Grp_2',
        'Grp_3',
        'Grp_4',
        'Grp_5',
        'Grp_6',
        'Grp_7',
        'Grp_8',
        'Grp_9',
        'Grp_10',
        'Grp_11']].groupby('DISTRICT').sum().reset_index()
df

Unnamed: 0,DISTRICT,Grp_1,Grp_2,Grp_3,Grp_4,Grp_5,Grp_6,Grp_7,Grp_8,Grp_9,Grp_10,Grp_11
0,74,2156,9227,4952,11896,24335,2485,14393,10992,9352,10841,1231
1,75,846,5252,3485,5783,14911,20621,24666,22133,20396,43599,10313
2,76,1646,6497,7539,10697,23101,30111,13926,18450,10315,37307,997
3,77,590,10524,18578,54975,2444,74110,14998,4754,4442,28780,1
4,78,940,21194,20983,56214,12112,22848,28421,9826,16454,21331,0
5,79,48496,10711,5085,41697,19878,4836,13511,12093,3060,3256,0
6,80,58889,5697,8939,22045,13552,6097,14165,7404,4197,15053,304


In [22]:
df2 = df.melt(id_vars=['DISTRICT'], var_name = 'Group', value_name = 'HH')
df2

Unnamed: 0,DISTRICT,Group,HH
0,74,Grp_1,2156
1,75,Grp_1,846
2,76,Grp_1,1646
3,77,Grp_1,590
4,78,Grp_1,940
...,...,...,...
72,76,Grp_11,997
73,77,Grp_11,1
74,78,Grp_11,0
75,79,Grp_11,0


In [23]:
total_hh = df2.pivot(index='Group', columns='DISTRICT', values = 'HH')
total_hh

DISTRICT,74,75,76,77,78,79,80
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Grp_1,2156,846,1646,590,940,48496,58889
Grp_10,10841,43599,37307,28780,21331,3256,15053
Grp_11,1231,10313,997,1,0,0,304
Grp_2,9227,5252,6497,10524,21194,10711,5697
Grp_3,4952,3485,7539,18578,20983,5085,8939
Grp_4,11896,5783,10697,54975,56214,41697,22045
Grp_5,24335,14911,23101,2444,12112,19878,13552
Grp_6,2485,20621,30111,74110,22848,4836,6097
Grp_7,14393,24666,13926,14998,28421,13511,14165
Grp_8,10992,22133,18450,4754,9826,12093,7404


In [24]:
regional_total_hh = total_hh.T.sum()
regional_total_hh

Group
Grp_1     113563
Grp_10    160167
Grp_11     12846
Grp_2      69102
Grp_3      69561
Grp_4     203307
Grp_5     110333
Grp_6     161108
Grp_7     124080
Grp_8      85652
Grp_9      68216
dtype: int64

# Percentage of households

In [25]:
# Percentage of households = Number of households / Total households
# Unrounded percentages will be used in calculating District Index
percent_hh_unrounded = (total_hh/total_hh.sum()*100)
percent_hh = percent_hh_unrounded.round(1)
percent_hh

DISTRICT,74,75,76,77,78,79,80
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Grp_1,2.1,0.5,1.0,0.3,0.4,29.8,37.7
Grp_10,10.6,25.3,23.2,13.4,10.1,2.0,9.6
Grp_11,1.2,6.0,0.6,0.0,0.0,0.0,0.2
Grp_2,9.1,3.1,4.0,4.9,10.1,6.6,3.6
Grp_3,4.9,2.0,4.7,8.7,10.0,3.1,5.7
Grp_4,11.7,3.4,6.7,25.7,26.7,25.6,14.1
Grp_5,23.9,8.7,14.4,1.1,5.8,12.2,8.7
Grp_6,2.4,12.0,18.8,34.6,10.9,3.0,3.9
Grp_7,14.1,14.3,8.7,7.0,13.5,8.3,9.1
Grp_8,10.8,12.9,11.5,2.2,4.7,7.4,4.7


In [26]:
regional_percent_hh_unrounded = (total_hh.T.sum()/total_hh.sum().sum()*100)
regional_percent_hh = (total_hh.T.sum()/total_hh.sum().sum()*100).round(1)
regional_percent_hh

Group
Grp_1      9.6
Grp_10    13.6
Grp_11     1.1
Grp_2      5.9
Grp_3      5.9
Grp_4     17.3
Grp_5      9.4
Grp_6     13.7
Grp_7     10.5
Grp_8      7.3
Grp_9      5.8
dtype: float64

# Percent of households (by district)

In [27]:
# Percentage of households (by district) = Number of households in a group / Total households in that group
pen_dist = (total_hh.T/total_hh.T.sum()*100).T.round(1)
pen_dist

DISTRICT,74,75,76,77,78,79,80
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Grp_1,1.9,0.7,1.4,0.5,0.8,42.7,51.9
Grp_10,6.8,27.2,23.3,18.0,13.3,2.0,9.4
Grp_11,9.6,80.3,7.8,0.0,0.0,0.0,2.4
Grp_2,13.4,7.6,9.4,15.2,30.7,15.5,8.2
Grp_3,7.1,5.0,10.8,26.7,30.2,7.3,12.9
Grp_4,5.9,2.8,5.3,27.0,27.6,20.5,10.8
Grp_5,22.1,13.5,20.9,2.2,11.0,18.0,12.3
Grp_6,1.5,12.8,18.7,46.0,14.2,3.0,3.8
Grp_7,11.6,19.9,11.2,12.1,22.9,10.9,11.4
Grp_8,12.8,25.8,21.5,5.6,11.5,14.1,8.6


# District index

In [28]:
# District Index = (Percent of households / Regional household percentages)*100
index_dist = (percent_hh_unrounded.T/regional_percent_hh_unrounded*100).T.round()
index_dist

DISTRICT,74,75,76,77,78,79,80
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Grp_1,22.0,5.0,11.0,3.0,5.0,309.0,391.0
Grp_10,78.0,186.0,171.0,99.0,75.0,15.0,71.0
Grp_11,111.0,550.0,57.0,0.0,0.0,0.0,18.0
Grp_2,154.0,52.0,69.0,84.0,172.0,112.0,62.0
Grp_3,82.0,34.0,79.0,147.0,169.0,53.0,97.0
Grp_4,68.0,19.0,39.0,149.0,155.0,149.0,82.0
Grp_5,255.0,93.0,154.0,12.0,61.0,130.0,93.0
Grp_6,18.0,88.0,137.0,253.0,79.0,22.0,29.0
Grp_7,134.0,136.0,82.0,66.0,128.0,79.0,86.0
Grp_8,148.0,177.0,158.0,31.0,64.0,102.0,65.0


# Merging Dataframes Together

In [29]:
# Add regional totals
regional_totals = pd.concat([regional_percent_hh, regional_total_hh], axis=1)\
                    .rename(columns={0:'%total_Region', 1:'#HH_Region'})
regional_totals

Unnamed: 0_level_0,%total_Region,#HH_Region
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
Grp_1,9.6,113563
Grp_10,13.6,160167
Grp_11,1.1,12846
Grp_2,5.9,69102
Grp_3,5.9,69561
Grp_4,17.3,203307
Grp_5,9.4,110333
Grp_6,13.7,161108
Grp_7,10.5,124080
Grp_8,7.3,85652


In [30]:
# Join individual calculations
households = total_hh.join(percent_hh, how='outer', lsuffix='_#HH', rsuffix='_%HH')\
                     .join(regional_totals, how='outer')\
                     .join(index_dist, how='outer')\
                     .join(pen_dist, how='outer', lsuffix='_Ind', rsuffix='_%Dist')
households

Unnamed: 0_level_0,74_#HH,75_#HH,76_#HH,77_#HH,78_#HH,79_#HH,80_#HH,74_%HH,75_%HH,76_%HH,...,78_Ind,79_Ind,80_Ind,74_%Dist,75_%Dist,76_%Dist,77_%Dist,78_%Dist,79_%Dist,80_%Dist
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Grp_1,2156,846,1646,590,940,48496,58889,2.1,0.5,1.0,...,5.0,309.0,391.0,1.9,0.7,1.4,0.5,0.8,42.7,51.9
Grp_10,10841,43599,37307,28780,21331,3256,15053,10.6,25.3,23.2,...,75.0,15.0,71.0,6.8,27.2,23.3,18.0,13.3,2.0,9.4
Grp_11,1231,10313,997,1,0,0,304,1.2,6.0,0.6,...,0.0,0.0,18.0,9.6,80.3,7.8,0.0,0.0,0.0,2.4
Grp_2,9227,5252,6497,10524,21194,10711,5697,9.1,3.1,4.0,...,172.0,112.0,62.0,13.4,7.6,9.4,15.2,30.7,15.5,8.2
Grp_3,4952,3485,7539,18578,20983,5085,8939,4.9,2.0,4.7,...,169.0,53.0,97.0,7.1,5.0,10.8,26.7,30.2,7.3,12.9
Grp_4,11896,5783,10697,54975,56214,41697,22045,11.7,3.4,6.7,...,155.0,149.0,82.0,5.9,2.8,5.3,27.0,27.6,20.5,10.8
Grp_5,24335,14911,23101,2444,12112,19878,13552,23.9,8.7,14.4,...,61.0,130.0,93.0,22.1,13.5,20.9,2.2,11.0,18.0,12.3
Grp_6,2485,20621,30111,74110,22848,4836,6097,2.4,12.0,18.8,...,79.0,22.0,29.0,1.5,12.8,18.7,46.0,14.2,3.0,3.8
Grp_7,14393,24666,13926,14998,28421,13511,14165,14.1,14.3,8.7,...,128.0,79.0,86.0,11.6,19.9,11.2,12.1,22.9,10.9,11.4
Grp_8,10992,22133,18450,4754,9826,12093,7404,10.8,12.9,11.5,...,64.0,102.0,65.0,12.8,25.8,21.5,5.6,11.5,14.1,8.6


In [31]:
households.columns

Index(['74_#HH', '75_#HH', '76_#HH', '77_#HH', '78_#HH', '79_#HH', '80_#HH',
       '74_%HH', '75_%HH', '76_%HH', '77_%HH', '78_%HH', '79_%HH', '80_%HH',
       '%total_Region', '#HH_Region', '74_Ind', '75_Ind', '76_Ind', '77_Ind',
       '78_Ind', '79_Ind', '80_Ind', '74_%Dist', '75_%Dist', '76_%Dist',
       '77_%Dist', '78_%Dist', '79_%Dist', '80_%Dist'],
      dtype='object')

In [32]:
# Rename columns
def rename_columns(col_temp):
    col = str(col_temp)
    if '_#HH' in col:
        return 'HH_Dist_' + col[:2]
    elif '_%HH' in col:
        return 'PercentHH_Dist_' + col[:2]
    elif '_%Dist' in col:
        return 'PercentGrp_Dist_' + col[:2]
    elif '_Ind' in col:
        return 'Index_Dist_' + col[:2]
    elif 'Group' in col:
        return 'Group_number'
    else:
        return col
results = households.reset_index()
renamed_columns = dict(zip(results.columns, map(rename_columns, results.columns)))
results = results.rename(columns = renamed_columns)

# Rename Group numbers
results = results.assign(Group_number = results['Group_number'].apply(lambda x: int(x.split('_')[1])))
results = results.sort_values('Group_number').reset_index(drop=True)

# Output to excel
results.to_excel("HH_by_Groups_&_Districts_QC.xlsx",
             sheet_name='HH_by_Groups_&_Districts')  
results

Unnamed: 0,Group_number,HH_Dist_74,HH_Dist_75,HH_Dist_76,HH_Dist_77,HH_Dist_78,HH_Dist_79,HH_Dist_80,PercentHH_Dist_74,PercentHH_Dist_75,...,Index_Dist_78,Index_Dist_79,Index_Dist_80,PercentGrp_Dist_74,PercentGrp_Dist_75,PercentGrp_Dist_76,PercentGrp_Dist_77,PercentGrp_Dist_78,PercentGrp_Dist_79,PercentGrp_Dist_80
0,1,2156,846,1646,590,940,48496,58889,2.1,0.5,...,5.0,309.0,391.0,1.9,0.7,1.4,0.5,0.8,42.7,51.9
1,2,9227,5252,6497,10524,21194,10711,5697,9.1,3.1,...,172.0,112.0,62.0,13.4,7.6,9.4,15.2,30.7,15.5,8.2
2,3,4952,3485,7539,18578,20983,5085,8939,4.9,2.0,...,169.0,53.0,97.0,7.1,5.0,10.8,26.7,30.2,7.3,12.9
3,4,11896,5783,10697,54975,56214,41697,22045,11.7,3.4,...,155.0,149.0,82.0,5.9,2.8,5.3,27.0,27.6,20.5,10.8
4,5,24335,14911,23101,2444,12112,19878,13552,23.9,8.7,...,61.0,130.0,93.0,22.1,13.5,20.9,2.2,11.0,18.0,12.3
5,6,2485,20621,30111,74110,22848,4836,6097,2.4,12.0,...,79.0,22.0,29.0,1.5,12.8,18.7,46.0,14.2,3.0,3.8
6,7,14393,24666,13926,14998,28421,13511,14165,14.1,14.3,...,128.0,79.0,86.0,11.6,19.9,11.2,12.1,22.9,10.9,11.4
7,8,10992,22133,18450,4754,9826,12093,7404,10.8,12.9,...,64.0,102.0,65.0,12.8,25.8,21.5,5.6,11.5,14.1,8.6
8,9,9352,20396,10315,4442,16454,3060,4197,9.2,11.9,...,135.0,32.0,46.0,13.7,29.9,15.1,6.5,24.1,4.5,6.2
9,10,10841,43599,37307,28780,21331,3256,15053,10.6,25.3,...,75.0,15.0,71.0,6.8,27.2,23.3,18.0,13.3,2.0,9.4
