![SolasAI Banner Image](../images/SolasAI-Logo.png)

<br>

# Demographic Estimation

## Sample data

In [1]:
import pandas as pd

# Load the sample records. The first CSV column is used as the row index, and
# the geographic identifier columns are forced to text so pandas does not
# infer numeric types for them.
data = pd.read_csv(
    "data/demographic_test_data.csv",
    index_col=0,
    dtype={"GEOID": str, "ZIP9": str, "ZIP5": str},
)

# Preview the first five rows.
data.head()

Unnamed: 0,UNIQUE_ID,FIRSTNAME,LASTNAME,ZIP9,GEOID,LATITUDE,LONGITUDE,ZIP5
0,100216,ARVY,BADORE,50115-1235,60371174051,41.67,-94.49,50115
1,354368,JOSIPHINE,TREHARNE,50036-0112,88045,42.05,-93.87,50036
2,8178,NATERRIA,KLOSINSKI,50210-9009,6041990100,41.18,-93.73,50210
3,288544,ANGELE,SHEWMAKE,50565-8802,480990106013,42.86,-94.98,50565
4,163601,RIHAM,VEERMAN,50613-2951,180390003022,42.52,-92.45,50613


## About the GeoID

The GeoID is a 12-character ID that specifies the census geography in the following format. The block group digit can be omitted to identify a census tract alone; in that case, the GeoID should be padded with a trailing space.

```
AABBBCCCCCCD
||---------- (A) 2 Digit State FIPS Code (Left Zero Padded)
--|||------- (B) 3 Digit County FIPS Code (Left Zero Padded)
-----||||||- (C) 6 Digit Census Tract (Left Zero Padded, Implied 2 Decimal Places)
-----------| (D) 1 Digit Block Group Code
```

## Gender Estimation

In [2]:
from solas_disparity import demographic_estimation

# Gender-only run: a first-name column is supplied and race estimation is
# switched off.
ge = demographic_estimation(
    input_data=data,
    unique_id_column="UNIQUE_ID",
    first_name_column="FIRSTNAME",
    estimate_race_proportion=False,
)

# Preview the input rows with the estimated "Female Pct" column appended.
ge.output.head()

Unnamed: 0,UNIQUE_ID,FIRSTNAME,LASTNAME,ZIP9,GEOID,LATITUDE,LONGITUDE,ZIP5,Female Pct
0,100216,ARVY,BADORE,50115-1235,60371174051,41.67,-94.49,50115,0.0
1,354368,JOSIPHINE,TREHARNE,50036-0112,88045,42.05,-93.87,50036,1.0
2,8178,NATERRIA,KLOSINSKI,50210-9009,6041990100,41.18,-93.73,50210,1.0
3,288544,ANGELE,SHEWMAKE,50565-8802,480990106013,42.86,-94.98,50565,1.0
4,163601,RIHAM,VEERMAN,50613-2951,180390003022,42.52,-92.45,50613,1.0


In [3]:
# Summary of the gender estimate: forename match counts and percentages.
ge.report

Unnamed: 0,Unnamed: 1,Count,Percent
Gender Estimate: Forename Match Results,Exact Match,3000,100.00%
Gender Estimate: Forename Match Results,Unmatched,0,0.00%


## Race Estimation using GEOID

In [4]:
# Race-only run keyed on the census GEOID column; gender estimation is
# switched off, so no first-name column is needed.
re_geoid = demographic_estimation(
    input_data=data,
    unique_id_column="UNIQUE_ID",
    last_name_column="LASTNAME",
    geoid_column="GEOID",
    census_year=2020,
    estimate_gender_proportion=False,
)

# Preview the input rows with the geography basis and race proportions appended.
re_geoid.output.head()

Unnamed: 0,UNIQUE_ID,FIRSTNAME,LASTNAME,ZIP9,GEOID,LATITUDE,LONGITUDE,ZIP5,Cleaned Name,Geo Basis,Summary Level,White,Black,Asian/PI,AI/AN,Multi-Race,Hispanic
0,100216,ARVY,BADORE,50115-1235,60371174051,41.67,-94.49,50115,BADORE,GEO_ID,BLKGRP,0.846624,0.016026,0.0,0.0,0.0,0.13735
1,354368,JOSIPHINE,TREHARNE,50036-0112,88045,42.05,-93.87,50036,TREHARNE,GEO_ID,ZCTA,0.765501,0.000169,0.000478,0.0,0.006631,0.227221
2,8178,NATERRIA,KLOSINSKI,50210-9009,6041990100,41.18,-93.73,50210,KLOSINSKI,GEO_ID,,,,,,,
3,288544,ANGELE,SHEWMAKE,50565-8802,480990106013,42.86,-94.98,50565,SHEWMAKE,GEO_ID,BLKGRP,0.858475,0.035733,0.010962,0.004093,0.053001,0.037736
4,163601,RIHAM,VEERMAN,50613-2951,180390003022,42.52,-92.45,50613,VEERMAN,GEO_ID,BLKGRP,0.89377,0.001619,0.002115,0.0,0.073895,0.028602


In [5]:
# Summary of the GEOID-based run: GeoID validation results and surname match counts.
re_geoid.report

Unnamed: 0,Unnamed: 1,Count,Percent
GeoID Validation results,"Tract, Block Group and ZIP Code",0,0.00%
GeoID Validation results,"Tract & Block Group, No ZIP Code",2008,66.93%
GeoID Validation results,"Tract & ZIP Code, No Block Group",0,0.00%
GeoID Validation results,"Tract Only, No Block Group or ZIP Code",706,23.53%
GeoID Validation results,"ZIP Code Only, No Tract or Block Group",286,9.53%
GeoID Validation results,Invalid Geocoding,0,0.00%
Surname match counts,Exact matches,2999,99.97%
Surname match counts,Adjusted names,0,0.00%
Surname match counts,Default names,1,0.03%
Surname match counts,Unmatched names,0,0.00%


## Race Estimation using Zip9

In [6]:
# Race-only run keyed on the ZIP9 column. verbose=True makes the call print
# its ZIP9 sanity-check and geocoding progress log while it runs.
re_zip9 = demographic_estimation(
    input_data=data,
    unique_id_column="UNIQUE_ID",
    last_name_column="LASTNAME",
    zip9_column="ZIP9",
    census_year=2018,
    estimate_gender_proportion=False,
    verbose=True,
)

# Preview the input rows with the geography basis and race proportions appended.
re_zip9.output.head()

2024-01-10 09:31:09 - Input data Zip 9 sanity check
Valid Zip9 pattern (99999-9999)        3000 100.00%
  Records not matching pattern           0   0.00%
2024-01-10 09:31:09 - Starting Zip9 to Census Geocoding Match
	Geocoded zip2: 01	count:          50  (  1.67% )
	Geocoded zip2: 02	count:          50  (  1.67% )
	Geocoded zip2: 03	count:          50  (  1.67% )
	Geocoded zip2: 04	count:          50  (  1.67% )
	Geocoded zip2: 05	count:          50  (  1.67% )
	Geocoded zip2: 06	count:          50  (  1.67% )
	Geocoded zip2: 07	count:          50  (  1.67% )
	Geocoded zip2: 09	count:          50  (  1.67% )
	Geocoded zip2: 11	count:          50  (  1.67% )
	Geocoded zip2: 12	count:          50  (  1.67% )
	Geocoded zip2: 13	count:          50  (  1.67% )
	Geocoded zip2: 15	count:          50  (  1.67% )
	Geocoded zip2: 23	count:          50  (  1.67% )
	Geocoded zip2: 24	count:          50  (  1.67% )
	Geocoded zip2: 27	count:          50  (  1.67% )
	Geocoded zip2: 29	count:        

Unnamed: 0,UNIQUE_ID,FIRSTNAME,LASTNAME,ZIP9,GEOID,LATITUDE,LONGITUDE,ZIP5,Cleaned Name,Geo Basis,Summary Level,White,Black,Asian/PI,AI/AN,Multi-Race,Hispanic
0,100216,ARVY,BADORE,50115-1235,60371174051,41.67,-94.49,50115,BADORE,ZIP_9,BLKGRP,0.999552,5.5e-05,0.0,0.0,0.0,0.000393
1,354368,JOSIPHINE,TREHARNE,50036-0112,88045,42.05,-93.87,50036,TREHARNE,ZIP_9,BLKGRP,0.99789,0.0,0.0,0.0,0.0,0.00211
2,8178,NATERRIA,KLOSINSKI,50210-9009,6041990100,41.18,-93.73,50210,KLOSINSKI,ZIP_9,BLKGRP,0.998438,0.0,0.000289,0.0,0.000705,0.000568
3,288544,ANGELE,SHEWMAKE,50565-8802,480990106013,42.86,-94.98,50565,SHEWMAKE,ZIP_9,BLKGRP,0.986777,0.0,0.0,0.0,0.004122,0.009101
4,163601,RIHAM,VEERMAN,50613-2951,180390003022,42.52,-92.45,50613,VEERMAN,ZIP_9,BLKGRP,0.987022,0.0,0.012978,0.0,0.0,0.0


In [7]:
# Summary of the ZIP9-based run, including ZIP9 pattern validation and
# ZIP9-to-census geocoding match counts.
re_zip9.report

Unnamed: 0,Unnamed: 1,Count,Percent
ZIP9 Validation Results,Valid Zip9 pattern (99999-9999),3000,100.00%
ZIP9 Validation Results,Records not matching pattern,0,0.00%
Zip9 to Census Geocoding Match,Matched ZIP2: 01,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 02,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 03,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 04,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 05,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 06,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 07,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 09,50,1.67%


## Race Estimation using ZIP9 + GEOID

In [8]:
# Race-only run that supplies both ZIP9 and GEOID columns. The "Geo Basis"
# column of the output records which of the two was used for each row.
re_zip9_plus_geoid = demographic_estimation(
    input_data=data,
    unique_id_column="UNIQUE_ID",
    last_name_column="LASTNAME",
    zip9_column="ZIP9",
    geoid_column="GEOID",
    census_year=2015,
    estimate_gender_proportion=False,
    verbose=False,
)

# Preview the input rows with the geography basis and race proportions appended.
re_zip9_plus_geoid.output.head()

Unnamed: 0,UNIQUE_ID,FIRSTNAME,LASTNAME,ZIP9,GEOID,LATITUDE,LONGITUDE,ZIP5,Cleaned Name,Geo Basis,Summary Level,White,Black,Asian/PI,AI/AN,Multi-Race,Hispanic
0,100216,ARVY,BADORE,50115-1235,60371174051,41.67,-94.49,50115,BADORE,GEO_ID,BLKGRP,0.811433,0.000892,0.0,0.0,0.0,0.187676
1,354368,JOSIPHINE,TREHARNE,50036-0112,88045,42.05,-93.87,50036,TREHARNE,ZIP_9,BLKGRP,1.0,0.0,0.0,0.0,0.0,0.0
2,8178,NATERRIA,KLOSINSKI,50210-9009,6041990100,41.18,-93.73,50210,KLOSINSKI,ZIP_9,BLKGRP,0.999526,0.0,0.000246,0.0,0.0,0.000228
3,288544,ANGELE,SHEWMAKE,50565-8802,480990106013,42.86,-94.98,50565,SHEWMAKE,GEO_ID,BLKGRP,0.856485,0.035979,0.018051,0.0,0.043332,0.046154
4,163601,RIHAM,VEERMAN,50613-2951,180390003022,42.52,-92.45,50613,VEERMAN,GEO_ID,BLKGRP,0.985934,0.0,0.0,0.0,0.0,0.014066


In [9]:
# Summary of the ZIP9 + GEOID run: GeoID validation results and surname match counts.
re_zip9_plus_geoid.report

Unnamed: 0,Unnamed: 1,Count,Percent
GeoID Validation results,"Tract, Block Group and ZIP Code",0,0.00%
GeoID Validation results,"Tract & Block Group, No ZIP Code",2008,66.93%
GeoID Validation results,"Tract & ZIP Code, No Block Group",0,0.00%
GeoID Validation results,"Tract Only, No Block Group or ZIP Code",706,23.53%
GeoID Validation results,"ZIP Code Only, No Tract or Block Group",286,9.53%
GeoID Validation results,Invalid Geocoding,0,0.00%
Surname match counts,Exact matches,2999,99.97%
Surname match counts,Adjusted names,0,0.00%
Surname match counts,Default names,1,0.03%
Surname match counts,Unmatched names,0,0.00%


## Race Estimation using Latitude/Longitude and optional Zip5

In [10]:
# Race-only run keyed on latitude/longitude coordinates, with the ZIP5 column
# supplied as well.
re_lat_lon = demographic_estimation(
    input_data=data,
    unique_id_column="UNIQUE_ID",
    last_name_column="LASTNAME",
    latitude_column="LATITUDE",
    longitude_column="LONGITUDE",
    zip5_column="ZIP5",
    census_year=2021,
    estimate_gender_proportion=False,
)

# Preview the input rows with the geography basis and race proportions appended.
re_lat_lon.output.head()

Unnamed: 0,UNIQUE_ID,FIRSTNAME,LASTNAME,ZIP9,GEOID,LATITUDE,LONGITUDE,ZIP5,Cleaned Name,Geo Basis,Summary Level,White,Black,Asian/PI,AI/AN,Multi-Race,Hispanic
0,100216,ARVY,BADORE,50115-1235,60371174051,41.67,-94.49,50115,BADORE,LAT_LON,BLKGRP,0.999526,3.6e-05,0.0,0.0,0.0,0.000438
1,354368,JOSIPHINE,TREHARNE,50036-0112,88045,42.05,-93.87,50036,TREHARNE,LAT_LON,BLKGRP,0.969115,0.0,0.001083,0.0,0.022448,0.007353
2,8178,NATERRIA,KLOSINSKI,50210-9009,6041990100,41.18,-93.73,50210,KLOSINSKI,LAT_LON,BLKGRP,0.996175,0.0,0.0,0.0,0.000635,0.00319
3,288544,ANGELE,SHEWMAKE,50565-8802,480990106013,42.86,-94.98,50565,SHEWMAKE,LAT_LON,BLKGRP,0.979479,0.0,0.0,0.0,0.018394,0.002127
4,163601,RIHAM,VEERMAN,50613-2951,180390003022,42.52,-92.45,50613,VEERMAN,LAT_LON,BLKGRP,0.966731,0.001173,0.0,0.0,0.030457,0.001639


In [11]:
# Summary of the latitude/longitude run: GeoID validation results and surname match counts.
re_lat_lon.report

Unnamed: 0,Unnamed: 1,Count,Percent
GeoID Validation results,"Tract, Block Group and ZIP Code",2942,98.07%
GeoID Validation results,"Tract & Block Group, No ZIP Code",0,0.00%
GeoID Validation results,"Tract & ZIP Code, No Block Group",0,0.00%
GeoID Validation results,"Tract Only, No Block Group or ZIP Code",0,0.00%
GeoID Validation results,"ZIP Code Only, No Tract or Block Group",58,1.93%
GeoID Validation results,Invalid Geocoding,0,0.00%
Surname match counts,Exact matches,2999,99.97%
Surname match counts,Adjusted names,0,0.00%
Surname match counts,Default names,1,0.03%
Surname match counts,Unmatched names,0,0.00%


## Gender and Race Estimation

In [12]:
# Combined run: both name columns are supplied and neither estimate_* flag is
# turned off, so the output carries the gender estimate and the race
# proportions together.
de_full = demographic_estimation(
    input_data=data,
    unique_id_column="UNIQUE_ID",
    first_name_column="FIRSTNAME",
    last_name_column="LASTNAME",
    latitude_column="LATITUDE",
    longitude_column="LONGITUDE",
    zip5_column="ZIP5",
    census_year=2021,
)

# Preview the input rows with "Female Pct" and the race proportions appended.
de_full.output.head()

Unnamed: 0,UNIQUE_ID,FIRSTNAME,LASTNAME,ZIP9,GEOID,LATITUDE,LONGITUDE,ZIP5,Female Pct,Cleaned Name,Geo Basis,Summary Level,White,Black,Asian/PI,AI/AN,Multi-Race,Hispanic
0,100216,ARVY,BADORE,50115-1235,60371174051,41.67,-94.49,50115,0.0,BADORE,LAT_LON,BLKGRP,0.999526,3.6e-05,0.0,0.0,0.0,0.000438
1,354368,JOSIPHINE,TREHARNE,50036-0112,88045,42.05,-93.87,50036,1.0,TREHARNE,LAT_LON,BLKGRP,0.969115,0.0,0.001083,0.0,0.022448,0.007353
2,8178,NATERRIA,KLOSINSKI,50210-9009,6041990100,41.18,-93.73,50210,1.0,KLOSINSKI,LAT_LON,BLKGRP,0.996175,0.0,0.0,0.0,0.000635,0.00319
3,288544,ANGELE,SHEWMAKE,50565-8802,480990106013,42.86,-94.98,50565,1.0,SHEWMAKE,LAT_LON,BLKGRP,0.979479,0.0,0.0,0.0,0.018394,0.002127
4,163601,RIHAM,VEERMAN,50613-2951,180390003022,42.52,-92.45,50613,1.0,VEERMAN,LAT_LON,BLKGRP,0.966731,0.001173,0.0,0.0,0.030457,0.001639


In [13]:
# Summary of the combined run, including forename match results, GeoID
# validation results, and surname match counts.
de_full.report

Unnamed: 0,Unnamed: 1,Count,Percent
Gender Estimate: Forename Match Results,Exact Match,3000,100.00%
Gender Estimate: Forename Match Results,Unmatched,0,0.00%
GeoID Validation results,"Tract, Block Group and ZIP Code",2942,98.07%
GeoID Validation results,"Tract & Block Group, No ZIP Code",0,0.00%
GeoID Validation results,"Tract & ZIP Code, No Block Group",0,0.00%
GeoID Validation results,"Tract Only, No Block Group or ZIP Code",0,0.00%
GeoID Validation results,"ZIP Code Only, No Tract or Block Group",58,1.93%
GeoID Validation results,Invalid Geocoding,0,0.00%
Surname match counts,Exact matches,2999,99.97%
Surname match counts,Adjusted names,0,0.00%


## Minority Estimation

In [14]:
# Keep only the ID and ZIP9 columns, with one row per distinct ZIP9
# (the first occurrence of each ZIP9 is retained).
minority_estimation_data = (
    data[["UNIQUE_ID", "ZIP9"]]
    .drop_duplicates(subset="ZIP9", keep="first")
)

minority_estimation_data.head()

Unnamed: 0,UNIQUE_ID,ZIP9
0,100216,50115-1235
1,354368,50036-0112
2,8178,50210-9009
3,288544,50565-8802
4,163601,50613-2951


In [15]:
from solas_disparity import minority_estimation

# Minority estimation from the ZIP9 column of the de-duplicated frame.
me = minority_estimation(
    input_data=minority_estimation_data,
    unique_id_column="UNIQUE_ID",
    zip9_column="ZIP9",
    census_year=2021,
)

# Preview the output: state/county/tract, the minority proportion, and the
# population counts behind it.
me.output.head()

Unnamed: 0,UNIQUE_ID,ZIP9,State,County,Tract,Minority Proportion,Total,Non-Hispanic Total,Non-Hispanic White,Non-Hispanic Black,...,Non-Hispanic Other,Non-Hispanic Multi-Race,Hispanic Total,Hispanic White,Hispanic Black,Hispanic AI/AN,Hispanic Asian,Hispanic PI,Hispanic Other,Hispanic Multi-Race
0,100216,50115-1235,19,77,950300,0.082289,4229.0,4082.0,3881.0,4.0,...,0.0,172.0,147.0,58.0,0.0,0.0,0.0,0.0,29.0,60.0
1,354368,50036-0112,19,15,20200,0.053316,3920.0,3871.0,3711.0,125.0,...,0.0,35.0,49.0,29.0,0.0,0.0,0.0,0.0,8.0,12.0
2,8178,50210-9009,19,181,21200,0.051841,3993.0,3883.0,3786.0,5.0,...,0.0,71.0,110.0,90.0,0.0,0.0,0.0,0.0,1.0,19.0
3,288544,50565-8802,19,21,960100,0.056632,1342.0,1311.0,1266.0,4.0,...,6.0,35.0,31.0,14.0,0.0,0.0,0.0,0.0,17.0,0.0
4,163601,50613-2951,19,13,2200,0.09593,3784.0,3709.0,3421.0,121.0,...,0.0,74.0,75.0,57.0,0.0,0.0,0.0,0.0,0.0,18.0


In [16]:
# Summary of the minority estimation run, including ZIP9 pattern validation
# and ZIP9-to-census geocoding match counts.
me.report

Unnamed: 0,Unnamed: 1,Count,Percent
ZIP9 Validation Results,Valid Zip9 pattern (99999-9999),3000,100.00%
ZIP9 Validation Results,Records not matching pattern,0,0.00%
Zip9 to Census Geocoding Match,Matched ZIP2: 01,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 02,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 03,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 04,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 05,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 06,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 07,50,1.67%
Zip9 to Census Geocoding Match,Matched ZIP2: 09,50,1.67%
