In [1]:
from functools import reduce

import numpy as np
import pandas as pd
import requests

In [4]:
# Directory holding the raw 2020 language (S1601) and race (B03002) extracts.
# Change this one constant to run the notebook against a different data location.
RAW_DIR = "/content/drive/MyDrive/H08-ResilienceIndex/data/homophily/raw"

# Language tables, at tract and county level.
lang_tract = pd.read_csv(f"{RAW_DIR}/lang-S1601-2020-tract.csv")
lang_cnty = pd.read_csv(f"{RAW_DIR}/lang-S1601-2020-county.csv")

# Race tables, at tract and county level.
race_tract = pd.read_csv(f"{RAW_DIR}/race-B03002-2020-tract.csv")
race_cnty = pd.read_csv(f"{RAW_DIR}/race-B03002-2020-county.csv")

In [5]:
# Build the county-level reference table: one row per county and year, with every
# county-wide column renamed so it cannot collide with the tract-level columns.

# Columns that are not needed once Pr_AAPI has been derived.
unused_cols = ["GEO_ID", "state", "county", "Pr_Asian", "Pr_Hawaiian", "Pr_Otherrace", "Pr_Multirace"]
# Identifier columns that are moved to the front of the table.
id_cols = ["fips", "NAME", "year"]
# County-level names for the columns that are kept.
county_renames = {
    "fips": "CountyFIPS",
    "NAME": "CountyName",
    "Population": "Pop_County",
    "Pr_White": "Pr_White_County",
    "Pr_Black": "Pr_Black_County",
    "Pr_AAPI": "Pr_AAPI_County",
    "Pr_Native": "Pr_Native_County",
    "Pr_Hispanic": "Pr_Hispanic_County",
    "Pr_EnglishAtHome": "Pr_EnglishAtHome_County",
    "Pr_SpanishAtHome": "Pr_SpanishAtHome_County",
    "Pr_IndoEuropeanAtHome": "Pr_IndoEuropeanAtHome_County",
    "Pr_AsianLanguageAtHome": "Pr_AsianLanguageAtHome_County",
    "Pr_EngLessThanWell": "Pr_EngLessThanWell_County",
    "Hispanic": "Hispanic_County",
}

# Join the county race and language tables on geography id and year.
county_df = race_cnty.merge(lang_cnty, on=["GEO_ID", "year"])

# Pr_AAPI is the sum of Pr_Asian and Pr_Hawaiian; it must be computed before those are dropped.
county_df["Pr_AAPI"] = county_df["Pr_Asian"] + county_df["Pr_Hawaiian"]
county_df = county_df.drop(columns=unused_cols)

column_order = id_cols + [col for col in county_df.columns if col not in id_cols]
county_df = (
    county_df
    .reindex(columns=column_order)
    .assign(
        # Left-pad to five characters so the code lines up with the tract table's CountyFIPS.
        fips=lambda frame: frame["fips"].astype(str).str.zfill(5),
        NonHispanic_County=lambda frame: frame["Population"] - frame["Hispanic"],
    )
    .rename(columns=county_renames)
)

In [6]:
# Build the tract-level table: race and language columns joined on geography id and year.
tract_df = race_tract.merge(lang_tract, on=["GEO_ID", "year"]).drop(columns=["GEO_ID"])

# Missing values in any "Pr_" column are replaced with 0 so the squared-share sums
# computed later never propagate NaN.
share_cols = [name for name in tract_df.columns if name.startswith("Pr_")]
tract_df = tract_df.fillna({name: 0 for name in share_cols})

# CountyFIPS = zero-padded 2-character state code + zero-padded 3-character county code.
state_code = tract_df["state"].astype(str).str.zfill(2)
county_code = tract_df["county"].astype(str).str.zfill(3)

tract_df = (
    tract_df
    .assign(
        CountyFIPS=state_code + county_code,
        NonHispanic=tract_df["Population"] - tract_df["Hispanic"],
    )
    .drop(columns=["state", "county"])
)

In [7]:
# Attach the county-level columns to every tract. The left join keeps all tract rows.
# validate="many_to_one" makes pandas raise MergeError if county_df ever contains more than
# one row per (CountyFIPS, year); without the check such duplicates would silently repeat
# tract rows and inflate every county-level sum computed from merged_df.
merged_df = pd.merge(
    tract_df,
    county_df,
    on=["CountyFIPS", "year"],
    how="left",
    validate="many_to_one",
)
merged_df.columns

In [8]:
# Work on a copy so merged_df stays untouched if this cell is re-run.
df = merged_df.copy()

# Per-tract dissimilarity term: |tract share of the county's Hispanic population
#                                - tract share of the county's non-Hispanic population|.
hispanic_share = df["Hispanic"] / df["Hispanic_County"]
non_hispanic_share = df["NonHispanic"] / df["NonHispanic_County"]
df["Dissimilarity_Hispanic"] = (hispanic_share - non_hispanic_share).abs()

# Herfindahl-style indices: the sum of squared group shares within each tract.
# NOTE(review): the race index uses Pr_Asian only, whereas the county table combines
# Pr_Asian + Pr_Hawaiian into an AAPI share -- confirm whether Pr_Hawaiian should be included here.
race_share_cols = ["Pr_White", "Pr_Black", "Pr_Native", "Pr_Asian", "Pr_Hispanic"]
lang_share_cols = ["Pr_EnglishAtHome", "Pr_SpanishAtHome", "Pr_IndoEuropeanAtHome", "Pr_AsianLanguageAtHome"]

# Built-in sum() adds the squared columns left to right, in the listed order.
df["HHI_Race"] = sum(df[name] ** 2 for name in race_share_cols)
df["HHI_Lang"] = sum(df[name] ** 2 for name in lang_share_cols)

In [9]:
# County-level racial homogeneity = population-weighted mean of the tract-level HHI_Race:
#   sum(HHI_Race * Population) / sum(Population), taken over the tracts of each county.

# Weight each tract's index by its population (stored as a column on df).
df['Weighted_HHI_Race'] = df['Population'] * df['HHI_Race']

# Per-county totals of the weighted index and of the population.
race_county_sums = df.groupby('CountyFIPS')[['Weighted_HHI_Race', 'Population']].sum()

# Divide the totals and return CountyFIPS as an ordinary column next to HHI_Race.
race_homophily = (
    race_county_sums['Weighted_HHI_Race']
    .div(race_county_sums['Population'])
    .rename('HHI_Race')
    .reset_index()
)

race_homophily

Unnamed: 0,CountyFIPS,HHI_Race
0,01001,0.620392
1,01003,0.719420
2,01005,0.459609
3,01007,0.697653
4,01009,0.777751
...,...,...
3138,56037,0.651999
3139,56039,0.704397
3140,56041,0.775329
3141,56043,0.702440


In [10]:
# County-level language homogeneity = population-weighted mean of the tract-level HHI_Lang:
#   sum(HHI_Lang * Population) / sum(Population), taken over the tracts of each county.

# Weight each tract's index by its population (stored as a column on df).
df['Weighted_HHI_Lang'] = df['Population'] * df['HHI_Lang']

# Per-county totals of the weighted index and of the population.
lang_county_sums = df.groupby('CountyFIPS')[['Weighted_HHI_Lang', 'Population']].sum()

# Divide the totals and return CountyFIPS as an ordinary column next to HHI_Lang.
lang_homophily = (
    lang_county_sums['Weighted_HHI_Lang']
    .div(lang_county_sums['Population'])
    .rename('HHI_Lang')
    .reset_index()
)

lang_homophily

Unnamed: 0,CountyFIPS,HHI_Lang
0,01001,0.928612
1,01003,0.907124
2,01005,0.870755
3,01007,0.955493
4,01009,0.866312
...,...,...
3138,56037,0.812266
3139,56039,0.700498
3140,56041,0.899716
3141,56043,0.870193


In [11]:
# County-level Hispanic / non-Hispanic dissimilarity index.
# Add up the per-tract absolute share differences within each county, then multiply by 0.5
# so the index is normalized between 0 and 1, with higher values indicating greater
# dissimilarity between the two groups across the county's tracts.
hispanic_dissimilarity = (
    df.groupby('CountyFIPS')['Dissimilarity_Hispanic']
    .sum()
    .mul(0.5)
    .reset_index()  # make CountyFIPS a regular column again
)

hispanic_dissimilarity

Unnamed: 0,CountyFIPS,Dissimilarity_Hispanic
0,01001,0.390292
1,01003,0.397320
2,01005,0.507780
3,01007,0.374769
4,01009,0.418437
...,...,...
3138,56037,0.136081
3139,56039,0.309449
3140,56041,0.203543
3141,56043,0.283535


In [12]:
def _merge_on_county(left, right):
    """Inner-join two county-level frames on their shared CountyFIPS column."""
    return pd.merge(left, right, on=['CountyFIPS'], how='inner')


# Combine the three per-county index tables into one frame with one row per county.
# The joins are inner joins, so only counties present in all three tables are kept.
# `reduce` comes from the notebook's top import cell.
frames = [race_homophily, lang_homophily, hispanic_dissimilarity]
homophily = reduce(_merge_on_county, frames)
homophily

Unnamed: 0,CountyFIPS,HHI_Race,HHI_Lang,Dissimilarity_Hispanic
0,01001,0.620392,0.928612,0.390292
1,01003,0.719420,0.907124,0.397320
2,01005,0.459609,0.870755,0.507780
3,01007,0.697653,0.955493,0.374769
4,01009,0.777751,0.866312,0.418437
...,...,...,...,...
3138,56037,0.651999,0.812266,0.136081
3139,56039,0.704397,0.700498,0.309449
3140,56041,0.775329,0.899716,0.203543
3141,56043,0.702440,0.870193,0.283535


In [13]:
# Write the combined county-level indices to a CSV in the current working directory;
# index=False leaves the row index out of the file.
output_path = "homophily.csv"
homophily.to_csv(output_path, index=False)