In [None]:
import os
import pandas as pd

# Define folder paths for ACS data (DP03 holds the income table, DP02 the education table).
folder_paths = ["data/DP03", "data/DP02"]
INCOME_DIR, EDUCATION_DIR = folder_paths

# Tunable parameters of the gentrification definition.
ROLLING_WINDOW = 5          # number of consecutive yearly rows averaged together
TOP_QUANTILE = 0.8          # "top 20%" cut-off for home price / income measures
COLLEGE_GROWTH_MIN = 0.10   # minimum year-over-year change in the college-educated average
MIN_CONDITIONS = 2          # a row is flagged when at least this many criteria hold


def load_acs_column(csv_path, source_column, target_column):
    """Read one ACS CSV and return a two-column frame: ``ZCTA`` and ``target_column``.

    ``GEO_ID`` is renamed to ``ZCTA`` and cast to ``str`` so it can be used as a
    join key; ``source_column`` is renamed to ``target_column``.
    """
    acs_df = pd.read_csv(csv_path)
    acs_df = acs_df.rename(columns={"GEO_ID": "ZCTA", source_column: target_column})
    acs_df["ZCTA"] = acs_df["ZCTA"].astype(str)
    return acs_df[["ZCTA", target_column]]


def rolling_mean(series):
    """Trailing mean over ``ROLLING_WINDOW`` rows; shorter windows are allowed at the start."""
    return series.rolling(ROLLING_WINDOW, min_periods=1).mean()


# The Zillow table does not depend on the ACS file being processed, so load it once.
pgd_df = pd.read_csv("data/zillow_with_county.csv")
pgd_df = pgd_df.rename(columns={"zcta": "ZCTA"})
pgd_df["ZCTA"] = pgd_df["ZCTA"].astype(str)

# One merged frame (Zillow + income + education) per ACS year.
all_years_data = []

# Sorted so the processing order does not depend on the file system.
for file in sorted(os.listdir(INCOME_DIR)):
    # Skip anything that is not a CSV *before* touching per-year state; otherwise a
    # stray file would append the previous year's frame again (or fail on the first pass).
    if not file.endswith(".csv"):
        continue

    year = int(file.split(".")[0][-4:])  # last four characters of the file stem

    acs_income = load_acs_column(
        os.path.join(INCOME_DIR, file),
        "DP03_0062E",
        "median_household_income",
    )
    # The education file carries the same name with "DP03" swapped for "DP02".
    acs_education = load_acs_column(
        os.path.join(EDUCATION_DIR, file.replace("DP03", "DP02")),
        "DP02_0064PE",
        "college_educated_population",
    )

    merged_df = (
        pgd_df[pgd_df["year"] == year]
        .merge(acs_income, on="ZCTA", how="left")
        .merge(acs_education, on="ZCTA", how="left")
    )
    all_years_data.append(merged_df)

# Concatenate all years into a single DataFrame (fresh index: each per-year frame starts at 0).
final_df = pd.concat(all_years_data, ignore_index=True)

# Rows must be ordered by ZCTA and year before computing rolling averages / pct_change.
final_df = final_df.sort_values(by=["ZCTA", "year"]).reset_index(drop=True)
print(final_df.columns.to_list())

# "-" entries are treated as missing; anything else non-numeric is coerced to NaN.
final_df = final_df.replace("-", pd.NA)
value_columns = ["median_home_value", "median_household_income", "college_educated_population"]
final_df[value_columns] = final_df[value_columns].apply(pd.to_numeric, errors="coerce")

# Per-ZCTA rolling averages over consecutive yearly rows.
# NOTE: the window counts rows, not calendar years; gaps in a ZCTA's series widen the span.
zcta_rolling_columns = [
    ("median_home_value", "home_price_5yr_avg"),
    ("median_household_income", "income_5yr_avg"),
    ("college_educated_population", "college_edu_5yr_avg"),
]
for source_column, target_column in zcta_rolling_columns:
    final_df[target_column] = final_df.groupby("ZCTA")[source_column].transform(rolling_mean)

# County-level rolling averages for comparison.
# Grouping by (county, year) and rolling inside that group would average neighbouring
# ZCTAs of a single year. Instead: take the county mean per year (unweighted mean of
# its ZCTA rows), roll that over the county's years, then attach it to every row.
county_keys = ["NAMELSAD_COUNTY_20", "year"]
county_yearly = (
    final_df.groupby(county_keys)[["median_home_value", "median_household_income"]]
    .mean()
    .reset_index()
    .sort_values(county_keys)
)
county_rolling_columns = [
    ("median_home_value", "county_home_price_5yr_avg"),
    ("median_household_income", "county_income_5yr_avg"),
]
for source_column, target_column in county_rolling_columns:
    county_yearly[target_column] = (
        county_yearly.groupby("NAMELSAD_COUNTY_20")[source_column].transform(rolling_mean)
    )
# A left merge keeps the ZCTA/year row order established above.
final_df = final_df.merge(
    county_yearly[county_keys + [target for _, target in county_rolling_columns]],
    on=county_keys,
    how="left",
)

# Relative difference between each ZCTA's rolling average and its county's rolling average.
final_df["home_price_growth"] = (final_df["home_price_5yr_avg"] - final_df["county_home_price_5yr_avg"]) / final_df["county_home_price_5yr_avg"]
final_df["income_growth"] = (final_df["income_5yr_avg"] - final_df["county_income_5yr_avg"]) / final_df["county_income_5yr_avg"]
# Row-over-row change of the college-educated rolling average within each ZCTA.
final_df["college_edu_growth"] = final_df.groupby("ZCTA")["college_edu_5yr_avg"].pct_change()

# Identify top 20% threshold for home price & income growth
home_price_threshold = final_df["home_price_growth"].quantile(TOP_QUANTILE)
income_threshold = final_df["income_growth"].quantile(TOP_QUANTILE)

# Gentrification criteria (meeting at least 2 out of 3 conditions).
# Comparisons against NaN are False, so rows with missing measures count as not meeting them.
conditions_met = (
    (final_df["home_price_growth"] > home_price_threshold).astype(int)
    + (final_df["income_growth"] > income_threshold).astype(int)
    + (final_df["college_edu_growth"] >= COLLEGE_GROWTH_MIN).astype(int)
)
final_df["gentrified"] = conditions_met >= MIN_CONDITIONS

# Create a binary column for each year (2016-2023) indicating gentrification
for year in range(2016, 2024):
    final_df[f"gentrified_{year}"] = (final_df["year"] == year) & (final_df["gentrified"])

# Save the processed dataset
final_df.to_csv("processed_gentrification_data.csv", index=False)

print("Gentrification analysis completed. File saved as 'processed_gentrification_data.csv'.")



  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interacti

<bound method IndexOpsMixin.tolist of Index(['year', 'ZCTA', 'median_home_value', 'NAMELSAD_COUNTY_20',
       'median_household_income', 'college_educated_population'],
      dtype='object')>
Gentrification analysis completed. File saved as 'processed_gentrification_data.csv'.


In [25]:
# Merge all datasets

import os
import pandas as pd

# Define folder paths for ACS data (first entry drives the file listing).
folder_paths = ["data/DP03", "data/DP02"]
dp03_dir, dp02_dir = folder_paths


def read_acs_table(csv_path):
    """Read a full ACS CSV, exposing ``GEO_ID`` as a string ``ZCTA`` join key."""
    acs_table = pd.read_csv(csv_path)
    acs_table = acs_table.rename(columns={"GEO_ID": "ZCTA"})
    acs_table["ZCTA"] = acs_table["ZCTA"].astype(str)
    return acs_table


# Output of the gentrification cell; one row per ZCTA and year.
pgd_df = pd.read_csv("processed_gentrification_data.csv")
pgd_df["ZCTA"] = pgd_df["ZCTA"].astype(str)

all_years_data = []

# Sorted so the row order of the saved CSV does not depend on the file system.
for file in sorted(os.listdir(dp03_dir)):
    # Skip non-CSV entries up front; otherwise the previous year's frame would be
    # appended a second time (or the first pass would fail on an undefined name).
    if not file.endswith(".csv"):
        continue

    year = int(file.split(".")[0][-4:])  # last four characters of the file stem

    dp03_df = read_acs_table(os.path.join(dp03_dir, file))
    # The DP02 file carries the same name with "DP03" swapped for "DP02".
    dp02_df = read_acs_table(os.path.join(dp02_dir, file.replace("DP03", "DP02")))

    # Attach every DP03 column, then every DP02 column, to that year's rows.
    # Columns shared by the inputs (other than ZCTA) receive pandas' _x/_y suffixes.
    merged_df = (
        pgd_df[pgd_df["year"] == year]
        .merge(dp03_df, on="ZCTA", how="left")
        .merge(dp02_df, on="ZCTA", how="left")
    )
    all_years_data.append(merged_df)

# Fresh index: each per-year frame starts at 0, which would otherwise repeat labels.
final_df = pd.concat(all_years_data, ignore_index=True)
final_df.to_csv("all_combined_data.csv", index=False)

print("Merging complete.")



  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interacti

Merging complete.
