In [29]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Load the combined dataset and keep only the 2014-2018 rows; these are the
# observations used to select predictors of the `gentrified_2019` label.
final_df = pd.read_csv("all_combined_data.csv")
final_df = final_df[final_df["year"].between(2014, 2018)]

# Clean once, up front: the placeholder strings "-" and "(X)", real missing
# values and +/-inf are all mapped to 0.  X and y are sliced from this frame
# afterwards, so they need no further cleaning.  (The previous per-X / per-y
# `dropna()` calls discarded their result and could never drop anything after
# `fillna(0)` anyway; dropping rows from X and y independently would also
# have misaligned them.)
final_df = (
    final_df
    .replace(["-", "(X)"], pd.NA)
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

# Define Features (X) and Target (y)
# Any column whose name contains one of these markers is an identifier, a
# year field or a gentrification label, not a predictor.
non_feature_markers = ("gentrified", "NAMELSAD_COUNTY_20", "ZCTA", "year", "NAME", "Name")
non_feature_cols = [
    col for col in final_df.columns
    if any(marker in col for marker in non_feature_markers)
]
X = final_df.drop(columns=non_feature_cols)
y = final_df["gentrified_2019"]  # Target: gentrification status for 2019

# Initialize Model
model = RandomForestClassifier(random_state=42)

# Recursive Feature Elimination with Cross-Validation (RFECV)
# NOTE(review): accuracy can look excellent on a heavily imbalanced target
# even when the minority class is never predicted -- consider a scorer such
# as "f1" or "balanced_accuracy" if `gentrified_2019` is rare. TODO confirm.
min_features = 5
rfecv = RFECV(
    estimator=model,
    step=1,
    cv=StratifiedKFold(5),
    scoring="accuracy",
    min_features_to_select=min_features,
)
rfecv.fit(X, y)

# Optimal Number of Features
optimal_features = rfecv.n_features_
print("Optimal Number of Features:", optimal_features)

# Selected Features
selected_features = X.columns[rfecv.support_].tolist()
print("Selected Features:", selected_features)

# Plot RFECV Results
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# per-subset mean CV score now lives in `cv_results_`.  Fall back to the old
# attribute only on releases that predate `cv_results_`.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = np.asarray(rfecv.cv_results_["mean_test_score"])
else:
    mean_cv_scores = np.asarray(rfecv.grid_scores_)

# With step=1 the i-th score belongs to a subset of `min_features + i`
# features, so the x-axis must start at `min_features`, not at 1.
n_features_axis = np.arange(min_features, min_features + len(mean_cv_scores))

plt.figure(figsize=(12, 6))
plt.title("RFECV: Model Performance vs Number of Features")
plt.xlabel("Number of Features")
plt.ylabel("Cross-Validation Accuracy")
plt.plot(n_features_axis, mean_cv_scores)
plt.show()




In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Reload the full dataset (all years) and map placeholder strings and missing
# values to 0.  "(X)" is handled here as well so the cleaning matches the
# RFECV feature-selection cell.
final_df = pd.read_csv("all_combined_data.csv")
final_df = final_df.replace(["-", "(X)"], pd.NA).fillna(0)

# Predictor columns the model is trained on.
selected_features = ["DP02_0004PE", "DP02_0006PE", "DP02_0008PE", "DP02_0064PE", "DP03_0062E", "DP03_0004PE", "DP03_0027PE"]

# Step 1: Reshape Data to Have One Target Column
# Convert the gentrified_2016 ... gentrified_2023 columns into rows, so each
# original (ZCTA, year) row appears once per gentrification year.
gentrification_years = [f"gentrified_{year}" for year in range(2016, 2024)]

df_melted = final_df.melt(
    id_vars=["ZCTA", "year"] + selected_features,
    value_vars=gentrification_years,
    var_name="gentrification_year",
    value_name="gentrified_status",
)

# Convert "gentrification_year" to an integer (extract the year from the column name).
# The pattern is a raw string: "\d" in a normal string literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
df_melted["gentrification_year"] = (
    df_melted["gentrification_year"].str.extract(r"(\d+)", expand=False).astype(int)
)

# Step 2: Split into Training (2016-2020) and Testing (2021-2023)
# NOTE(review): the split is on the *label* year only.  Because the melt
# repeats every feature row once per label year, the same feature rows occur
# in both the training and the test set -- verify this is the intended
# evaluation design.
train_df = df_melted[df_melted["gentrification_year"].between(2016, 2020)]
test_df = df_melted[df_melted["gentrification_year"].between(2021, 2023)]

# Define Features (X) and Target (y)
X_train = train_df[selected_features]
y_train = train_df["gentrified_status"]
X_test = test_df[selected_features]
y_test = test_df["gentrified_status"]

# Step 3: Train Random Forest Model
# (RandomForestClassifier and the metric helpers are imported at the top of
# this cell; the repeated mid-cell imports were redundant.)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 4: Make Predictions
y_pred = rf_model.predict(X_test)

# Step 5: Evaluate Model Performance
# NOTE(review): the target is extremely imbalanced, so accuracy alone is not
# informative -- read the per-class precision/recall below.  If the positive
# class is never predicted, consider `class_weight="balanced"` or resampling.
print("Model Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting UndefinedMetricWarning; the reported numbers are unchanged.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


  interactivity=interactivity, compiler=compiler, result=result)


Model Accuracy: 0.9953081655919835


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
               precision    recall  f1-score   support

       False       1.00      1.00      1.00   1293819
        True       0.00      0.00      0.00      6099

    accuracy                           1.00   1299918
   macro avg       0.50      0.50      0.50   1299918
weighted avg       0.99      1.00      0.99   1299918



  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
import pandas as pd

# Step 1: Prepare Future Data Using a 5-Year Rolling Window
# For each target year, average every ZCTA's rows from the preceding five
# calendar years and let the fitted model predict on those averages.
# NOTE(review): where the data ends before `train_end`, a window holds fewer
# than five observed years -- confirm that is acceptable for later targets.
window_years = 5

# Use the column list stored on the fitted model when it is available, so the
# prediction columns always match what `rf_model` was trained on.  The global
# `selected_features` is also assigned by the RFECV cell and may hold a
# different list if the cells were executed out of order.
feature_cols = list(getattr(rf_model, "feature_names_in_", selected_features))

# Coerce to numeric once, outside the loop (non-numeric entries become NaN).
df_numeric = df_melted.apply(pd.to_numeric, errors="coerce")

future_predictions = []

for target_year in range(2023, 2029):  # Predict for 2023-2028
    train_start = target_year - window_years  # Use past 5 years of data
    train_end = target_year - 1

    # Get data for the past 5 years (both bounds inclusive)
    past_5_years_df = df_numeric[df_numeric["year"].between(train_start, train_end)]

    # Aggregate features: Take means over past 5 years
    aggregated_features = past_5_years_df.groupby("ZCTA", as_index=False).mean()

    # The model cannot predict on zero rows; skip windows without any data.
    if aggregated_features.empty:
        print(f"No data for {train_start}-{train_end}; skipping target year {target_year}")
        continue

    # Store the target year
    aggregated_features["year"] = target_year

    # Predict Gentrification
    # Fill any NaN left by the numeric coercion with 0, matching the
    # `fillna(0)` applied to the training data.
    X_future = aggregated_features[feature_cols].fillna(0)
    aggregated_features["predicted_gentrification"] = rf_model.predict(X_future)

    # Store results
    future_predictions.append(aggregated_features)

if not future_predictions:
    raise ValueError("No rolling window contained any data; nothing to predict.")

# Step 2: Combine All Predictions
future_predictions_df = pd.concat(future_predictions, ignore_index=True)

# Step 3: Save Predictions
# One file with every prediction, one with only the rows predicted as gentrifying.
future_predictions_df.to_csv("gentrification_predictions_2023_2028.csv", index=False)
gentrifying_cities = future_predictions_df[future_predictions_df["predicted_gentrification"].eq(True)]
gentrifying_cities.to_csv("gentrifying_cities_by_2028.csv", index=False)
print("Predictions saved: gentrification_predictions_2023_2028.csv")


Predictions saved: gentrification_predictions_2023_2028.csv
