## Title and Description

## Predicting Urban Gentrification Through Infrastructure and Socioeconomic Indicators


This research analyzes how public infrastructure, socioeconomic factors, and voting patterns can predict gentrification trends in urban areas. Using a data-driven approach, we combine the data sources below to pursue the research goals that follow.

**Key Components:**
- Public infrastructure data from OpenStreetMap (OSM)
- Socioeconomic indicators from census data 
- Housing market dynamics from price paid data
- Political trends from election results

**Research Goals:**
1. Establish a composite gentrification metric based on:
   - Educational attainment changes
   - Population turnover rates
   - Index of Multiple Deprivation (IMD) shifts
   - Demographic transitions
   - Housing price acceleration

2. Analyze correlations between:
   - Public amenities (the "Starbucks effect")
   - Transportation access
   - Social indicators
   - Political voting patterns
   - Gentrification outcomes

3. Build predictive models to:
   - Identify areas at risk of future gentrification
   - Quantify infrastructure impact on neighborhood change
   - Map potential demographic transitions

4. Provide insights for:
   - Urban planning policy
   - Community investment strategies
   - Housing equity considerations

In [None]:
import pandas as pd
import osmnx as ox
import shapely as shp
import numpy as np
import os
import requests
import fynesse
import geopandas as gpd
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile
import MySQLdb
import sklearn
import multiprocessing as mp
import re

# set up database connection

%load_ext sql

with open("./credentials1.yaml") as file:
  credentials = yaml.safe_load(file)

username = credentials["username"]
password = credentials["password"]
url = credentials["url"]
port = credentials["port"]

%config SqlMagic.style = '_DEPRECATED_DEFAULT'


connection_string = f"mysql+pymysql://{username}:{password}@{url}:{port}/ads_2024?local_infile=1"
%sql $connection_string
%sql use ads_2024;

conn = MySQLdb.connect(host=url, user=username, password=password, database="ads_2024", local_infile=True)

# download data
# everything is on the scale of lsoas for data availability
#
# Each entry is either a bare URL (saved under its last path segment) or a
# (filename, url) pair for URLs that do not end in a usable file name.
# Files already on disk are not downloaded again; .zip archives are extracted
# into the working directory unless a path with the archive's name minus
# ".zip" already exists.

for source in [
    ("lsoas.geojson", "https://open-geography-portalx-ons.hub.arcgis.com/api/download/v1/items/68515293204e43ca8ab56fa13ae8a547/geojson?layers=0"),

]:

    # Use dedicated names here: `url` holds the database host read from the
    # credentials file and must not be overwritten by the loop.
    if isinstance(source, tuple):
        filename, download_url = source
    else:
        download_url = source
        filename = f"./{download_url.split('/')[-1]}"

    if not os.path.exists(filename):
        print(f"Downloading {download_url}")
        response = requests.get(download_url)
        # Fail before writing anything: an HTTP error body saved under
        # `filename` would be treated as a valid cached download on every
        # later run.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {filename}")
    else:
        print(f"Already downloaded {filename}")


    if filename.endswith('.zip') and not os.path.exists(filename.replace('.zip', '')):
        with ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall()

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
0 rows affected.
Already downloaded lsoas.geojson


In [20]:
%%sql
DROP TABLE IF EXISTS lsoas;
CREATE TABLE IF NOT EXISTS lsoas (
    -- Year and LSOA identity
    year          INT           NOT NULL,
    code          VARCHAR(9)    NOT NULL,
    name          VARCHAR(255)  NOT NULL,

    -- Position on the British National Grid
    bng_easting   INT           NOT NULL,
    bng_northing  INT           NOT NULL,

    -- Position as latitude and longitude
    latitude      DECIMAL(10,8) NOT NULL,
    longitude     DECIMAL(11,8) NOT NULL,

    -- Unique identifier
    global_id     VARCHAR(36)   NOT NULL,

    -- Geometry of the area in WGS84
    geometry      GEOMETRY      NOT NULL,

    -- One row per LSOA code per year
    PRIMARY KEY (year, code)
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1;

 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
0 rows affected.
0 rows affected.


[]

In [23]:
if pd.read_sql("SELECT * from lsoas limit 1", conn).empty:

    if not os.path.exists("lsoas.csv"):
        gdf = gpd.read_file("lsoas.geojson")
        gdf.geometry.set_crs(epsg=27700, inplace=True)
        gdf.geometry = gdf.geometry.to_crs(epsg=4326)
        gdf.to_csv("lsoas.csv", sep="|", index=False)

    command = """
    LOAD DATA LOCAL INFILE 'lsoas.csv'\
    INTO TABLE lsoas\
    FIELDS TERMINATED BY '|'\
    LINES TERMINATED BY '\n'\
    IGNORE 1 LINES\
    (@fid, code, name, @welsh, bng_easting, bng_northing, latitude, longitude, global_id, @geometry)\
    SET year = 2021, geometry = ST_GeomFromText(@geometry);"""

    %sql $command
    


  if pd.read_sql("SELECT * from lsoas limit 1", conn).empty:


 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
35672 rows affected.
