## Title and Description

## Predicting Urban Gentrification Through Infrastructure and Socioeconomic Indicators


This research analyzes how public infrastructure, socioeconomic factors, and voting patterns can predict gentrification trends in urban areas. Using a data-driven approach, we combine the data sources listed under **Key Components** to pursue the research goals that follow.

**Key Components:**
- Public infrastructure data from OpenStreetMap (OSM)
- Socioeconomic indicators from census data 
- Housing market dynamics from price paid data
- Political trends from election results

**Research Goals:**
1. Establish a composite gentrification metric based on:
   - Educational attainment changes
   - Population turnover rates
   - Index of Multiple Deprivation (IMD) shifts
   - Demographic transitions
   - Housing price acceleration

2. Analyze correlations between:
   - Public amenities (the "Starbucks effect")
   - Transportation access
   - Social indicators
   - Political voting patterns
   - Gentrification outcomes

3. Build predictive models to:
   - Identify areas at risk of future gentrification
   - Quantify infrastructure impact on neighborhood change
   - Map potential demographic transitions

4. Provide insights for:
   - Urban planning policy
   - Community investment strategies
   - Housing equity considerations

In [None]:
import pandas as pd
import osmnx as ox
import shapely as shp
import numpy as np
import os
import requests
import fynesse
import geopandas as gpd
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile
import MySQLdb
import sklearn
import multiprocessing as mp
import re

# set up database connection

%load_ext sql

with open("./credentials1.yaml") as file:
  credentials = yaml.safe_load(file)

username = credentials["username"]
password = credentials["password"]
url = credentials["url"]
port = credentials["port"]

%config SqlMagic.style = '_DEPRECATED_DEFAULT'


connection_string = f"mysql+pymysql://{username}:{password}@{url}:{port}/ads_2024?local_infile=1"
%sql $connection_string
%sql use ads_2024;

conn = MySQLdb.connect(host=url, user=username, password=password, database="ads_2024", local_infile=True)

# Download the raw data files (skipping any that are already on disk).
# Everything is analysed at the scale of LSOAs because of data availability.
#
# Each entry is either a (local filename, URL) tuple or a bare URL, in which
# case the last path component of the URL is used as the local filename.
#
# The loop variable is deliberately NOT named `url`: that name holds the
# database host read from the credentials file and must not be overwritten.
for download in [
    ("lsoas.geojson", "https://open-geography-portalx-ons.hub.arcgis.com/api/download/v1/items/68515293204e43ca8ab56fa13ae8a547/geojson?layers=0"),
    "https://www.getthedata.com/downloads/open_postcode_geo.csv.zip",
]:

    if isinstance(download, tuple):
        filename, download_url = download
    else:
        download_url = download
        filename = f"./{download_url.split('/')[-1]}"

    if not os.path.exists(filename):
        print(f"Downloading {download_url}")
        r = requests.get(download_url)
        # Fail loudly instead of saving an HTTP error page as if it were data;
        # a bad file on disk would otherwise be treated as "already downloaded".
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
        print(f"Downloaded {filename}")
    else:
        print(f"Already downloaded {filename}")


    # Unpack archives once: skipped when a file named like the archive minus
    # its '.zip' suffix already exists.
    if filename.endswith('.zip') and not os.path.exists(filename.replace('.zip', '')):
        with ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall()

 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
0 rows affected.
Already downloaded lsoas.geojson


In [3]:
%%sql
CREATE TABLE IF NOT EXISTS lsoas (
    -- One row per LSOA boundary, keyed by (year, code).
    -- `year` is not read from the CSV; the loader cell sets it explicitly.
    year INT NOT NULL,
    code VARCHAR(9) NOT NULL,
    name VARCHAR(255) NOT NULL,

    -- Geographic coordinates
    bng_easting INT NOT NULL,              -- British National Grid Easting
    bng_northing INT NOT NULL,             -- British National Grid Northing
    latitude DECIMAL(10,8) NOT NULL,       -- Latitude coordinate
    longitude DECIMAL(11,8) NOT NULL,      -- Longitude coordinate

    -- Unique identifier
    global_id VARCHAR(36) NOT NULL,

    -- Geometry
    geometry GEOMETRY NOT NULL,            -- Geometry of the LSOA in WGS84 (EPSG:4326), parsed from WKT by the loader
    
    -- Constraints
    PRIMARY KEY (year, code)
    -- NOTE(review): AUTO_INCREMENT=1 below has no visible effect, since no column
    -- in this table is declared AUTO_INCREMENT.
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1;

 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
0 rows affected.


[]

In [4]:
if pd.read_sql("SELECT * from lsoas limit 1", conn).empty:
    if not os.path.exists("lsoas.csv"):
        gdf = gpd.read_file("lsoas.geojson")
        gdf.geometry.set_crs(epsg=27700, inplace=True)
        gdf.geometry = gdf.geometry.to_crs(epsg=4326)
        gdf.to_csv("lsoas.csv", sep="|", index=False)

    command = """
    LOAD DATA LOCAL INFILE 'lsoas.csv'\
    INTO TABLE lsoas\
    FIELDS TERMINATED BY '|'\
    LINES TERMINATED BY '\n'\
    IGNORE 1 LINES\
    (@fid, code, name, @welsh, bng_easting, bng_northing, latitude, longitude, global_id, @geometry)\
    SET year = 2021, geometry = ST_GeomFromText(@geometry);"""

    %sql $command
    


  if pd.read_sql("SELECT * from lsoas limit 1", conn).empty:


In [None]:
# Download the data through the project's fynesse access module.
# NOTE(review): presumably this fetches the yearly price-paid CSV parts
# (./pp-<year>-part<n>.csv) that the pp_data upload cell reads — confirm
# against fynesse.access.
fynesse.access.data()

In [None]:
%%sql
CREATE TABLE IF NOT EXISTS `pp_data` (
  -- One row per price-paid transaction, in the column order of the CSV files
  -- loaded by the upload cell; `db_id` is a surrogate key generated on insert.
  `transaction_unique_identifier` tinytext COLLATE utf8_bin NOT NULL,
  `price` int(10) unsigned NOT NULL,
  `date_of_transfer` date NOT NULL,
  `postcode` varchar(8) COLLATE utf8_bin NOT NULL,
  `property_type` varchar(1) COLLATE utf8_bin NOT NULL,
  `new_build_flag` varchar(1) COLLATE utf8_bin NOT NULL,
  `tenure_type` varchar(1) COLLATE utf8_bin NOT NULL,
  `primary_addressable_object_name` tinytext COLLATE utf8_bin NOT NULL,
  `secondary_addressable_object_name` tinytext COLLATE utf8_bin NOT NULL,
  `street` tinytext COLLATE utf8_bin NOT NULL,
  `locality` tinytext COLLATE utf8_bin NOT NULL,
  `town_city` tinytext COLLATE utf8_bin NOT NULL,
  `district` tinytext COLLATE utf8_bin NOT NULL,
  `county` tinytext COLLATE utf8_bin NOT NULL,
  `ppd_category_type` varchar(2) COLLATE utf8_bin NOT NULL,
  `record_status` varchar(2) COLLATE utf8_bin NOT NULL,
  `db_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,

  -- Constraints
  -- (the comma after the db_id definition above is required; without it the
  -- statement is a syntax error)
  PRIMARY KEY (`db_id`)
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1 ;

In [None]:
# WARNING: This code will take a long time to finish (i.e., more than 30 minutes) given our dataset's size. The print informs the uploading progress by year.
if pd.read_sql("SELECT * from pp_data limit 1", conn).empty:
    for year in range(1996,2025):
        print ("Uploading data for year: " + str(year))
        for part in range(1,3):
            file_name = "./pp-" + str(year) + "-part" + str(part) + ".csv"
            %sql LOAD DATA LOCAL INFILE '{file_name}' INTO TABLE pp_data FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED by '"' LINES STARTING BY '' TERMINATED BY '\n';

In [None]:
%%sql
CREATE TABLE IF NOT EXISTS `postcode_data` (
  -- One row per postcode, in the column order of open_postcode_geo.csv;
  -- `db_id` is a surrogate key generated on insert.
  `postcode` varchar(8) COLLATE utf8_bin NOT NULL,
  `status` enum('live','terminated') NOT NULL,
  `usertype` enum('small', 'large') NOT NULL,
  `easting` int unsigned,
  `northing` int unsigned,
  `positional_quality_indicator` int NOT NULL,
  `country` enum('England', 'Wales', 'Scotland', 'Northern Ireland', 'Channel Islands', 'Isle of Man') NOT NULL,
  -- Precisions match the lsoas table: latitude needs two integer digits (+/-90),
  -- longitude needs three (+/-180). They were previously the other way round.
  `latitude` decimal(10,8) NOT NULL,
  `longitude` decimal(11,8) NOT NULL,
  `postcode_no_space` tinytext COLLATE utf8_bin NOT NULL,
  `postcode_fixed_width_seven` varchar(7) COLLATE utf8_bin NOT NULL,
  `postcode_fixed_width_eight` varchar(8) COLLATE utf8_bin NOT NULL,
  `postcode_area` varchar(2) COLLATE utf8_bin NOT NULL,
  `postcode_district` varchar(4) COLLATE utf8_bin NOT NULL,
  `postcode_sector` varchar(6) COLLATE utf8_bin NOT NULL,
  `outcode` varchar(4) COLLATE utf8_bin NOT NULL,
  `incode` varchar(3)  COLLATE utf8_bin NOT NULL,
  `db_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,

  -- Constraints
  PRIMARY KEY (`db_id`)
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;

 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
5 rows affected.


Tables_in_ads_2024
census_nssec
hours_worked
lsoas
oas
osm_features


In [None]:
# Bulk-load the postcode CSV into postcode_data once; skipped when the table
# already has rows. The CSV is the file unpacked from open_postcode_geo.csv.zip
# by the download cell.
# NOTE(review): there is no IGNORE clause, so the first line of the file is
# loaded as data — assumes the CSV has no header row; confirm.
if pd.read_sql("SELECT * from postcode_data limit 1", conn).empty:
    %sql LOAD DATA LOCAL INFILE "./open_postcode_geo.csv" INTO TABLE `postcode_data` FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED by '"' LINES TERMINATED BY '\n';

In [None]:
%%sql
ALTER TABLE pp_data ADD INDEX idx_postcode (postcode), ADD INDEX idx_date_of_transfer (date_of_transfer);
-- The statement above uses ALTER TABLE: MySQL has no MODIFY TABLE statement.
-- Index the postcode column on both tables so they can be joined on it efficiently.
-- NOTE(review): neither statement is idempotent; re-running this cell fails
-- with a duplicate key name once the indexes exist.
CREATE INDEX idx_postcode_data_postcode ON postcode_data(postcode);