## Title and Description

## Predicting Urban Gentrification Through Infrastructure and Socioeconomic Indicators


This research analyzes how public infrastructure, socioeconomic factors, and voting patterns can predict gentrification trends in urban areas. Using a data-driven approach, we'll examine:

**Key Components:**
- Public infrastructure data from OpenStreetMap (OSM)
- Socioeconomic indicators from census data 
- Housing market dynamics from price paid data
- Political trends from election results

**Research Goals:**
1. Establish a composite gentrification metric based on:
   - Educational attainment changes
   - Population turnover rates
   - Index of Multiple Deprivation (IMD) shifts
   - Demographic transitions
   - Housing price acceleration

2. Analyze correlations between:
   - Public amenities (the "Starbucks effect")
   - Transportation access
   - Social indicators
   - Political voting patterns
   - Gentrification outcomes

3. Build predictive models to:
   - Identify areas at risk of future gentrification
   - Quantify infrastructure impact on neighborhood change
   - Map potential demographic transitions

4. Provide insights for:
   - Urban planning policy
   - Community investment strategies
   - Housing equity considerations

Data sources you may want to draw on include:

  - UK Census Data
    - UK Census 2021 data [here](https://www.ons.gov.uk/search?topics=9731,6646,3845,9497,4262,4128,7755,4994,6885,9724,7367&filter=datasets) or [here](https://www.nomisweb.co.uk/sources/census_2021_bulk).
    - Historical Census data [here](https://www.ons.gov.uk/census/historiccensusdata).
    - Geographic coordinates of Census Output Areas [here](https://www.data.gov.uk/dataset/4d4e021d-fe98-4a0e-88e2-3ead84538537/output-areas-december-2021-boundaries-ew-bgc-v2).
  - Spatial data
    - You should already have a connection to OpenStreetMap.
    - You might want to also download the entire map for England (or more) [here](https://download.openstreetmap.fr/extracts/) or [here](https://wiki.openstreetmap.org/wiki/Planet.osm).
  - Election data
    - Recent Election Results data [here](https://commonslibrary.parliament.uk/research-briefings/cbp-10009/)
    - Historical Election Results data [here](https://commonslibrary.parliament.uk/research-briefings/cbp-8647/#fullreport).
    - Lookup reference between Output Areas and Parliamentary Constituencies [here](https://geoportal.statistics.gov.uk/datasets/5968b5b2c0f14dd29ba277beaae6dec3_0/explore).
  - Price Paid Data
    - You should have this data already in your database.
  - OSM
    - You should know how to access this data from previous practicals.

In [None]:
import pandas as pd
import osmnx as ox
import shapely as shp
import numpy as np
import os
import requests
import fynesse
import geopandas as gpd
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile
import MySQLdb
import sklearn
import multiprocessing as mp
import re

# set up database connection

%load_ext sql

with open("./credentials1.yaml") as file:
  credentials = yaml.safe_load(file)

username = credentials["username"]
password = credentials["password"]
url = credentials["url"]
port = credentials["port"]

%config SqlMagic.style = '_DEPRECATED_DEFAULT'


connection_string = f"mysql+pymysql://{username}:{password}@{url}:{port}/ads_2024?local_infile=1"
%sql $connection_string
%sql use ads_2024;

conn = MySQLdb.connect(host=url, user=username, password=password, database="ads_2024", local_infile=True)

# download data
# everything is on the scale of LSOAs for data availability
#
# Each entry is either a bare URL (saved under the last component of its path)
# or a (filename, URL) pair for endpoints whose URL does not end in a usable
# file name. Files that already exist locally are not downloaded again.
#
# The loop variables are deliberately NOT called `url`, so the database host
# stored in `url` by the connection setup is not overwritten.
for source in [
    # 2011 and 2021 lsoa boundaries
    ("lsoas.geojson", "https://open-geography-portalx-ons.hub.arcgis.com/api/download/v1/items/68515293204e43ca8ab56fa13ae8a547/geojson?layers=0"),
    "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_lsoa_lyr_2011_clipped.zip",

    # LAD boundaries
    # NOTE(review): this looks like a time-limited signed link (se=2024-12-16...);
    # confirm it is still valid or replace it with a stable download URL.
    ("lad.geojson", "https://stg-arcgisazurecdataprod1.az.arcgis.com/exportfiles-1559-23636/Local_Authority_Districts_May_2024_Boundaries_UK_BGC_-6307115499537197728.geojson?sv=2018-03-28&sr=b&sig=rmjgun9Rr9udzOE7cC%2BpWb0sSNW90liXW%2FM2D6gU3EU%3D&se=2024-12-16T11%3A15%3A51Z&sp=r"),

    # postcode dataset
    "https://www.getthedata.com/downloads/open_postcode_geo.csv.zip",

    # age breakdowns by lsoa
    ("2011_ages.csv", "https://www.nomisweb.co.uk/api/v01/dataset/nm_145_1.bulk.csv?time=latest&measures=20100&rural_urban=total&geography=TYPE298"),
    # best we can do for 2021 is 23 categories via the age by sex dataset
    ("2021_age_by_sex.csv", "https://static.ons.gov.uk/datasets/5bae0c60-f78f-46cf-8283-7bda537795e2/RM121-2021-1-filtered-2024-12-16T10:44:40Z.csv"),

    # ethnicity breakdowns by lsoa
    ("2011_ethnicity.csv", "https://www.nomisweb.co.uk/api/v01/dataset/nm_608_1.bulk.csv?time=latest&measures=20100&rural_urban=total&geography=TYPE298"),
    ("2021_ethnicity.csv", "https://static.ons.gov.uk/datasets/f3fc16b9-9b5f-4efd-9d7f-38539939498a/TS021-2021-3-filtered-2024-12-16T10:52:06Z.csv"),

    # deprivation dimensions by lsoa
    ("2021_deprivation.csv", "https://static.ons.gov.uk/datasets/76fb8bab-c932-4411-95b5-6dd26a375857/TS011-2021-6-filtered-2024-12-16T12:30:51Z.csv"),
    ("2011_deprivation.csv", "https://www.nomisweb.co.uk/api/v01/dataset/nm_519_1.bulk.csv?time=latest&measures=20100&rural_urban=total&geography=TYPE298"),
]:

    if isinstance(source, tuple):
        filename, source_url = source
    else:
        source_url = source
        filename = f"./{source_url.split('/')[-1]}"

    if not os.path.exists(filename):
        print(f"Downloading {source_url}")
        response = requests.get(source_url)
        # Fail loudly on HTTP errors. Otherwise the error body would be saved
        # under the data file's name and every later run would report
        # "Already downloaded" for a file that is not the dataset.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {filename}")
    else:
        print(f"Already downloaded {filename}")

    # Unpack archives into the working directory unless a path named like the
    # archive without its ".zip" suffix is already present.
    if filename.endswith('.zip') and not os.path.exists(filename.replace('.zip', '')):
        with ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall()

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
0 rows affected.
Already downloaded lsoas.geojson
Already downloaded ./infuse_lsoa_lyr_2011_clipped.zip
Already downloaded lad.geojson
Already downloaded ./open_postcode_geo.csv.zip
Already downloaded 2011_ages.csv


$G = \frac 1 2 c - \frac 1 4 e + \frac 1 8 h - \frac 1 8 d + 0.25$

where
- $c$ is the ratio of households that have changed in each LSOA
- $e$ is the relative change in the proportion of non-white residents at the individual level within each lsoa
- $h$ represents the relative change in median house sale price compared to the local authority average. For example, if the median house sale price for a given LSOA was 90 per cent of the local authority average in 2009 and increased to 110 per cent of the borough-wide average in 2016, the relative-change formula $(1.10 - 0.90) / 0.90$ gives a score of 0.22
- $d$ represents the relative change in the index of multiple deprivation (IMD) score between 2010 and 2015

In [19]:
%%sql
DROP TABLE IF EXISTS lsoas;
CREATE TABLE IF NOT EXISTS lsoas (
    -- LSOA boundaries, one row per (year, code) pair
    -- The table is dropped and recreated whenever this cell runs, so the loading cell must be re-run afterwards
    year INT NOT NULL,                     -- Boundary set the row belongs to (the loader writes 2011 or 2021)
    code VARCHAR(9) NOT NULL,              -- LSOA code
    name VARCHAR(255) NOT NULL,            -- LSOA name

    -- Geometry
    geometry GEOMETRY NOT NULL,            -- Boundary geometry of the LSOA in WGS84 (EPSG 4326)
    
    -- Constraints
    -- The key includes the year so the same code can be stored once per boundary set
    PRIMARY KEY (year, code)
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1;

 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
0 rows affected.
0 rows affected.


[]

In [26]:
# Populate the lsoas table with the 2021 and 2011 boundary sets, but only when
# the table currently holds no rows.
# NOTE(review): the guard only checks whether the table has any row at all, so
# if the 2021 load succeeds and the 2011 load fails, a re-run will not retry
# the 2011 load.
if pd.read_sql("SELECT * from lsoas limit 1", conn).empty:
    # Convert the 2021 boundaries to a pipe-delimited CSV (geometry written as
    # text) that LOAD DATA can ingest; skipped when the CSV already exists.
    if not os.path.exists("lsoas2021.csv"):
        gdf = gpd.read_file("lsoas.geojson")
        # NOTE(review): the coordinates are declared to be EPSG:27700 before
        # being reprojected to EPSG:4326 -- confirm the source file really is
        # delivered in EPSG:27700.
        gdf.geometry.set_crs(epsg=27700, inplace=True)
        gdf.geometry = gdf.geometry.to_crs(epsg=4326)
        gdf.to_csv("lsoas2021.csv", sep="|", index=False)

    # The trailing backslashes are line continuations inside the string
    # literal. Fields read into @variables are discarded, except @geometry,
    # which is parsed from text into the geometry column; year is fixed to 2021.
    command = """
    LOAD DATA LOCAL INFILE 'lsoas2021.csv'\
    INTO TABLE lsoas\
    FIELDS TERMINATED BY '|'\
    LINES TERMINATED BY '\n'\
    IGNORE 1 LINES\
    (@fid, code, name, @welsh, @bng_easting, @bng_northing, @latitude, @longitude, @global_id, @geometry)\
    SET year = 2021, geometry = ST_GeomFromText(@geometry);"""

    %sql $command

    # Same procedure for the 2011 boundaries, read from the extracted shapefile.
    if not os.path.exists("lsoas2011.csv"):
        gdf = gpd.read_file("infuse_lsoa_lyr_2011_clipped.shp")
        gdf.geometry.set_crs(epsg=27700, inplace=True)
        gdf.geometry = gdf.geometry.to_crs(epsg=4326)
        gdf.to_csv("lsoas2011.csv", sep="|", index=False)

    # The 2011 CSV is read as four fields per line: code, name, a discarded
    # field, and the geometry text; year is fixed to 2011.
    command = """
    LOAD DATA LOCAL INFILE 'lsoas2011.csv'\
    INTO TABLE lsoas\
    FIELDS TERMINATED BY '|'\
    LINES TERMINATED BY '\n'\
    IGNORE 1 LINES\
    (code, name, @welsh, @geometry)\
    SET year = 2011, geometry = ST_GeomFromText(@geometry);"""

    %sql $command
    


  if pd.read_sql("SELECT * from lsoas limit 1", conn).empty:


 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
35672 rows affected.
 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
42619 rows affected.


In [21]:
%%sql
CREATE TABLE IF NOT EXISTS `pp_data` (
  -- Price-paid transactions, filled by bulk-loading the yearly pp-YEAR-partN.csv files
  -- Column order matters here because the loader gives no column list, so CSV fields are assigned to columns positionally
  `transaction_unique_identifier` tinytext COLLATE utf8_bin NOT NULL,
  `price` int(10) unsigned NOT NULL,
  `date_of_transfer` date NOT NULL,
  `postcode` varchar(8) COLLATE utf8_bin NOT NULL,
  `property_type` varchar(1) COLLATE utf8_bin NOT NULL,
  `new_build_flag` varchar(1) COLLATE utf8_bin NOT NULL,
  `tenure_type` varchar(1) COLLATE utf8_bin NOT NULL,
  `primary_addressable_object_name` tinytext COLLATE utf8_bin NOT NULL,
  `secondary_addressable_object_name` tinytext COLLATE utf8_bin NOT NULL,
  `street` tinytext COLLATE utf8_bin NOT NULL,
  `locality` tinytext COLLATE utf8_bin NOT NULL,
  `town_city` tinytext COLLATE utf8_bin NOT NULL,
  `district` tinytext COLLATE utf8_bin NOT NULL,
  `county` tinytext COLLATE utf8_bin NOT NULL,
  `ppd_category_type` varchar(2) COLLATE utf8_bin NOT NULL,
  `record_status` varchar(2) COLLATE utf8_bin NOT NULL,
  -- Auto-incrementing surrogate key
  `db_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,

  -- Constraints
  PRIMARY KEY (`db_id`)
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1 ;

 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
0 rows affected.


[]

In [22]:
# WARNING: This code will take a long time to finish (i.e., more than 30 minutes) given our dataset's size. The print informs the uploading progress by year.
if pd.read_sql("SELECT * from pp_data limit 1", conn).empty:
    # download the data
    fynesse.access.data()
    for year in range(1996,2025):
        print ("Uploading data for year: " + str(year))
        for part in range(1,3):
            file_name = "./pp-" + str(year) + "-part" + str(part) + ".csv"
            %sql LOAD DATA LOCAL INFILE '{file_name}' INTO TABLE pp_data FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED by '"' LINES STARTING BY '' TERMINATED BY '\n';

  if pd.read_sql("SELECT * from pp_data limit 1", conn).empty:


In [23]:
%%sql
CREATE TABLE IF NOT EXISTS `postcode_data` (
  -- Postcode lookup rows, filled by bulk-loading open_postcode_geo.csv
  -- The loader names every column below except db_id, in this same order
  `postcode` varchar(8) COLLATE utf8_bin NOT NULL,
  `status` enum('live','terminated') NOT NULL,
  `usertype` enum('small', 'large') NOT NULL,
  -- easting and northing are the only nullable columns in this table
  `easting` int unsigned,
  `northing` int unsigned,
  `positional_quality_indicator` int NOT NULL,
  `country` enum('England', 'Wales', 'Scotland', 'Northern Ireland', 'Channel Islands', 'Isle of Man') NOT NULL,
  `latitude` decimal(11,8) NOT NULL,
  `longitude` decimal(10,8) NOT NULL,
  `postcode_no_space` tinytext COLLATE utf8_bin NOT NULL,
  `postcode_fixed_width_seven` varchar(7) COLLATE utf8_bin NOT NULL,
  `postcode_fixed_width_eight` varchar(8) COLLATE utf8_bin NOT NULL,
  `postcode_area` varchar(2) COLLATE utf8_bin NOT NULL,
  `postcode_district` varchar(4) COLLATE utf8_bin NOT NULL,
  `postcode_sector` varchar(6) COLLATE utf8_bin NOT NULL,
  `outcode` varchar(4) COLLATE utf8_bin NOT NULL,
  `incode` varchar(3)  COLLATE utf8_bin NOT NULL,
  -- Auto-incrementing surrogate key
  `db_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,

  -- Constraints
  PRIMARY KEY (`db_id`)
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;

 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
0 rows affected.


[]

In [24]:
# Bulk-load the postcode CSV into postcode_data, but only when the table
# currently holds no rows.
if pd.read_sql("SELECT * from postcode_data limit 1", conn).empty:
    # The trailing backslashes are line continuations inside the string
    # literal. The first line of the file is skipped and the remaining fields
    # are mapped onto the explicitly listed columns; db_id is not listed and is
    # left to the table's AUTO_INCREMENT.
    command = """
    LOAD DATA LOCAL INFILE 'open_postcode_geo.csv'\
    INTO TABLE postcode_data\
    FIELDS TERMINATED BY ','\
    OPTIONALLY ENCLOSED BY '"'\
    LINES STARTING BY ''\
    TERMINATED BY '\n'\
    IGNORE 1 LINES\
    (postcode, status, usertype, easting, northing, positional_quality_indicator, country, latitude, longitude, postcode_no_space, postcode_fixed_width_seven, postcode_fixed_width_eight, postcode_area, postcode_district, postcode_sector, outcode, incode);"""
    
    %sql $command

  if pd.read_sql("SELECT * from postcode_data limit 1", conn).empty:


In [25]:
%%sql
CREATE INDEX idx_postcode ON pp_data(postcode);
CREATE INDEX idx_date_of_transfer ON pp_data(date_of_transfer);
CREATE INDEX idx_postcode_data_postcode ON postcode_data(postcode);

 * mysql+pymysql://root:***@localhost:3306/ads_2024?local_infile=1
(pymysql.err.OperationalError) (1061, "Duplicate key name 'idx_postcode'")
[SQL: CREATE INDEX idx_postcode ON pp_data(postcode);]
(Background on this error at: https://sqlalche.me/e/20/e3q8)
