# Predictive-Policing-in-the-London-Metropolitan-Police-District

## 1. Environment Setup

### 1. 1. Update Linux Packages

In [None]:
!sudo apt update

### 1. 2. Install Python and Packages (pip & venv)

In [None]:
!sudo apt install -y python3 python3-pip python3-venv

### 1. 3. Create a Virtual Environment

In [None]:
!python3 -m venv .venv

### 1. 4. Activate the Virtual Environment

In [None]:
# NOTE(review): a `!` command runs in its own temporary subshell, so this
# activation ends as soon as the command returns. It does not change the
# environment of the running kernel or of any later cell. To really use
# `.venv`, launch Jupyter from that environment or register it as a kernel.
!source .venv/bin/activate

### 1. 5. Install the Required Python Packages

In [None]:
%pip install -r requirements.txt

### 1. 6. Create Folders to Store Datasets

In [None]:
import os

# Folder layout: Datasets/<stage>/<source> for every stage/source pair.
#   Raw-Data            - files exactly as downloaded
#   Data-For-Processing - the files selected for cleaning
#   Processed-Data      - cleaned outputs
dataset_stages = ["Raw-Data", "Data-For-Processing", "Processed-Data"]
dataset_sources = [
    "Police-Data",
    "Map-Data",
    "Income-Data",
    "Population-Data",
    "Unemployment-Data",
    "Weather-Data",
]

os.makedirs("Datasets", exist_ok=True)
for stage in dataset_stages:
    os.makedirs(f"Datasets/{stage}", exist_ok=True)
    for source in dataset_sources:
        # exist_ok keeps the cell safe to re-run.
        os.makedirs(f"Datasets/{stage}/{source}", exist_ok=True)

## 2. Data Ingestion

### 2. 1. Police Data Ingestion

In [None]:
# Download Police Data
# Two data.police.uk archive snapshots are fetched; going by the output file
# names they are treated as covering 2020-2022 and 2022-2024 respectively.
# NOTE: a failing `!wget` does not raise in Python, so each message below is
# printed even when the download did not succeed - check wget's own output.
!wget -O Datasets/Raw-Data/Police-Data/police_data_2020_2022.zip https://data.police.uk/data/archive/2022-12.zip
print("Downloaded police data from 2020 to 2022.")

!wget -O Datasets/Raw-Data/Police-Data/police_data_2022_2024.zip https://data.police.uk/data/archive/2024-12.zip
print("Downloaded police data from 2022 to 2024.")

In [None]:
# Extract Police Data
!unzip -o Datasets/Raw-Data/Police-Data/police_data_2020_2022.zip -d Datasets/Raw-Data/Police-Data/
!unzip -o Datasets/Raw-Data/Police-Data/police_data_2022_2024.zip -d Datasets/Raw-Data/Police-Data/
print("Extracted police data zip files.")

### 2. 2. Map Data Ingestion

In [None]:
# Download Map Data

# The LSOA and MSOA map data have no direct download links, so download them
# manually from the pages below and place them in Datasets/Raw-Data/Map-Data/.
# The later copy step expects the CSV exports to be named
# Map-LSOA-2021.csv and Map-MSOA-2021.csv.
# LSOA Map Data - https://geoportal.statistics.gov.uk/datasets/6beafcfd9b9c4c9993a06b6b199d7e6d_0/explore?location=43.468898%2C-2.489483%2C3.79 (Both csv and geojson formats available)
# MSOA Map Data - https://geoportal.statistics.gov.uk/datasets/12baf1e6a44441208ffe5ba5ed063a68_0/explore?location=52.284503%2C-1.473701%2C11.49 (Both csv and geojson formats available)

# LSOA to MSOA Lookup Data (downloaded as a zip archive; extracted in the next cell)
!wget -O Datasets/Raw-Data/Map-Data/LSOA_to_MSOA_Lookup_2021.zip https://www.arcgis.com/sharing/rest/content/items/c4f84c38814d4b82aa4760ade686c3cc/data
print("Downloaded LSOA to MSOA Lookup data.")

In [None]:
!unzip -o Datasets/Raw-Data/Map-Data/LSOA_to_MSOA_Lookup_2021.zip -d Datasets/Raw-Data/Map-Data/
print("Extracted LSOA to MSOA Lookup data.")

### 2. 3. Income Data Ingestion

In [None]:
# Download Income Data
# Only the financial-year-ending 2020 and 2023 workbooks are downloaded here.
# NOTE(review): presumably the remaining years are meant to be derived from
# these two; the approach is discussed at
# https://chatgpt.com/share/695b5cf5-8750-8007-bcbd-d531f4e8bdd9

# 2023 income data
!wget -O Datasets/Raw-Data/Income-Data/Income_Data_MSOA_2023.xlsx https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/smallareaincomeestimatesformiddlelayersuperoutputareasenglandandwales/financialyearending2023/datasetfinal.xlsx
print("Downloaded 2023 income data.")

# 2020 income data
!wget -O Datasets/Raw-Data/Income-Data/Income_Data_MSOA_2020.xlsx https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/smallareaincomeestimatesformiddlelayersuperoutputareasenglandandwales/financialyearending2020/saiefy1920finalqaddownload280923.xlsx    
print("Downloaded 2020 income data.")

### 2. 4. Population Data Ingestion

In [None]:
# Download Population Data
# Source page:
# https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/lowersuperoutputareamidyearpopulationestimates

# Mid-2020 estimates (single-year workbook)
!wget -O Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2020.xlsx https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/lowersuperoutputareamidyearpopulationestimates/mid2020sape23dt2/sape23dt2mid2020lsoasyoaestimatesunformatted.xlsx

# Mid-2021 and mid-2022 estimates (only the mid-2021 sheet is used later)
!wget -O Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2021_2022.xlsx https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/lowersuperoutputareamidyearpopulationestimates/mid2021andmid2022/sapelsoasyoatablefinal.xlsx

# Mid-2022 to mid-2024 estimates (the mid-2022, mid-2023 and mid-2024 sheets are used later)
!wget -O Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2022_2024.xlsx https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/lowersuperoutputareamidyearpopulationestimates/mid2022revisednov2025tomid2024/sapelsoasyoa20222024.xlsx

### 2. 5. Weather Data Ingestion

In [None]:
# Monthly Status only- https://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/heathrowdata.txt

!wget -O Datasets/Raw-Data/Weather-Data/Heathrow_Monthly_Weather_Data.txt https://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/heathrowdata.txt
print("Downloaded Heathrow monthly weather data.")

## 3. Data Preparation

### 3. 1. Import Libraries

In [None]:
from pathlib import Path
from typing import List, Optional

import pandas as pd

### 3. 2. Turn XLSX Files to CSV Files

In [None]:
# Workbooks whose sheets are exported to CSV (one CSV per sheet).
excel_files = [
    "Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2020.xlsx",
    "Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2021_2022.xlsx",
    "Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2022_2024.xlsx",
    "Datasets/Raw-Data/Income-Data/Income_Data_MSOA_2020.xlsx",
    "Datasets/Raw-Data/Income-Data/Income_Data_MSOA_2023.xlsx"
]

for file in excel_files:
    file_path = Path(file)

    # CSVs are written to a "csv" sub-folder next to the workbook.
    output_dir = file_path.parent / "csv"
    output_dir.mkdir(exist_ok=True)

    # Open the workbook once for all sheets and close it deterministically;
    # a bare pd.ExcelFile(...) would keep the file handle open.
    with pd.ExcelFile(file_path) as xls:
        for sheet in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet)

            # Clean sheet name for filename. The copy step further down relies
            # on these exact names, so only spaces and slashes are replaced.
            safe_sheet_name = sheet.replace(" ", "_").replace("/", "_")

            output_file = output_dir / f"{file_path.stem}_{safe_sheet_name}.csv"
            df.to_csv(output_file, index=False)

            print(f"Saved: {output_file}")


### 3. 3. Copy Files to the Data-For-Processing Folder (Without Police Data)

In [None]:
# Copy only the necessary files from Raw-Data into Data-For-Processing,
# giving each copy a shorter, uniform name.

# MAP Data
# The first file is the lookup extracted from the downloaded zip. The two
# Map-*-2021.csv files are the manual downloads described in section 2.2 and
# must already exist under these exact names.
!cp Datasets/Raw-Data/Map-Data/PCD_OA21_LSOA21_MSOA21_LAD_NOV25_UK_LU.csv Datasets/Data-For-Processing/Map-Data/LSOA_to_MSOA_Lookup.csv
!cp Datasets/Raw-Data/Map-Data/Map-LSOA-2021.csv Datasets/Data-For-Processing/Map-Data/Map-LSOA.csv
!cp Datasets/Raw-Data/Map-Data/Map-MSOA-2021.csv Datasets/Data-For-Processing/Map-Data/Map-MSOA.csv

# Income Data
# Source names follow the "<workbook stem>_<sheet name>.csv" pattern produced
# by the XLSX-to-CSV step; only the "Total annual income" sheet is kept.
!cp Datasets/Raw-Data/Income-Data/csv/Income_Data_MSOA_2020_Total_annual_income.csv Datasets/Data-For-Processing/Income-Data/Total_Annual_Income_2020_MSOA.csv
!cp Datasets/Raw-Data/Income-Data/csv/Income_Data_MSOA_2023_Total_annual_income.csv Datasets/Data-For-Processing/Income-Data/Total_Annual_Income_2023_MSOA.csv

# Population Data
# One sheet per mid-year estimate: 2020 and 2021 come from their own
# workbooks, 2022-2024 all come from the 2022_2024 workbook.
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2020_Mid-2020_Persons.csv Datasets/Data-For-Processing/Population-Data/Population_2020_LSOA.csv
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2021_2022_Mid-2021_LSOA_2021.csv Datasets/Data-For-Processing/Population-Data/Population_2021_LSOA.csv
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2022_2024_Mid-2022_LSOA_2021.csv Datasets/Data-For-Processing/Population-Data/Population_2022_LSOA.csv
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2022_2024_Mid-2023_LSOA_2021.csv Datasets/Data-For-Processing/Population-Data/Population_2023_LSOA.csv
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2022_2024_Mid-2024_LSOA_2021.csv Datasets/Data-For-Processing/Population-Data/Population_2024_LSOA.csv

# Weather Data
!cp Datasets/Raw-Data/Weather-Data/Heathrow_Monthly_Weather_Data.txt Datasets/Data-For-Processing/Weather-Data/Heathrow_Monthly_Weather_1948_2025.txt


### 3. 4. Copy Police Data to the Data-For-Processing Folder

#### 3. 4. 1. Removing Non-Metropolitan Street Data from Police Data (Raw-Data Folder)

In [None]:
police_root = Path("Datasets/Raw-Data/Police-Data")
# Label printed with the summary below. The two downloaded archives together
# span January 2020 to December 2024 (60 monthly Metropolitan street files).
date_range = ("2020-01", "2024-12")
# Only files whose name contains this token are kept.
keep_token = "metropolitan-street"

if not police_root.exists():
    raise FileNotFoundError(f"Missing directory: {police_root}")

kept: List[Path] = []
removed: List[Path] = []

# Delete every file under police_root whose name lacks keep_token. This also
# removes the downloaded zip archives and any previously written combined CSV.
# The listing is materialised first so files are not deleted while the
# directory tree is still being walked.
for file_path in list(police_root.rglob("*")):
    if not file_path.is_file():
        continue
    if keep_token in file_path.name.lower():
        kept.append(file_path)
    else:
        file_path.unlink()
        removed.append(file_path)

print(f"Date Range - {date_range[0]} - {date_range[1]}")
for path in sorted(kept):
    print(path)

print(f"Removed {len(removed)} other files.")

#### 3. 4. 2. Create a Single Police Data File

In [40]:
police_data_root = Path("Datasets/Raw-Data/Police-Data")

def load_police_dataset(file_glob: str, root: Optional[Path] = None) -> pd.DataFrame:
    """Load all monthly police CSVs matching the glob into a single pandas DataFrame.

    Args:
        file_glob: Glob pattern matched recursively against files below ``root``,
            e.g. ``"*-metropolitan-street.csv"``.
        root: Directory to search. When omitted, the notebook-level
            ``police_data_root`` is used, which is the original behaviour.

    Returns:
        The matched CSVs concatenated in sorted path order, with a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: If no file below the search root matches ``file_glob``.
    """
    search_root = police_data_root if root is None else Path(root)

    # Sorting the paths keeps the row order deterministic across runs.
    matches: List[Path] = sorted(search_root.rglob(file_glob))

    if not matches:
        raise FileNotFoundError(f"No police files matched pattern: {file_glob}")

    print(f"Matched {len(matches)} files for pattern '{file_glob}'")

    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk.
    monthly_frames = [pd.read_csv(path, low_memory=False) for path in matches]

    return pd.concat(monthly_frames, ignore_index=True)

# Merge every monthly Metropolitan street-level file into one frame.
police_street_df = load_police_dataset("*-metropolitan-street.csv")

# Sanity check: report how many rows the merge produced.
combined_row_count = len(police_street_df)
print("Combined police street rows:", combined_row_count)

# Persist the merged frame next to the raw monthly files. The file name does
# not match the "*-metropolitan-street.csv" pattern, so a re-run will not
# load it again as input.
police_street_df.to_csv(
    "Datasets/Raw-Data/Police-Data/Police_Street_Data_2020_2024_LSOA.csv",
    index=False,
)

Matched 60 files for pattern '*-metropolitan-street.csv'
Combined police street rows: 5635649


#### 3. 4. 3. Copy Police Data to the Data-For-Processing Folder


In [41]:
!cp Datasets/Raw-Data/Police-Data/Police_Street_Data_2020_2024_LSOA.csv Datasets/Data-For-Processing/Police-Data/Police_Street_Data_2020_2024_LSOA.csv

## 4.  Data Cleaning

### 4. 1. Police Data Cleaning

#### 4. 1. 1. Read the Combined Police Street Data CSV

In [42]:
# Read the copy placed in Data-For-Processing (section 3.4.3), like every other
# dataset cleaned in this section. The Raw-Data original is deleted whenever
# the pruning cell in 3.4.1 is re-run. low_memory=False matches how the
# monthly files were read and avoids chunk-wise mixed-dtype inference.
street_police_df = pd.read_csv(
    "Datasets/Data-For-Processing/Police-Data/Police_Street_Data_2020_2024_LSOA.csv",
    low_memory=False,
)

In [None]:
street_police_df.info()

In [None]:
street_police_df.head()

#### 4. 1. 2.  Rename Police Data Columns for Clarity

In [60]:
# Original header -> snake-style name used throughout the notebook.
# "Month" becomes "Date"; columns not listed here keep their original names.
police_column_names = {
    "Crime ID": "Crime_ID",
    "Month": "Date",
    "Reported by": "Reported_By",
    "Falls within": "Falls_Within",
    "LSOA code": "LSOA_Code",
    "LSOA name": "LSOA_Name",
    "Crime type": "Crime_Type",
    "Last outcome category": "Last_Outcome_Category",
}
street_police_df.rename(columns=police_column_names, inplace=True)

#### 4. 1. 3. Check for Null Values

In [None]:
street_police_df.isnull().sum()

In [44]:
# Drop rows where LSOA_Code is null/NaN (or blank just in case)
before = len(street_police_df)

# Normalise the codes first so whitespace-only values count as blank.
police_lsoa_codes = street_police_df["LSOA_Code"].astype("object").str.strip()
police_has_lsoa_code = police_lsoa_codes.notna() & (police_lsoa_codes != "")

# Take an explicit copy of the filtered rows: later cells assign columns on
# this frame, and assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning.
street_police_df = street_police_df.loc[police_has_lsoa_code].copy()
street_police_df["LSOA_Code"] = police_lsoa_codes.loc[police_has_lsoa_code]

after = len(street_police_df)
print(f"Dropped {before - after} rows with null/blank LSOA_Code. Remaining: {after}")

Dropped 71339 rows with null/blank LSOA_Code. Remaining: 5564310


In [45]:
# Column -> placeholder text written wherever the value is missing.
police_placeholders = {
    "Crime_ID": "No crime ID",
    "Last_Outcome_Category": "No outcome mentioned",
    "Context": "No context mentioned",
}
for column_name, placeholder in police_placeholders.items():
    street_police_df[column_name] = street_police_df[column_name].fillna(placeholder)

In [None]:
street_police_df.isnull().sum()

### 4. 2. Map Data Cleaning

#### 4. 2. 1. Read the Map Data

In [None]:
# Load the three map tables that were copied into Data-For-Processing/Map-Data.
# NOTE(review): the lookup was copied from a file whose name starts with
# "PCD", so it presumably holds one row per postcode rather than one per
# LSOA - confirm, and de-duplicate before joining on LSOA_Code.
MSOA_Map = pd.read_csv("Datasets/Data-For-Processing/Map-Data/Map-MSOA.csv")
LSOA_Map = pd.read_csv("Datasets/Data-For-Processing/Map-Data/Map-LSOA.csv")
Lookup_Map = pd.read_csv("Datasets/Data-For-Processing/Map-Data/LSOA_to_MSOA_Lookup.csv")

#### 4. 2. 2.  Clean MSOA_Map Data

In [None]:
MSOA_Map.info()

In [None]:
MSOA_Map.head()

##### 4. 2. 2. 1. Rename MSOA Map Columns for Clarity

In [None]:
# Original MSOA map header -> descriptive name; unlisted columns are unchanged.
msoa_map_column_names = {
    "MSOA21CD": "MSOA_Code",
    "MSOA21NM": "MSOA_Name",
    "LAT": "Latitude",
    "LONG": "Longitude",
    "BNG_E": "British_National_Grid_Easting",
    "BNG_N": "British_National_Grid_Northing",
    "Shape__Area": "Shape_Area",
    "Shape__Length": "Shape_Length",
}
MSOA_Map.rename(columns=msoa_map_column_names, inplace=True)

In [None]:
MSOA_Map.head()

##### 4. 2. 2. 2. Check for Null Values

In [None]:
MSOA_Map.isnull().sum()

#### 4. 2. 3. Clean LSOA_Map Data

In [None]:
LSOA_Map.info()

In [None]:
LSOA_Map.head()

##### 4. 2. 3. 1. Rename LSOA Map Columns for Clarity

In [None]:
# Original LSOA map header -> descriptive name; unlisted columns are unchanged.
lsoa_map_column_names = {
    "LSOA21CD": "LSOA_Code",
    "LSOA21NM": "LSOA_Name",
    "LAT": "Latitude",
    "LONG": "Longitude",
    "BNG_E": "British_National_Grid_Easting",
    "BNG_N": "British_National_Grid_Northing",
    "Shape__Area": "Shape_Area",
    "Shape__Length": "Shape_Length",
}
LSOA_Map.rename(columns=lsoa_map_column_names, inplace=True)

In [None]:
LSOA_Map.head()

##### 4. 2. 3. 2. Check for Null Values

In [None]:
LSOA_Map.isnull().sum()

#### 4. 2. 4. Clean LSOA_to_MSOA_Lookup Data

In [None]:
Lookup_Map.info()

In [None]:
Lookup_Map.head()

##### 4. 2. 4. 1. Rename Lookup Map Columns for Clarity

In [None]:
# Only the LSOA/MSOA code and name columns are renamed; every other lookup
# column keeps its original lower-case header.
lookup_column_names = {
    "lsoa21cd": "LSOA_Code",
    "msoa21cd": "MSOA_Code",
    "lsoa21nm": "LSOA_Name",
    "msoa21nm": "MSOA_Name",
}
Lookup_Map.rename(columns=lookup_column_names, inplace=True)

In [None]:
Lookup_Map.head()

##### 4. 2. 4. 2. Check for Null Values

In [None]:
Lookup_Map.isnull().sum()

In [None]:
# Drop rows where LSOA_Code is null/NaN (or blank after trimming whitespace)
before = len(Lookup_Map)

lookup_lsoa_codes = Lookup_Map["LSOA_Code"].astype("object").str.strip()
lookup_has_lsoa_code = lookup_lsoa_codes.notna() & (lookup_lsoa_codes != "")

# Take an explicit copy of the filtered rows: the next cell assigns columns on
# this frame, and assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning.
Lookup_Map = Lookup_Map.loc[lookup_has_lsoa_code].copy()
Lookup_Map["LSOA_Code"] = lookup_lsoa_codes.loc[lookup_has_lsoa_code]
after = len(Lookup_Map)

print(f"Dropped {before - after} rows with null/blank LSOA_Code. Remaining: {after}")

In [None]:
# Column -> placeholder text written wherever the area name is missing.
lookup_placeholders = {
    "LSOA_Name": "No LSOA Name",
    "MSOA_Name": "No MSOA Name",
}
for column_name, placeholder in lookup_placeholders.items():
    Lookup_Map[column_name] = Lookup_Map[column_name].fillna(placeholder)

In [None]:
Lookup_Map.isnull().sum()

### 4. 3. Income Data Cleaning

#### 4. 3. 1. Read the Income Data

In [None]:
# Load the exported "Total annual income" sheets. The two workbooks need a
# different number of leading lines skipped before the real header row
# (4 for 2020, 3 for 2023).
# NOTE(review): the skipped lines are presumably title/notes rows carried over
# from the Excel sheets - confirm with .head() after loading.
Income_Data_2020 = pd.read_csv("Datasets/Data-For-Processing/Income-Data/Total_Annual_Income_2020_MSOA.csv",
                            skiprows=4
                                )
Income_Data_2023 = pd.read_csv("Datasets/Data-For-Processing/Income-Data/Total_Annual_Income_2023_MSOA.csv",
                            skiprows=3
                                )

#### 4. 3. 2. Clean 2020 Income Data

In [None]:
Income_Data_2020.head()

In [None]:
Income_Data_2020.info()

##### 4. 3. 2. 1. Rename Income 2020 Columns for Clarity

In [None]:
# Original income header -> notebook naming; unlisted columns are unchanged.
income_2020_column_names = {
    "MSOA code": "MSOA_Code",
    "MSOA name": "MSOA_Name",
    "Total annual income (£)": "Total_Annual_Income_British_Pounds",
}
Income_Data_2020.rename(columns=income_2020_column_names, inplace=True)

In [None]:
Income_Data_2020.head()

In [None]:
# Check for null values
Income_Data_2020.isnull().sum()

#### 4. 3. 3. Clean 2023 Income Data

In [None]:
Income_Data_2023.head()

In [None]:
Income_Data_2023.info()

##### 4. 3. 3. 1. Rename Income 2023 Columns for Clarity

In [None]:
# Rename the 2023 income columns. This cell previously renamed
# Income_Data_2020 a second time and left the 2023 frame untouched.
# NOTE(review): the mapping mirrors the 2020 table, and rename() silently
# skips keys that are absent - confirm via Income_Data_2023.head() that the
# 2023 sheet uses the same headers.
Income_Data_2023.rename(columns={
    "MSOA code" : "MSOA_Code",
    "MSOA name" : "MSOA_Name",
    "Total annual income (£)" : "Total_Annual_Income_British_Pounds"
}, inplace=True)

In [None]:
Income_Data_2023.head()

In [None]:
# Check for null values
Income_Data_2023.isnull().sum()

### 4. 4. Population Data Cleaning

In [None]:
# Load one frame per mid-year estimate. The mid-2020 export needs 4 leading
# lines skipped before the header row; the 2021-2024 exports need 3.
# NOTE(review): the skipped lines are presumably title/notes rows from the
# Excel sheets - confirm with .head() after loading.
population_2020 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2020_LSOA.csv", skiprows=4)
population_2021 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2021_LSOA.csv", skiprows=3)
population_2022 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2022_LSOA.csv", skiprows=3)
population_2023 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2023_LSOA.csv", skiprows=3)
population_2024 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2024_LSOA.csv", skiprows=3)

#### 4. 4. 1. Clean 2020 Population Data

In [None]:
population_2020.head()

In [None]:
population_2020.info()

In [None]:
# Mid-2020 headers -> notebook naming. This sheet labels the total column
# "All Ages"; unlisted columns are unchanged.
population_2020_column_names = {
    "LSOA Code": "LSOA_Code",
    "LSOA Name": "LSOA_Name",
    "All Ages": "Total_Population",
}
population_2020.rename(columns=population_2020_column_names, inplace=True)

In [None]:
population_2020.head()

In [None]:
# Check for null values
population_2020.isnull().sum()

#### 4. 4. 2. Clean 2021 Population Data

In [None]:
population_2021.head()

In [None]:
population_2021.info()

In [None]:
# Mid-2021 headers -> notebook naming; unlisted columns are unchanged.
population_2021_column_names = {
    "LSOA 2021 Code": "LSOA_Code",
    "LSOA 2021 Name": "LSOA_Name",
    "Total": "Total_Population",
}
population_2021.rename(columns=population_2021_column_names, inplace=True)

In [None]:
population_2021.head()

In [None]:
# Check for null values
population_2021.isnull().sum()

#### 4. 4. 3. Clean 2022 Population Data

In [None]:
population_2022.head()

In [None]:
population_2022.info()

In [None]:
# Mid-2022 headers -> notebook naming; unlisted columns are unchanged.
population_2022_column_names = {
    "LSOA 2021 Code": "LSOA_Code",
    "LSOA 2021 Name": "LSOA_Name",
    "Total": "Total_Population",
}
population_2022.rename(columns=population_2022_column_names, inplace=True)

In [None]:
population_2022.head()

In [None]:
# Check for null values
population_2022.isnull().sum()

#### 4. 4. 4. Clean 2023 Population Data

In [None]:
population_2023.head()

In [None]:
population_2023.info()

In [None]:
# Mid-2023 headers -> notebook naming; unlisted columns are unchanged.
population_2023_column_names = {
    "LSOA 2021 Code": "LSOA_Code",
    "LSOA 2021 Name": "LSOA_Name",
    "Total": "Total_Population",
}
population_2023.rename(columns=population_2023_column_names, inplace=True)

In [None]:
population_2023.head()

In [None]:
# Check for null values
population_2023.isnull().sum()

#### 4. 4. 5. Clean 2024 Population Data

In [None]:
population_2024.head()

In [None]:
population_2024.info()

In [None]:
# Mid-2024 headers -> notebook naming; unlisted columns are unchanged.
population_2024_column_names = {
    "LSOA 2021 Code": "LSOA_Code",
    "LSOA 2021 Name": "LSOA_Name",
    "Total": "Total_Population",
}
population_2024.rename(columns=population_2024_column_names, inplace=True)

In [None]:
population_2024.head()

In [None]:
# Check for null values
population_2024.isnull().sum()

### 4. 6. Weather Data Cleaning

In [None]:
weather_data = pd.read_fwf("Datasets/Data-For-Processing/Weather-Data/Heathrow_Monthly_Weather_1948_2025.txt", skiprows=5)

In [None]:
weather_data.head()

In [None]:
weather_data.tail()

In [None]:
weather_data.info()

In [None]:
# Drop the first row, which contains the units rather than an observation.
# The drop is guarded so that re-running this cell does not delete a real
# observation each time: the row is removed only while its first (year) cell
# is not a number.
first_year_cell = pd.to_numeric(weather_data.iloc[:1, 0], errors="coerce")
if len(first_year_cell) and first_year_cell.isna().iloc[0]:
    weather_data = weather_data.iloc[1:].reset_index(drop=True)

In [None]:
weather_data.head()

In [None]:
# Short station-file headers -> descriptive names that include the unit.
weather_column_names = {
    "yyyy": "Year",
    "mm": "Month",
    "tmax": "Max_Temperature_Celsius",
    "tmin": "Min_Temperature_Celsius",
    "af": "Air_Frost_Days",
    "rain": "Rainfall_mm",
    "sun": "Sunshine_Hours",
}
weather_data.rename(columns=weather_column_names, inplace=True)

In [None]:
# Check for null values
weather_data.isnull().sum()

## 5. Data Transformation

### 5.  1. Police Data Transformation

In [61]:
street_police_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5564310 entries, 0 to 5635648
Data columns (total 13 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   Crime_ID               object        
 1   Date                   datetime64[ns]
 2   Reported_By            object        
 3   Falls_Within           object        
 4   Longitude              float64       
 5   Latitude               float64       
 6   Location               object        
 7   LSOA_Code              object        
 8   LSOA_Name              object        
 9   Crime_Type             object        
 10  Last_Outcome_Category  object        
 11  Context                object        
 12  Month_Period           period[M]     
dtypes: datetime64[ns](1), float64(2), object(9), period[M](1)
memory usage: 594.3+ MB


#### 5. 1. 1. Change the Data Types

In [63]:
# Column -> dtype it is cast to. "Date" becomes a real datetime so the
# .dt accessors used in the next step work.
police_target_dtypes = {
    "Context": "object",
    "Date": "datetime64[ns]",
}
for column_name, target_dtype in police_target_dtypes.items():
    street_police_df[column_name] = street_police_df[column_name].astype(target_dtype)


In [64]:
street_police_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5564310 entries, 0 to 5635648
Data columns (total 13 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   Crime_ID               object        
 1   Date                   datetime64[ns]
 2   Reported_By            object        
 3   Falls_Within           object        
 4   Longitude              float64       
 5   Latitude               float64       
 6   Location               object        
 7   LSOA_Code              object        
 8   LSOA_Name              object        
 9   Crime_Type             object        
 10  Last_Outcome_Category  object        
 11  Context                object        
 12  Month_Period           period[M]     
dtypes: datetime64[ns](1), float64(2), object(9), period[M](1)
memory usage: 594.3+ MB


#### 5. 1. 2. Date as a Year and Month 

In [69]:
# POLICE data transform
# Parse the dates once and derive all three helper columns from that result.
police_dates = pd.to_datetime(street_police_df["Date"], errors="coerce")

# Month as a monthly period (for grouping by month)
street_police_df["Month_Period"] = police_dates.dt.to_period("M")
street_police_df["Year"] = police_dates.dt.year
street_police_df["Month"] = police_dates.dt.month

# Preview the derived columns. This must be the last expression in the cell,
# otherwise Jupyter does not display it.
street_police_df[["Date", "Month_Period", "Year", "Month"]].head()

In [70]:
street_police_df.tail()

Unnamed: 0,Crime_ID,Date,Reported_By,Falls_Within,Longitude,Latitude,Location,LSOA_Code,LSOA_Name,Crime_Type,Last_Outcome_Category,Context,Month_Period,Year,Month
5635644,36fcabb2c063f186635356d58681862de06217035a0bb3...,2024-12-01,Metropolitan Police Service,Metropolitan Police Service,-2.103086,52.600139,On or near Bushbury Road,E01010477,Wolverhampton 015F,Theft from the person,Investigation complete; no suspect identified,No context mentioned,2024-12,2024,12
5635645,7ddc1e3986e95f45c240b18f83bd691f6d9e1ffd48119c...,2024-12-01,Metropolitan Police Service,Metropolitan Police Service,-2.093462,52.587606,On or near Dean'S Road,E01010447,Wolverhampton 018E,Theft from the person,Investigation complete; no suspect identified,No context mentioned,2024-12,2024,12
5635646,01bad260b814aa7814b61aff2c75d1df1141ad833cfa92...,2024-12-01,Metropolitan Police Service,Metropolitan Police Service,-2.139659,52.577474,On or near Bristol Street,E01010466,Wolverhampton 020D,Violence and sexual offences,Under investigation,No context mentioned,2024-12,2024,12
5635647,67990b567541935fba7d1be7d072c0da6e6a97485e495a...,2024-12-01,Metropolitan Police Service,Metropolitan Police Service,-2.209463,52.193978,On or near Parking Area,E01032292,Worcester 011B,Theft from the person,Under investigation,No context mentioned,2024-12,2024,12
5635648,eda0c37f65ebf1308126e42ee5ceb1e89a3a766cce4530...,2024-12-01,Metropolitan Police Service,Metropolitan Police Service,-0.369572,50.817227,On or near Stanley Road,E01031788,Worthing 011B,Violence and sexual offences,Under investigation,No context mentioned,2024-12,2024,12


In [74]:
street_police_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5564310 entries, 0 to 5635648
Data columns (total 15 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   Crime_ID               object        
 1   Date                   datetime64[ns]
 2   Reported_By            object        
 3   Falls_Within           object        
 4   Longitude              float64       
 5   Latitude               float64       
 6   Location               object        
 7   LSOA_Code              object        
 8   LSOA_Name              object        
 9   Crime_Type             object        
 10  Last_Outcome_Category  object        
 11  Context                object        
 12  Month_Period           period[M]     
 13  Year                   int32         
 14  Month                  int32         
dtypes: datetime64[ns](1), float64(2), int32(2), object(9), period[M](1)
memory usage: 636.8+ MB


In [76]:
# Columns kept for the processed output. Date, Reported_By, Falls_Within and
# Context are left out.
police_output_columns = [
    "Crime_ID",
    "Longitude",
    "Latitude",
    "Location",
    "LSOA_Code",
    "LSOA_Name",
    "Crime_Type",
    "Last_Outcome_Category",
    "Month_Period",
    "Year",
    "Month",
]
street_police_df_simple = street_police_df[police_output_columns]


In [77]:
# Write the trimmed police frame to Processed-Data without the index column.
# NOTE(review): the file name says "LSAO", which looks like a typo for "LSOA".
# It is left unchanged here because other cells or tools may read this exact
# path; rename it everywhere at once if it is corrected.
street_police_df_simple.to_csv("Datasets/Processed-Data/Police-Data/Street_Police_Data_Cleaned_LSAO_2020_2024.csv", index=False)

### 5. 2. Population Data Transformation 

In [53]:
population_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34753 entries, 0 to 34752
Data columns (total 98 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   LSOA_Code                  34753 non-null  object 
 1   LSOA_Name                  34753 non-null  object 
 2   LA Code (2018 boundaries)  34753 non-null  object 
 3   LA name (2018 boundaries)  34753 non-null  object 
 4   LA Code (2021 boundaries)  34753 non-null  object 
 5   LA name (2021 boundaries)  34753 non-null  object 
 6   Total_Population           34753 non-null  int64  
 7   0.0                        34753 non-null  float64
 8   1.0                        34753 non-null  float64
 9   2.0                        34753 non-null  float64
 10  3.0                        34753 non-null  float64
 11  4.0                        34753 non-null  float64
 12  5.0                        34753 non-null  float64
 13  6.0                        34753 non-null  flo

In [59]:
# Tag every row with the estimate year so the yearly frames can be stacked.
population_2020["Year"] = 2020

# Keep only the area identifier, its name, the total and the year.
population_2020_simple = population_2020.loc[
    :, ["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]
]
population_2020_simple.head()

Unnamed: 0,LSOA_Code,LSOA_Name,Total_Population,Year
0,E01011949,Hartlepool 009A,1944,2020
1,E01011950,Hartlepool 008A,1298,2020
2,E01011951,Hartlepool 007A,1208,2020
3,E01011952,Hartlepool 002A,1724,2020
4,E01011953,Hartlepool 002B,2026,2020


In [72]:
population_2021.info()

Unnamed: 0,LAD 2021 Code,LAD 2021 Name,LSOA_Code,LSOA_Name,Total_Population,F0,F1,F2,F3,F4,...,M81,M82,M83,M84,M85,M86,M87,M88,M89,M90
0,E06000001,Hartlepool,E01011949,Hartlepool 009A,1854,5,10,10,10,0,...,4,5,6,6,6,0,0,6,0,0
1,E06000001,Hartlepool,E01011950,Hartlepool 008A,1037,7,5,10,5,5,...,0,5,0,6,0,0,0,0,0,0
2,E06000001,Hartlepool,E01011951,Hartlepool 007A,1203,7,10,5,5,5,...,0,5,0,0,0,5,0,0,0,0
3,E06000001,Hartlepool,E01011952,Hartlepool 002A,1610,8,16,16,14,5,...,4,0,4,6,0,6,5,0,0,11
4,E06000001,Hartlepool,E01011953,Hartlepool 002B,1970,12,21,16,14,19,...,4,0,4,0,6,0,0,0,0,5


In [73]:
# Tag every row with the estimate year so the yearly frames can be stacked.
population_2021["Year"] = 2021

# Keep only the area identifier, its name, the total and the year.
population_2021_simple = population_2021.loc[
    :, ["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]
]
population_2021_simple.head()

Unnamed: 0,LSOA_Code,LSOA_Name,Total_Population,Year
0,E01011949,Hartlepool 009A,1854,2021
1,E01011950,Hartlepool 008A,1037,2021
2,E01011951,Hartlepool 007A,1203,2021
3,E01011952,Hartlepool 002A,1610,2021
4,E01011953,Hartlepool 002B,1970,2021


In [78]:
population_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Columns: 187 entries, LAD 2023 Code to M90
dtypes: int64(183), object(4)
memory usage: 50.9+ MB


In [79]:
# Tag every row with the estimate year so the yearly frames can be stacked.
population_2022["Year"] = 2022

# Keep only the area identifier, its name, the total and the year.
population_2022_simple = population_2022.loc[
    :, ["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]
]
population_2022_simple.head()

Unnamed: 0,LSOA_Code,LSOA_Name,Total_Population,Year
0,E01011949,Hartlepool 009A,1876,2022
1,E01011950,Hartlepool 008A,1117,2022
2,E01011951,Hartlepool 007A,1260,2022
3,E01011952,Hartlepool 002A,1635,2022
4,E01011953,Hartlepool 002B,1984,2022


In [80]:
population_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Columns: 187 entries, LAD 2023 Code to M90
dtypes: int64(183), object(4)
memory usage: 50.9+ MB


In [81]:
# Tag every row with the estimate year so the yearly frames can be stacked.
population_2023["Year"] = 2023

# Keep only the area identifier, its name, the total and the year.
population_2023_simple = population_2023.loc[
    :, ["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]
]
population_2023_simple.head()

Unnamed: 0,LSOA_Code,LSOA_Name,Total_Population,Year
0,E01011949,Hartlepool 009A,1925,2023
1,E01011950,Hartlepool 008A,1177,2023
2,E01011951,Hartlepool 007A,1320,2023
3,E01011952,Hartlepool 002A,1670,2023
4,E01011953,Hartlepool 002B,2075,2023


In [82]:
population_2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35672 entries, 0 to 35671
Columns: 187 entries, LAD 2023 Code to M90
dtypes: int64(183), object(4)
memory usage: 50.9+ MB


In [83]:
# Tag every row with the estimate year so the yearly frames can be stacked.
population_2024["Year"] = 2024

# Keep only the area identifier, its name, the total and the year.
population_2024_simple = population_2024.loc[
    :, ["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]
]
population_2024_simple.head()

Unnamed: 0,LSOA_Code,LSOA_Name,Total_Population,Year
0,E01011949,Hartlepool 009A,1898,2024
1,E01011950,Hartlepool 008A,1247,2024
2,E01011951,Hartlepool 007A,1393,2024
3,E01011952,Hartlepool 002A,1669,2024
4,E01011953,Hartlepool 002B,2303,2024


In [84]:
# Stack the five yearly frames into one long table (one row per LSOA per
# year) with a fresh 0..n-1 index.
# NOTE(review): the 2020 frame has 34,753 rows while 2022-2024 have 35,672,
# so 2020 presumably uses an older LSOA boundary set - confirm before
# comparing or joining codes across years.
yearly_population_frames = [
    population_2020_simple,
    population_2021_simple,
    population_2022_simple,
    population_2023_simple,
    population_2024_simple,
]
combined_population = pd.concat(yearly_population_frames, ignore_index=True)

In [87]:
combined_population.head()

Unnamed: 0,LSOA_Code,LSOA_Name,Total_Population,Year
0,E01011949,Hartlepool 009A,1944,2020
1,E01011950,Hartlepool 008A,1298,2020
2,E01011951,Hartlepool 007A,1208,2020
3,E01011952,Hartlepool 002A,1724,2020
4,E01011953,Hartlepool 002B,2026,2020


In [88]:
combined_population.to_csv("Datasets/Processed-Data/Population-Data/Population_Cleaned_LSOA_2020_2024.csv", index=False)