# Predictive-Policing-in-the-London-Metropolitan-Police-District

## 1. Environment Setup

### 1. 1. Update Linux Packages

In [None]:
!sudo apt update

### 1. 2. Install Python and Packages (pip & venv)

In [None]:
!sudo apt install -y python3 python3-pip python3-venv

### 1. 3. Create a Virtual Environment

In [None]:
!python3 -m venv .venv

### 1. 4. Activate the Virtual Environment

In [None]:
# NOTE(review): IPython executes each `!` command in its own temporary subshell, so this
# activation does not persist to later cells or to the running kernel. To actually use
# `.venv`, activate it before launching Jupyter or select it as the notebook kernel.
!source .venv/bin/activate

### 1. 5. Install the Required Python Packages

In [None]:
%pip install -r requirements.txt

### 1. 6. Create Folders to Store Datasets

In [None]:
import os

# Every pipeline stage gets the same set of per-source sub-folders:
#   Raw-Data            -> files exactly as downloaded
#   Data-For-Processing -> the subset of files selected for cleaning
#   Processed-Data      -> cleaned outputs
dataset_stages = ("Raw-Data", "Data-For-Processing", "Processed-Data")
dataset_categories = (
    "Police-Data",
    "Map-Data",
    "Income-Data",
    "Population-Data",
    "Unemployment-Data",
    "Weather-Data",
)

for stage in dataset_stages:
    for category in dataset_categories:
        # makedirs also creates the missing parents ("Datasets", "Datasets/<stage>"),
        # and exist_ok=True keeps the cell safe to re-run.
        os.makedirs(f"Datasets/{stage}/{category}", exist_ok=True)

## 2. Data Ingestion

### 2. 1. Police Data Ingestion

In [None]:
# Download Police Data
!wget -O Datasets/Raw-Data/Police-Data/police_data_2020_2022.zip https://data.police.uk/data/archive/2022-12.zip
print("Downloaded police data from 2020 to 2022.")

!wget -O Datasets/Raw-Data/Police-Data/police_data_2022_2024.zip https://data.police.uk/data/archive/2024-12.zip
print("Downloaded police data from 2022 to 2024.")

In [None]:
# Extract Police Data
!unzip -o Datasets/Raw-Data/Police-Data/police_data_2020_2022.zip -d Datasets/Raw-Data/Police-Data/
!unzip -o Datasets/Raw-Data/Police-Data/police_data_2022_2024.zip -d Datasets/Raw-Data/Police-Data/
print("Extracted police data zip files.")

### 2. 2. Map Data Ingestion

In [None]:
# Download Map Data

# The LSOA and MSOA boundary datasets have no direct download link, so download them manually
# from the pages below and place them in Datasets/Raw-Data/Map-Data/.
# The later copy step reads the CSV exports under the names Map-LSOA-2021.csv and Map-MSOA-2021.csv.
# LSOA Map Data - https://geoportal.statistics.gov.uk/datasets/6beafcfd9b9c4c9993a06b6b199d7e6d_0/explore?location=43.468898%2C-2.489483%2C3.79 (Both csv and geojson formats available)
# MSOA Map Data - https://geoportal.statistics.gov.uk/datasets/12baf1e6a44441208ffe5ba5ed063a68_0/explore?location=52.284503%2C-1.473701%2C11.49 (Both csv and geojson formats available)

# LSOA to MSOA Lookup Data (downloaded as a zip archive; extracted in the next cell)
!wget -O Datasets/Raw-Data/Map-Data/LSOA_to_MSOA_Lookup_2021.zip https://www.arcgis.com/sharing/rest/content/items/c4f84c38814d4b82aa4760ade686c3cc/data
print("Downloaded LSOA to MSOA Lookup data.")

In [None]:
!unzip -o Datasets/Raw-Data/Map-Data/LSOA_to_MSOA_Lookup_2021.zip -d Datasets/Raw-Data/Map-Data/
print("Extracted LSOA to MSOA Lookup data.")

### 2. 3. Income Data Ingestion

In [None]:
# Download Income Data

# calculate other years - https://chatgpt.com/share/695b5cf5-8750-8007-bcbd-d531f4e8bdd9

# 2023 income data
!wget -O Datasets/Raw-Data/Income-Data/Income_Data_MSOA_2023.xlsx https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/smallareaincomeestimatesformiddlelayersuperoutputareasenglandandwales/financialyearending2023/datasetfinal.xlsx
print("Downloaded 2023 income data.")

#2020 income data
!wget -O Datasets/Raw-Data/Income-Data/Income_Data_MSOA_2020.xlsx https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/smallareaincomeestimatesformiddlelayersuperoutputareasenglandandwales/financialyearending2020/saiefy1920finalqaddownload280923.xlsx    
print("Downloaded 2020 income data.")

### 2. 4. Population Data Ingestion

In [None]:
# Download Population Data

# https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/lowersuperoutputareamidyearpopulationestimates

# Mid 2020 - 2021
!wget -O Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2020.xlsx https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/lowersuperoutputareamidyearpopulationestimates/mid2020sape23dt2/sape23dt2mid2020lsoasyoaestimatesunformatted.xlsx

# Mid 2021 - 2022
!wget -O Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2021_2022.xlsx https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/lowersuperoutputareamidyearpopulationestimates/mid2021andmid2022/sapelsoasyoatablefinal.xlsx

# Mid 2022 - 2024
!wget -O Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2022_2024.xlsx https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/lowersuperoutputareamidyearpopulationestimates/mid2022revisednov2025tomid2024/sapelsoasyoa20222024.xlsx

### 2. 5. Weather Data Ingestion

In [None]:
# Monthly Status only- https://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/heathrowdata.txt

!wget -O Datasets/Raw-Data/Weather-Data/Heathrow_Monthly_Weather_Data.txt https://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/heathrowdata.txt
print("Downloaded Heathrow monthly weather data.")

## 3. Data Preparation

### 3. 1. Import Libraries

In [None]:
import pandas as pd
from pathlib import Path
from typing import List

### 3. 2. Turn XLSX Files to CSV Files

In [None]:
# Excel workbooks downloaded during ingestion; every sheet of each one is exported to CSV.
excel_files = [
    "Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2020.xlsx",
    "Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2021_2022.xlsx",
    "Datasets/Raw-Data/Population-Data/LSOA_Midyear_Population_Estimates_2022_2024.xlsx",
    "Datasets/Raw-Data/Income-Data/Income_Data_MSOA_2020.xlsx",
    "Datasets/Raw-Data/Income-Data/Income_Data_MSOA_2023.xlsx"
]

for file in excel_files:
    file_path = Path(file)

    # CSV exports go into a "csv" folder next to the workbook.
    output_dir = file_path.parent / "csv"
    output_dir.mkdir(exist_ok=True)

    # Open the workbook once and reuse it for every sheet; the context manager
    # closes the underlying file handle even if one sheet fails to convert.
    with pd.ExcelFile(file_path) as xls:
        for sheet in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet)

            # Sheet names can contain spaces and slashes, which are awkward in file names.
            safe_sheet_name = sheet.replace(" ", "_").replace("/", "_")

            output_file = output_dir / f"{file_path.stem}_{safe_sheet_name}.csv"
            df.to_csv(output_file, index=False)

            print(f"Saved: {output_file}")


### 3. 3. Copy Files to the Data-For-Processing Folder (Without Police Data)

In [None]:
# Copy only the Necessary Files

# MAP Data
!cp Datasets/Raw-Data/Map-Data/PCD_OA21_LSOA21_MSOA21_LAD_NOV25_UK_LU.csv Datasets/Data-For-Processing/Map-Data/LSOA_to_MSOA_Lookup.csv
!cp Datasets/Raw-Data/Map-Data/Map-LSOA-2021.csv Datasets/Data-For-Processing/Map-Data/Map-LSOA.csv
!cp Datasets/Raw-Data/Map-Data/Map-MSOA-2021.csv Datasets/Data-For-Processing/Map-Data/Map-MSOA.csv

# Income Data
!cp Datasets/Raw-Data/Income-Data/csv/Income_Data_MSOA_2020_Total_annual_income.csv Datasets/Data-For-Processing/Income-Data/Total_Annual_Income_2020_MSOA.csv
!cp Datasets/Raw-Data/Income-Data/csv/Income_Data_MSOA_2023_Total_annual_income.csv Datasets/Data-For-Processing/Income-Data/Total_Annual_Income_2023_MSOA.csv

# Population Data
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2020_Mid-2020_Persons.csv Datasets/Data-For-Processing/Population-Data/Population_2020_LSOA.csv
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2021_2022_Mid-2021_LSOA_2021.csv Datasets/Data-For-Processing/Population-Data/Population_2021_LSOA.csv
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2022_2024_Mid-2022_LSOA_2021.csv Datasets/Data-For-Processing/Population-Data/Population_2022_LSOA.csv
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2022_2024_Mid-2023_LSOA_2021.csv Datasets/Data-For-Processing/Population-Data/Population_2023_LSOA.csv
!cp Datasets/Raw-Data/Population-Data/csv/LSOA_Midyear_Population_Estimates_2022_2024_Mid-2024_LSOA_2021.csv Datasets/Data-For-Processing/Population-Data/Population_2024_LSOA.csv

# Weather Data
!cp Datasets/Raw-Data/Weather-Data/Heathrow_Monthly_Weather_Data.txt Datasets/Data-For-Processing/Weather-Data/Heathrow_Monthly_Weather_1948_2025.txt


### 3. 4. Copy Police Data to the Data-For-Processing Folder

#### 3. 4. 1. Removing Non-Metropolitan Street Data from Police Data (Raw-Data Folder)

In [None]:
police_root = Path("Datasets/Raw-Data/Police-Data")
# NOTE(review): this label is only printed, never used for filtering. The download cell
# describes the archives as covering 2020 to 2024 — confirm whether "2022-01" is intended.
date_range = ("2022-01", "2024-12")
keep_token = "metropolitan-street"
# The combined CSV is written into this same folder by the "Create a Single Police Data File"
# step. Its name does not contain keep_token, so without this guard a re-run of this cell
# would delete it. Compared in lower case, like keep_token.
protected_names = {"police_street_data_2020_2024_lsoa.csv"}

if not police_root.exists():
    raise FileNotFoundError(f"Missing directory: {police_root}")

kept: List[Path] = []
removed: List[Path] = []

# Delete every police file that does not belong to the Metropolitan force.
# sorted() snapshots the directory listing first, so files are not unlinked while the
# rglob generator is still walking the tree.
for file_path in sorted(police_root.rglob("*")):
    if not file_path.is_file():
        continue
    file_name = file_path.name.lower()
    if file_name in protected_names:
        # Notebook output, not a downloaded police file: leave it alone.
        continue
    if keep_token in file_name:
        kept.append(file_path)
        continue
    file_path.unlink()
    removed.append(file_path)

print(f"Date Range - {date_range[0]} - {date_range[1]}")
for path in sorted(kept):
    print(path)

print(f"Removed {len(removed)} other files.")

#### 3. 4. 2. Create a Single Police Data File

In [None]:
police_data_root = Path("Datasets/Raw-Data/Police-Data")

def load_police_dataset(file_glob: str) -> pd.DataFrame:
    """Read every CSV under ``police_data_root`` whose name matches ``file_glob`` and stack them.

    Args:
        file_glob: Glob pattern applied recursively below ``police_data_root``.

    Returns:
        One DataFrame holding the rows of all matched files, in sorted path order,
        with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the pattern matches no file.
    """
    # Collect the matching files and fix their order so the result is deterministic.
    csv_paths: List[Path] = list(police_data_root.rglob(file_glob))
    csv_paths.sort()

    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No police files matched pattern: {file_glob}")

    print(f"Matched {len(csv_paths)} files for pattern '{file_glob}'")

    # low_memory=False makes pandas read each file in one pass before inferring dtypes.
    monthly_frames = []
    for csv_path in csv_paths:
        monthly_frames.append(pd.read_csv(csv_path, low_memory=False))

    return pd.concat(monthly_frames, ignore_index=True)

# Stack every monthly Metropolitan street-level file into a single frame.
police_street_df = load_police_dataset("*-metropolitan-street.csv")

# Simple sanity check: report how many rows the combined frame holds.
combined_row_count = len(police_street_df)
print("Combined police street rows:", combined_row_count)

# Persist the combined frame next to the raw monthly files.
police_street_df.to_csv(
    "Datasets/Raw-Data/Police-Data/Police_Street_Data_2020_2024_LSOA.csv",
    index=False,
)

#### 3. 4. 3. Copy Police Data to the Data-For-Processing Folder


In [None]:
!cp Datasets/Raw-Data/Police-Data/Police_Street_Data_2020_2024_LSOA.csv Datasets/Data-For-Processing/Police-Data/Police_Street_Data_2020_2024_LSOA.csv

## 4.  Data Cleaning

### 4. 1. Police Data Cleaning

#### 4. 1. 1. Read the Combined Police Street Data CSV

In [None]:
# Read the working copy staged in Data-For-Processing, as the other cleaning steps do,
# rather than the Raw-Data copy. low_memory=False matches how the monthly files were
# read when the combined CSV was built.
street_police_df = pd.read_csv(
    "Datasets/Data-For-Processing/Police-Data/Police_Street_Data_2020_2024_LSOA.csv",
    low_memory=False,
)

In [None]:
street_police_df.info()

In [None]:
street_police_df.head()

#### 4. 1. 2.  Rename Police Data Columns for Clarity

In [None]:
# Raw header -> underscore-separated name used by the rest of the notebook.
# Note that the raw "Month" column becomes "Date".
police_column_names = {
    "Crime ID": "Crime_ID",
    "Month": "Date",
    "Reported by": "Reported_By",
    "Falls within": "Falls_Within",
    "LSOA code": "LSOA_Code",
    "LSOA name": "LSOA_Name",
    "Crime type": "Crime_Type",
    "Last outcome category": "Last_Outcome_Category",
}
street_police_df = street_police_df.rename(columns=police_column_names)

#### 4. 1. 3. Check For Null Values

In [None]:
street_police_df.isnull().sum()

In [None]:
# Drop rows where LSOA_Code is null/NaN (or blank just in case)
before = len(street_police_df)

# Normalise the codes once, then build a single keep-mask from them.
# NaN != "" evaluates to True, so notna() is needed to exclude missing codes.
lsoa_code = street_police_df["LSOA_Code"].astype("object").str.strip()
has_lsoa_code = lsoa_code.notna() & (lsoa_code != "")

# .copy() makes the filtered frame independent of the original, so later column
# assignments on it do not raise SettingWithCopyWarning.
street_police_df = street_police_df.assign(LSOA_Code=lsoa_code).loc[has_lsoa_code].copy()

after = len(street_police_df)
print(f"Dropped {before - after} rows with null/blank LSOA_Code. Remaining: {after}")

In [None]:
# Replace missing values with meaningful placeholders (column -> text used where empty)
police_placeholders = {
    "Crime_ID": "No crime ID",
    "Last_Outcome_Category": "No outcome mentioned",
    "Context": "No context mentioned",
}
for column_name, placeholder in police_placeholders.items():
    street_police_df[column_name] = street_police_df[column_name].fillna(placeholder)

In [None]:
street_police_df.isnull().sum()

### 4. 2. Map Data Cleaning

#### 4. 2. 1. Read the Map Data

In [None]:
MSOA_Map = pd.read_csv("Datasets/Data-For-Processing/Map-Data/Map-MSOA.csv")
LSOA_Map = pd.read_csv("Datasets/Data-For-Processing/Map-Data/Map-LSOA.csv")
Lookup_Map = pd.read_csv("Datasets/Data-For-Processing/Map-Data/LSOA_to_MSOA_Lookup.csv")

#### 4. 2. 2.  Clean MSOA_Map Data

In [None]:
MSOA_Map.info()

In [None]:
MSOA_Map.head()

##### 4. 2. 2. 1. Rename MSOA Map Columns for Clarity

In [None]:
# Source header -> descriptive column name for the MSOA map data.
msoa_map_column_names = {
    "MSOA21CD": "MSOA_Code",
    "MSOA21NM": "MSOA_Name",
    "LAT": "Latitude",
    "LONG": "Longitude",
    "BNG_E": "British_National_Grid_Easting",
    "BNG_N": "British_National_Grid_Northing",
    "Shape__Area": "Shape_Area",
    "Shape__Length": "Shape_Length",
}
MSOA_Map = MSOA_Map.rename(columns=msoa_map_column_names)

In [None]:
MSOA_Map.head()

##### 4. 2. 2. 2. Check For Null Values

In [None]:
MSOA_Map.isnull().sum()

#### 4. 2. 3. Clean LSOA_Map Data

In [None]:
LSOA_Map.info()

In [None]:
LSOA_Map.head()

##### 4. 2. 3. 1. Rename LSOA Map Columns for Clarity

In [None]:
# Source header -> descriptive column name for the LSOA map data.
lsoa_map_column_names = {
    "LSOA21CD": "LSOA_Code",
    "LSOA21NM": "LSOA_Name",
    "LAT": "Latitude",
    "LONG": "Longitude",
    "BNG_E": "British_National_Grid_Easting",
    "BNG_N": "British_National_Grid_Northing",
    "Shape__Area": "Shape_Area",
    "Shape__Length": "Shape_Length",
}
LSOA_Map = LSOA_Map.rename(columns=lsoa_map_column_names)

In [None]:
LSOA_Map.head()

##### 4. 2. 3. 2. Check For Null Values

In [None]:
LSOA_Map.isnull().sum()

#### 4. 2. 4. Clean LSOA_to_MSOA_Lookup Data

In [None]:
Lookup_Map.info()

In [None]:
Lookup_Map.head()

##### 4. 2. 4. 1. Rename Lookup Map Columns for Clarity

In [None]:
# Only the LSOA/MSOA code and name columns are renamed; all other lookup columns keep
# their original lower-case headers.
lookup_column_names = {
    "lsoa21cd": "LSOA_Code",
    "msoa21cd": "MSOA_Code",
    "lsoa21nm": "LSOA_Name",
    "msoa21nm": "MSOA_Name",
}
Lookup_Map = Lookup_Map.rename(columns=lookup_column_names)

In [None]:
Lookup_Map.head()

##### 4. 2. 4. 2. Check For Null Values

In [None]:
Lookup_Map.isnull().sum()

In [None]:
# Drop rows where LSOA_Code is null/NaN (or blank)
before = len(Lookup_Map)

# Normalise the codes once, then build a single keep-mask from them.
# NaN != "" evaluates to True, so notna() is needed to exclude missing codes.
lookup_lsoa_code = Lookup_Map["LSOA_Code"].astype("object").str.strip()
has_lookup_lsoa_code = lookup_lsoa_code.notna() & (lookup_lsoa_code != "")

# .copy() makes the filtered frame independent of the original, so later column
# assignments on it do not raise SettingWithCopyWarning.
Lookup_Map = Lookup_Map.assign(LSOA_Code=lookup_lsoa_code).loc[has_lookup_lsoa_code].copy()
after = len(Lookup_Map)

print(f"Dropped {before - after} rows with null/blank LSOA_Code. Remaining: {after}")

In [None]:
# Replace missing values with meaningful placeholders (column -> text used where empty)
lookup_placeholders = {
    "LSOA_Name": "No LSOA Name",
    "MSOA_Name": "No MSOA Name",
}
for column_name, placeholder in lookup_placeholders.items():
    Lookup_Map[column_name] = Lookup_Map[column_name].fillna(placeholder)

In [None]:
Lookup_Map.isnull().sum()

### 4. 3. Income Data Cleaning

#### 4. 3. 1. Read the Income Data

In [None]:
Income_Data_2020 = pd.read_csv("Datasets/Data-For-Processing/Income-Data/Total_Annual_Income_2020_MSOA.csv",
                            skiprows=4
                                )
Income_Data_2023 = pd.read_csv("Datasets/Data-For-Processing/Income-Data/Total_Annual_Income_2023_MSOA.csv",
                            skiprows=3
                                )

#### 4. 3. 2. Clean 2020 Income Data

In [None]:
Income_Data_2020.head()

In [None]:
Income_Data_2020.info()

##### 4. 3. 2. 1. Rename Income 2020 Columns for Clarity

In [None]:
# Source header -> descriptive column name for the 2020 income data.
income_2020_column_names = {
    "MSOA code": "MSOA_Code",
    "MSOA name": "MSOA_Name",
    "Total annual income (£)": "Total_Annual_Income_British_Pounds",
}
Income_Data_2020 = Income_Data_2020.rename(columns=income_2020_column_names)

In [None]:
Income_Data_2020.head()

In [None]:
# Check for null values
Income_Data_2020.isnull().sum()

#### 4. 3. 3. Clean 2023 Income Data

In [None]:
Income_Data_2023.head()

In [None]:
Income_Data_2023.info()

##### 4. 3. 3. 1. Rename Income 2023 Columns for Clarity

In [None]:
# Rename the 2023 income columns. This cell previously renamed Income_Data_2020 a second
# time, which left Income_Data_2023 with its raw headers.
# NOTE(review): assumes the 2023 sheet uses the same headers as the 2020 one; rename()
# silently ignores keys that are absent, so confirm with Income_Data_2023.head().
income_2023_column_names = {
    "MSOA code": "MSOA_Code",
    "MSOA name": "MSOA_Name",
    "Total annual income (£)": "Total_Annual_Income_British_Pounds",
}
Income_Data_2023 = Income_Data_2023.rename(columns=income_2023_column_names)

In [None]:
Income_Data_2023.head()

In [None]:
# Check for null values
Income_Data_2023.isnull().sum()

### 4. 4. Population Data Cleaning

In [None]:
population_2020 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2020_LSOA.csv", skiprows=4)
population_2021 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2021_LSOA.csv", skiprows=3)
population_2022 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2022_LSOA.csv", skiprows=3)
population_2023 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2023_LSOA.csv", skiprows=3)
population_2024 = pd.read_csv("Datasets/Data-For-Processing/Population-Data/Population_2024_LSOA.csv", skiprows=3)

#### 4. 4. 1. Clean 2020 Population Data

In [None]:
population_2020.head()

In [None]:
population_2020.info()

In [None]:
# Rename columns for clarity (the 2020 file labels the total column "All Ages")
population_2020_column_names = {
    "LSOA Code": "LSOA_Code",
    "LSOA Name": "LSOA_Name",
    "All Ages": "Total_Population",
}
population_2020 = population_2020.rename(columns=population_2020_column_names)

In [None]:
population_2020.head()

In [None]:
# Check for null values
population_2020.isnull().sum()

#### 4. 4. 2. Clean 2021 Population Data

In [None]:
population_2021.head()

In [None]:
population_2021.info()

In [None]:
# Rename columns for clarity (source header -> name used by the rest of the notebook)
population_2021_column_names = {
    "LSOA 2021 Code": "LSOA_Code",
    "LSOA 2021 Name": "LSOA_Name",
    "Total": "Total_Population",
}
population_2021 = population_2021.rename(columns=population_2021_column_names)

In [None]:
population_2021.head()

In [None]:
# Check for null values
population_2021.isnull().sum()

#### 4. 4. 3. Clean 2022 Population Data

In [None]:
population_2022.head()

In [None]:
population_2022.info()

In [None]:
# Rename columns for clarity (source header -> name used by the rest of the notebook)
population_2022_column_names = {
    "LSOA 2021 Code": "LSOA_Code",
    "LSOA 2021 Name": "LSOA_Name",
    "Total": "Total_Population",
}
population_2022 = population_2022.rename(columns=population_2022_column_names)

In [None]:
population_2022.head()

In [None]:
# Check for null values
population_2022.isnull().sum()

#### 4. 4. 4. Clean 2023 Population Data

In [None]:
population_2023.head()

In [None]:
population_2023.info()

In [None]:
# Rename columns for clarity (source header -> name used by the rest of the notebook)
population_2023_column_names = {
    "LSOA 2021 Code": "LSOA_Code",
    "LSOA 2021 Name": "LSOA_Name",
    "Total": "Total_Population",
}
population_2023 = population_2023.rename(columns=population_2023_column_names)

In [None]:
population_2023.head()

In [None]:
# Check for null values
population_2023.isnull().sum()

#### 4. 4. 5. Clean 2024 Population Data

In [None]:
population_2024.head()

In [None]:
population_2024.info()

In [None]:
# Rename columns for clarity (source header -> name used by the rest of the notebook)
population_2024_column_names = {
    "LSOA 2021 Code": "LSOA_Code",
    "LSOA 2021 Name": "LSOA_Name",
    "Total": "Total_Population",
}
population_2024 = population_2024.rename(columns=population_2024_column_names)

In [None]:
population_2024.head()

In [None]:
# Check for null values
population_2024.isnull().sum()

### 4. 6. Weather Data Cleaning

In [None]:
weather_data = pd.read_fwf("Datasets/Data-For-Processing/Weather-Data/Heathrow_Monthly_Weather_1948_2025.txt", skiprows=5)

In [None]:
weather_data.head()

In [None]:
weather_data.tail()

In [None]:
weather_data.info()

In [None]:
# Drop the units row that follows the header. It is identified by its content (no
# numeric year) rather than by position, so re-running this cell cannot delete the
# first real observation the way a positional drop(index=0) would.
# NOTE(review): assumes the first column holds the year, both before and after the
# rename step — confirm against weather_data.head().
year_values = pd.to_numeric(weather_data.iloc[:, 0], errors="coerce")
weather_data = weather_data.loc[year_values.notna()].reset_index(drop=True)

In [None]:
weather_data.head()

In [None]:
# Rename columns for clarity (source abbreviation -> descriptive name)
weather_column_names = {
    "yyyy": "Year",
    "mm": "Month",
    "tmax": "Max_Temperature_Celsius",
    "tmin": "Min_Temperature_Celsius",
    "af": "Air_Frost_Days",
    "rain": "Rainfall_mm",
    "sun": "Sunshine_Hours",
}
weather_data = weather_data.rename(columns=weather_column_names)

In [None]:
# Check for null values
weather_data.isnull().sum()

## 5. Data Transformation

### 5.  1. Police Data Transformation

In [None]:
street_police_df.info()

#### 5. 1. 1. Change the Data Types

In [None]:
street_police_df["Context"] = street_police_df["Context"].astype("object")
street_police_df["Date"] = street_police_df["Date"].astype("datetime64[ns]")


In [None]:
street_police_df.info()

#### 5. 1. 2. Date as a Time Period, Year and Month

In [None]:
# "Date" was converted to datetime in the previous step, so the .dt accessor is used
# directly; no second parsing pass is needed.
street_police_df["Month_Period"] = street_police_df["Date"].dt.to_period("M")
street_police_df["Year"] = street_police_df["Date"].dt.year
street_police_df["Month"] = street_police_df["Date"].dt.month

# Preview the derived columns. This must be the cell's last expression: a bare
# expression in the middle of a cell is evaluated but never displayed.
street_police_df[["Date", "Month_Period", "Year", "Month"]].head()

In [None]:
street_police_df.tail()

In [None]:
street_police_df.info()

#### 5. 1. 3. Save Relevant Police Data

In [None]:
# Create a simplified DataFrame with only relevant columns
street_police_df_simple = street_police_df[["Crime_ID", "Longitude", "Latitude", "Location", "LSOA_Code", "LSOA_Name", "Crime_Type", "Last_Outcome_Category", "Month_Period", "Year", "Month"]]


In [None]:
# Save cleaned police data (only the columns selected above) without the index column.
# NOTE(review): the file name spells "LSAO" where the other outputs use "LSOA". It is
# left unchanged here because later steps may read this exact path — confirm before renaming.
street_police_df_simple.to_csv("Datasets/Processed-Data/Police-Data/Street_Police_Data_Cleaned_LSAO_2020_2024.csv", index=False)

### 5. 2. Population Data Transformation 

#### 5. 2. 1. Population 2020 Data Transformation

In [None]:
population_2020.info()

In [None]:
# Add Year column
population_2020["Year"] = 2020

# Create a simplified DataFrame with only relevant columns
population_2020_simple = population_2020[["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]]
population_2020_simple.head()

#### 5. 2. 2. Population 2021 Data Transformation

In [None]:
population_2021.info()

In [None]:
# Add Year column
population_2021["Year"] = 2021

# Create a simplified DataFrame with only relevant columns
population_2021_simple = population_2021[["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]]
population_2021_simple.head()

#### 5. 2. 3. Population 2022 Data Transformation

In [None]:
population_2022.info()

In [None]:
# Add Year column
population_2022["Year"] = 2022

# Create a simplified DataFrame with only relevant columns
population_2022_simple = population_2022[["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]]
population_2022_simple.head()

#### 5. 2. 4. Population 2023 Data Transformation

In [None]:
population_2023.info()

In [None]:
# Add Year column
population_2023["Year"] = 2023

# Create a simplified DataFrame with only relevant columns
population_2023_simple = population_2023[["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]]
population_2023_simple.head()

#### 5. 2. 5. Population 2024 Data Transformation

In [None]:
population_2024.info()

In [None]:
# Add Year column
population_2024["Year"] = 2024

# Create a simplified DataFrame with only relevant columns
population_2024_simple = population_2024[["LSOA_Code", "LSOA_Name", "Total_Population", "Year"]]
population_2024_simple.head()

#### 5. 2. 6. Combine All Data into One Dataset

In [None]:
# Combine all years into a single DataFrame
combined_population = pd.concat([
    population_2020_simple,
    population_2021_simple,
    population_2022_simple,
    population_2023_simple,
    population_2024_simple
], ignore_index=True)

In [None]:
combined_population.head()

In [None]:
# Save cleaned population data
combined_population.to_csv("Datasets/Processed-Data/Population-Data/Population_Cleaned_LSOA_2020_2024.csv", index=False)

### 5. 3. Map Data Transformation

#### 5. 3. 1. LSOA Map Transformation

In [None]:
LSOA_Map.info()

In [None]:
# Create a simplified DataFrame with only relevant columns
LSOA_Map_Simple = LSOA_Map[["LSOA_Code", "LSOA_Name", "Latitude", "Longitude", "British_National_Grid_Easting", "British_National_Grid_Northing", "Shape_Area", "Shape_Length"]]

In [None]:
# Save cleaned LSOA map data
LSOA_Map_Simple.to_csv("Datasets/Processed-Data/Map-Data/LSOA_Map_Cleaned.csv", index=False)

#### 5. 3. 2. MSOA Map Transformation

In [None]:
MSOA_Map.info()

In [None]:
# Create a simplified DataFrame with only relevant columns
MSOA_Map_Simple = MSOA_Map[["MSOA_Code", "MSOA_Name", "Latitude", "Longitude", "British_National_Grid_Easting", "British_National_Grid_Northing", "Shape_Area", "Shape_Length"]]

In [None]:
# Save cleaned MSOA map data
MSOA_Map_Simple.to_csv("Datasets/Processed-Data/Map-Data/MSOA_Map_Cleaned.csv", index=False)

#### 5. 3. 3. LSOA to MSOA Lookup Map Transformation

In [98]:
Lookup_Map.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2709549 entries, 0 to 2720555
Data columns (total 14 columns):
 #   Column     Dtype  
---  ------     -----  
 0   pcd7       object 
 1   pcd8       object 
 2   pcds       object 
 3   dointr     int64  
 4   doterm     float64
 5   usertype   int64  
 6   oa21cd     object 
 7   LSOA_Code  object 
 8   MSOA_Code  object 
 9   ladcd      object 
 10  LSOA_Name  object 
 11  MSOA_Name  object 
 12  ladnm      object 
 13  ladnmw     object 
dtypes: float64(1), int64(2), object(11)
memory usage: 310.1+ MB


In [99]:
# Create a simplified DataFrame with only relevant columns.
# The lookup holds one row per postcode (pcd7/pcd8/pcds columns, about 2.7 million rows),
# so keeping only the area columns repeats the same LSOA -> MSOA pair many times.
# Dropping the duplicates leaves one row per distinct combination, which keeps the saved
# file small and stops later joins on LSOA_Code from multiplying rows.
Lookup_Map_Simple = (
    Lookup_Map[["LSOA_Code", "MSOA_Code", "LSOA_Name", "MSOA_Name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [101]:
# Save cleaned LSOA to MSOA lookup data
Lookup_Map_Simple.to_csv("Datasets/Processed-Data/Map-Data/LSOA_MSOA_Lookup_Cleaned.csv", index=False)

### 5. 4. Weather Data Transformation

In [103]:
weather_data.head()

Unnamed: 0,Year,Month,Max_Temperature_Celsius,Min_Temperature_Celsius,Air_Frost_Days,Rainfall_mm,Sunshine_Hours
0,1948.0,1.0,8.9,3.3,---,85.0,---
1,1948.0,2.0,7.9,2.2,---,26.0,---
2,1948.0,3.0,14.2,3.8,---,14.0,---
3,1948.0,4.0,15.4,5.1,---,35.0,---
4,1948.0,5.0,18.1,6.9,---,57.0,---


#### 5. 4. 1. Convert Year and Month to Month_Period


In [None]:
# Convert Year and Month to Month_Period
weather_data["Month_Period"] = pd.to_datetime(
    weather_data["Year"].astype(int).astype(str) + "-" + weather_data["Month"].astype(int).astype(str).str.zfill(2),
    format="%Y-%m"
).dt.to_period("M")
weather_data[["Year", "Month", "Month_Period"]].head()

Unnamed: 0,Year,Month,Month_Period
0,1948.0,1.0,1948-01
1,1948.0,2.0,1948-02
2,1948.0,3.0,1948-03
3,1948.0,4.0,1948-04
4,1948.0,5.0,1948-05


In [105]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 936 entries, 0 to 935
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype    
---  ------                   --------------  -----    
 0   Year                     936 non-null    float64  
 1   Month                    936 non-null    float64  
 2   Max_Temperature_Celsius  936 non-null    object   
 3   Min_Temperature_Celsius  936 non-null    object   
 4   Air_Frost_Days           936 non-null    object   
 5   Rainfall_mm              936 non-null    object   
 6   Sunshine_Hours           936 non-null    object   
 7   Month_Period             936 non-null    period[M]
dtypes: float64(2), object(5), period[M](1)
memory usage: 58.6+ KB


In [None]:
# Save cleaned weather data
weather_data.to_csv("Datasets/Processed-Data/Weather-Data/Heathrow_Monthly_Weather_Cleaned_1948_2025.csv", index=False)

### 5. 5. Income Data Transformation