# Real-World Crime Analytics for the London Metropolitan Area

## 1. Environment Setup (Linux)

### 1.1. Update Linux Packages

In [None]:
!sudo apt update

### 1.2. Install Python and Packages (pip & venv)

In [None]:
!sudo apt install -y python3 python3-pip python3-venv

### 1.3. Create a Virtual Environment

In [None]:
!python3 -m venv venv

### 1.4. Activate the Virtual Environment

In [None]:
# NOTE(review): `!` hands this line to a separate shell process, so the
# activation is presumably lost as soon as the command returns and does not
# affect the running kernel or later cells — confirm the kernel itself was
# started from the venv if package isolation is required.
!source venv/bin/activate

### 1.5. Install the Required Python Packages

In [None]:
%pip install -r requirements.txt

## 2. Data Ingestion

### 2.1. Create Folders to Store Data

In [None]:
import os

# Folder tree for the raw downloads; parents are listed before their children.
# exist_ok=True makes the cell safe to re-run.
for raw_dir in (
    "Datasets",
    "Datasets/Raw-Data",
    "Datasets/Raw-Data/Map-Data",
    "Datasets/Raw-Data/Police-Data",
    "Datasets/Raw-Data/Income-Data",
    "Datasets/Raw-Data/Population-Data",
):
    os.makedirs(raw_dir, exist_ok=True)

### 2.2. Download Data

#### 2.2.1. Download Income Data - <a>www.ons.gov.uk</a>

In [None]:
!wget -O Datasets/Raw-Data/Income-Data/Income-MSOA.xlsx "https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/smallareaincomeestimatesformiddlelayersuperoutputareasenglandandwales/financialyearending2020/saiefy1920finalqaddownload280923.xlsx"
print("Downloaded to Datasets/Raw-Data/Income-Data/Income-MSOA.xlsx")

#### 2.2.2. Download Population Data - <a>www.ons.gov.uk</a>

In [None]:
!wget -O Datasets/Raw-Data/Population-Data/Population-LSOA.xlsx "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/lowersuperoutputareamidyearpopulationestimates/mid2022revisednov2025tomid2024/sapelsoasyoa20222024.xlsx"
print("Downloaded to Datasets/Raw-Data/Population-Data/Population-LSOA.xlsx")

#### 2.2.3. Download Police Data - <a>data.police.uk</a>

In [None]:
!wget -O Datasets/Raw-Data/Police-Data/RAW-POLICE-2022-10-TO-2025-09.zip "https://data.police.uk/data/archive/2025-09.zip"
print("Downloaded to Datasets/Raw-Data/RAW-POLICE-2022-10-TO-2025-09.zip")

In [None]:
# Extract the police data zip file
!unzip -o Datasets/Raw-Data/RAW-POLICE-2022-10-TO-2025-09.zip -d Datasets/Raw-Data/Police-Data
print("Extracted police data to Datasets/Raw-Data/Police-Data/")

In [None]:
# Clean up the zip file to save space
!rm -rf Datasets/Raw-Data/RAW-POLICE-2022-10-TO-2025-09.zip

#### 2.2.4. Download Map Data - <a>geoportal.statistics.gov.uk</a>

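#### Note: Map data from ONS Geography can't be downloaded directly. Use the links below to download it manually as CSV, and save the files as `Map-LSOA-2021.csv` and `Map-MSOA-2021.csv` in `Datasets/Raw-Data/Map-Data/`.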

1. LSOAs - <a>https://geoportal.statistics.gov.uk/datasets/ons::output-areas-december-2021-boundaries-ew-bgc-v2/about</a>

2. MSOAs - <a>https://geoportal.statistics.gov.uk/datasets/ons::middle-layer-super-output-areas-december-2021-boundaries-ew-bfc-v7-2/about</a>

### 2.3. Converting File Types and Keeping Necessary Files

#### 2.3.1. Convert XLSX (Excel) files to CSV using pandas

In [None]:
import pandas as pd
import os
import re

def sanitize(name: str) -> str:
    """Turn a worksheet name into a filename-safe token.

    Every run of characters outside ``A-Z``, ``a-z`` and ``0-9`` is replaced
    by one underscore, leading and trailing underscores are removed, and an
    empty result falls back to ``"Sheet"``.
    """
    underscored = re.sub(r"[^A-Za-z0-9]+", "_", name)
    trimmed = underscored.strip("_")
    collapsed = re.sub(r"_+", "_", trimmed)
    if not collapsed:
        return "Sheet"
    return collapsed

def export_workbook_sheets(xlsx_path: str, out_dir: str, prefix: str, label: str) -> None:
    """Write every sheet of an Excel workbook to its own CSV file.

    Args:
        xlsx_path: Path of the workbook to read.
        out_dir: Folder that receives the CSVs; created if missing.
        prefix: Filename prefix; each file is named ``<prefix>-<sanitized sheet>.csv``.
        label: Short workbook name used only in the failure message.

    The export is best-effort: any exception is reported with ``print`` and
    not re-raised, so the notebook keeps running.
    """
    os.makedirs(out_dir, exist_ok=True)
    try:
        # The context manager closes the workbook handle even if a sheet fails.
        with pd.ExcelFile(xlsx_path) as workbook:
            sheet_names = workbook.sheet_names
            for sheet in sheet_names:
                sheet_df = pd.read_excel(workbook, sheet_name=sheet)
                out_path = os.path.join(out_dir, f"{prefix}-{sanitize(sheet)}.csv")
                sheet_df.to_csv(out_path, index=False)
        print(f"Exported {len(sheet_names)} sheets from {xlsx_path} to {out_dir}")
    except Exception as e:
        print(f"Failed to process {label} workbook:", e)


# Convert Income-MSOA.xlsx: write one CSV per sheet
income_xlsx_path = "Datasets/Raw-Data/Income-Data/Income-MSOA.xlsx"
income_out_dir = "Datasets/Raw-Data/Income-Data/Income-MSOA"
export_workbook_sheets(income_xlsx_path, income_out_dir, "Income-MSOA", "income")

# Convert Population-LSOA.xlsx: write one CSV per sheet
population_xlsx_path = "Datasets/Raw-Data/Population-Data/Population-LSOA.xlsx"
population_out_dir = "Datasets/Raw-Data/Population-Data/Population-LSOA"
export_workbook_sheets(population_xlsx_path, population_out_dir, "Population-LSOA", "population")

#### 2.3.2. Retain Only Metropolitan Police Records

In [None]:
from pathlib import Path
from typing import List

police_root = Path("Datasets/Raw-Data/Police-Data")
date_range = ("2022-10", "2025-09")
keep_token = "metropolitan"

# Stop early if the extraction step has not produced the folder.
if not police_root.exists():
    raise FileNotFoundError(f"Missing directory: {police_root}")

kept: List[Path] = []
removed: List[Path] = []

# Walk the whole tree; directories are skipped, files whose lower-cased name
# contains the keep token are recorded, every other file is deleted.
for entry in police_root.rglob("*"):
    if entry.is_file():
        if keep_token in entry.name.lower():
            kept.append(entry)
        else:
            entry.unlink()
            removed.append(entry)

print(f"Date Range - {date_range[0]} - {date_range[1]}")
for kept_file in sorted(kept):
    print(kept_file)

print(f"Removed {len(removed)} other files.")

### 2.4. Copy Only the Necessary Files for Cleaning

In [None]:
# Staging area that holds only the files the cleaning stage reads.
for staging_dir in (
    "Datasets/Data-for-Cleaning",
    "Datasets/Data-for-Cleaning/Police-Data",
    "Datasets/Data-for-Cleaning/Income-Data",
    "Datasets/Data-for-Cleaning/Population-Data",
    "Datasets/Data-for-Cleaning/Map-Data",
):
    os.makedirs(staging_dir, exist_ok=True)

#### 2.4.1. Copy Police Data

In [None]:
!cp -r Datasets/Raw-Data/Police-Data/* Datasets/Data-for-Cleaning/Police-Data/

#### 2.4.2. Copy Map Data

In [None]:
!cp -r Datasets/Raw-Data/Map-Data/* Datasets/Data-for-Cleaning/Map-Data/

#### 2.4.3. Copy Income Data

In [None]:
!cp -r Datasets/Raw-Data/Income-Data/Income-MSOA/Income-MSOA-Total_annual_income.csv Datasets/Data-for-Cleaning/Income-Data/

#### 2.4.4. Copy Population Data

In [None]:
!cp -r Datasets/Raw-Data/Population-Data/Population-LSOA/Population-LSOA-Mid_2022_LSOA_2021.csv Datasets/Data-for-Cleaning/Population-Data/
!cp -r Datasets/Raw-Data/Population-Data/Population-LSOA/Population-LSOA-Mid_2023_LSOA_2021.csv Datasets/Data-for-Cleaning/Population-Data/
!cp -r Datasets/Raw-Data/Population-Data/Population-LSOA/Population-LSOA-Mid_2024_LSOA_2021.csv Datasets/Data-for-Cleaning/Population-Data/

## 3. Data Cleaning

### 3.1. Initialize Spark

In [None]:
# Import and initialize Spark
import pyspark
from pathlib import Path
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import NumericType


# Create the Spark session, or reuse one that already exists, with a local master.
spark = (
    SparkSession.builder
    .appName("Real-World Crime Analytics for the London Metropolitan Area")
    .master("local[*]")
    .getOrCreate()
)

print("Spark version:", spark.version)

### 3.2. Cleaning Income Data

#### 3.2.1.  Reading the Income Data - Data & Schema

In [None]:
# Location of the income sheet that was copied into the cleaning area.
total_income_path = "Datasets/Data-for-Cleaning/Income-Data/Income-MSOA-Total_annual_income.csv"

# First look: treat the first physical line as the header and keep every
# column as a string, purely to inspect the file layout.
total_income_df = spark.read.csv(total_income_path, header=True)

total_income_df.show(5)
total_income_df.printSchema()

#### 3.2.2.  Remove Extra Header Lines and Load Cleaned Income Data

In [None]:
# Read the file as plain text so that lines can be dropped by position.
raw = spark.read.text(total_income_path)

# Pair each line with its index and keep only index >= 4,
# i.e. discard the first four lines of the file.
indexed_lines = raw.rdd.zipWithIndex()
clean_lines = (
    indexed_lines
    .filter(lambda pair: pair[1] >= 4)
    .map(lambda pair: pair[0].value)
)

# Parse the remaining lines as CSV; the first kept line supplies the header
# and column types are inferred from the data.
clean_df = spark.read.csv(clean_lines, header=True, inferSchema=True)

total_income_df = clean_df

total_income_df.show(5)
total_income_df.printSchema()

#### 3.2.3. Summary Statistics for Cleaned Income Data

In [None]:
total_income_df.describe().show()

#### 3.2.4. Detecting Duplicate Rows in Cleaned Income Data


In [None]:
# Group on every column: a group with more than one member is a fully duplicated row.
duplicates = (
    total_income_df
    .groupBy(*total_income_df.columns)
    .count()
    .filter("count > 1")
)

duplicates.show()

### 3.3. Cleaning Population Data

#### 3.3.1.  Reading the Population Data - Data & Schema

In [None]:
# One staged CSV per mid-year estimate, keyed by year.
population_paths = {
    "2022": "Datasets/Data-for-Cleaning/Population-Data/Population-LSOA-Mid_2022_LSOA_2021.csv",
    "2023": "Datasets/Data-for-Cleaning/Population-Data/Population-LSOA-Mid_2023_LSOA_2021.csv",
    "2024": "Datasets/Data-for-Cleaning/Population-Data/Population-LSOA-Mid_2024_LSOA_2021.csv",
}

# First look at each file: header taken from the first physical line,
# all columns left as strings.
population_dfs = {}
for year in population_paths:
    first_look = spark.read.csv(population_paths[year], header=True)
    population_dfs[year] = first_look
    print(f"Population {year} Data:")
    first_look.show(5)
    first_look.printSchema()

population_2022_df, population_2023_df, population_2024_df = (
    population_dfs[year] for year in ("2022", "2023", "2024")
)

#### 3.3.2.  Remove Extra Header Lines and Load Cleaned Population Data

In [None]:
def clean_population(path: str, skip_lines: int = 3):
    """Load a population CSV after discarding its leading preamble lines.

    The file is read as plain text, the first ``skip_lines`` lines are
    dropped by position, and the remainder is parsed as CSV with the first
    kept line used as the header and column types inferred.

    Args:
        path: Location of the CSV file.
        skip_lines: Number of leading lines to discard before the header row.
            Defaults to 3, the value previously hard-coded here.

    Returns:
        A Spark DataFrame built from the remaining lines.

    Raises:
        ValueError: If ``skip_lines`` is negative.
    """
    if skip_lines < 0:
        raise ValueError(f"skip_lines must be >= 0, got {skip_lines}")
    rows = spark.read.text(path)
    # zipWithIndex pairs each line with its position so lines can be filtered by index.
    trimmed = (
        rows.rdd.zipWithIndex()
        .filter(lambda pair: pair[1] >= skip_lines)
        .map(lambda pair: pair[0].value)
    )
    return spark.read.option("header", "true").option("inferSchema", "true").csv(trimmed)

# Re-read every yearly file with its preamble lines removed.
population_clean = {}
for year, path in population_paths.items():
    population_clean[year] = clean_population(path)

population_2022_df, population_2023_df, population_2024_df = (
    population_clean[year] for year in ("2022", "2023", "2024")
)

for year in population_clean:
    print(f"Cleaned Population {year} Data:")
    population_clean[year].show(5)
    population_clean[year].printSchema()

#### 3.3.3. Summary Statistics for Cleaned Population Data

In [None]:
# Summary statistics for each cleaned yearly frame, in chronological order.
for summary_year, summary_frame in (
    ("2022", population_2022_df),
    ("2023", population_2023_df),
    ("2024", population_2024_df),
):
    print(f"Population {summary_year} Data:")
    summary_frame.describe().show()

#### 3.3.4. Detecting Duplicate Rows in Cleaned Population Data


In [None]:
# For each year, group on all columns; groups with count > 1 are duplicated rows.
duplicates = {}
for year, frame in population_clean.items():
    duplicates[year] = frame.groupBy(*frame.columns).count().filter("count > 1")

duplicates_2022, duplicates_2023, duplicates_2024 = (
    duplicates[year] for year in ("2022", "2023", "2024")
)

for year in duplicates:
    print(f"Duplicate Rows in Population {year} Data:")
    duplicates[year].show()

### 3.4. Cleaning Map Data

#### 3.4.1.  Reading the Map Data - Data & Schema

In [None]:
# Load and display the LSOA and MSOA map datasets
map_paths = {
    "lsoa": "Datasets/Data-for-Cleaning/Map-Data/Map-LSOA-2021.csv",
    "msoa": "Datasets/Data-for-Cleaning/Map-Data/Map-MSOA-2021.csv",
}

map_dfs = {}
# The loop variable is deliberately not called `map`: a notebook-level name
# `map` would hide the builtin for every cell that runs afterwards.
for map_level, path in map_paths.items():
    df = spark.read.option("header", "true").csv(path)
    map_dfs[map_level] = df
    print(f"Map {map_level.upper()} Data:")
    df.show(5)
    df.printSchema()

map_lsoa_df = map_dfs["lsoa"]
map_msoa_df = map_dfs["msoa"]


#### 3.4.2. Summary Statistics for Map Data

In [None]:
print("Map LSOA Data:")
map_lsoa_df.describe().show()

print("Map MSOA Data:")
map_msoa_df.describe().show()


#### 3.4.3. Detecting Duplicate Rows in Map Data


In [None]:
# Group each map frame on all of its columns; count > 1 marks duplicated rows.
duplicates_map = {
    level: frame.groupBy(*frame.columns).count().filter("count > 1")
    for level, frame in (("lsoa", map_lsoa_df), ("msoa", map_msoa_df))
}

duplicates_lsoa, duplicates_msoa = duplicates_map["lsoa"], duplicates_map["msoa"]

for map_type in duplicates_map:
    print(f"Duplicate Rows in Map {map_type.upper()} Data:")
    duplicates_map[map_type].show()

### 3.5. Cleaning Police Data

#### 3.5.1. Combine All Police data into a Single Frame


In [None]:

police_data_root = Path("Datasets/Data-for-Cleaning/Police-Data")


def load_police_dataset(file_glob: str) -> DataFrame:
    """Read every police CSV under ``police_data_root`` matching ``file_glob``.

    The search is recursive. Matching files are sorted and read together
    into one DataFrame, with the header row used for column names and
    column types inferred.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    csv_paths = [str(found) for found in sorted(police_data_root.rglob(file_glob))]
    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No police files matched pattern: {file_glob}")
    print(f"Matched {len(csv_paths)} files for pattern '{file_glob}'")

    reader = spark.read.option("header", "true").option("inferSchema", "true")
    return reader.csv(csv_paths)

# One combined DataFrame per police file family.
police_outcomes_df = load_police_dataset("*-metropolitan-outcomes.csv")
police_stop_search_df = load_police_dataset("*-metropolitan-stop-and-search.csv")
police_street_df = load_police_dataset("*-metropolitan-street.csv")

# Expose each frame to Spark SQL under a temp-view name.
for view_name, view_frame in (
    ("police_outcomes", police_outcomes_df),
    ("police_stop_and_search", police_stop_search_df),
    ("police_street", police_street_df),
):
    view_frame.createOrReplaceTempView(view_name)

# Row counts as a quick sanity check.
for count_label, counted_frame in (
    ("Combined police outcomes rows:", police_outcomes_df),
    ("Combined police stop and search rows:", police_stop_search_df),
    ("Combined police street rows:", police_street_df),
):
    print(count_label, counted_frame.count())

#### 3.5.2. Police Outcomes Data Check

##### 3.5.2.1. Police Outcomes Schema Check

In [None]:
print("Police Outcomes Schema:")
police_outcomes_df.printSchema()

##### 3.5.2.2. Police Outcomes Data Rows Check

In [None]:
print("First 5 Records")
police_outcomes_df.orderBy("Month").show(5)

print("Last 5 Records")
police_outcomes_df.orderBy("Month").tail(5)

##### 3.5.2.3. Police Outcomes Values Check

In [None]:
police_outcomes_df.describe().show()

##### 3.5.2.4. Fill Missing Values (Including Coordinates) with Placeholders


In [None]:
# Step 1: fill nulls with the literal text "None". With a string fill value,
# PySpark's fillna leaves non-string columns untouched.
outcomes_text_filled = police_outcomes_df.fillna("None")

# Step 2: collect the numeric columns from the schema.
numeric_cols_outcomes = []
for field in outcomes_text_filled.schema.fields:
    if isinstance(field.dataType, NumericType):
        numeric_cols_outcomes.append(field.name)

# Step 3: fill nulls in those numeric columns with 0.0 (skipped when there are none).
if numeric_cols_outcomes:
    police_outcomes_df = outcomes_text_filled.fillna(0.0, subset=numeric_cols_outcomes)
else:
    police_outcomes_df = outcomes_text_filled

police_outcomes_df.describe().show()

#### 3.5.3. Police Stop and Search Data Check

##### 3.5.3.1. Police Stop and Search Schema Check

In [None]:
print("Police Stop and Search Schema:")
police_stop_search_df.printSchema()

##### 3.5.3.2. Police Stop and Search Data Rows Check

In [None]:
print("First 5 Records")
police_stop_search_df.orderBy("Date").show(5)

print("Last 5 Records")
police_stop_search_df.orderBy("Date").tail(5)

##### 3.5.3.3. Police Stop and Search Values Check

In [None]:
police_stop_search_df.describe().show()

##### 3.5.3.4. Fill Missing Values (Including Coordinates) with Placeholders


In [None]:
# Nulls are first filled with the literal text "None"; with a string fill
# value, PySpark's fillna leaves non-string columns untouched.
stop_search_text_filled = police_stop_search_df.fillna("None")

# Numeric columns are discovered from the schema rather than listed by hand.
numeric_cols_stop = [
    field.name
    for field in stop_search_text_filled.schema.fields
    if isinstance(field.dataType, NumericType)
]

# Nulls in numeric columns become 0.0; nothing to do when the list is empty.
police_stop_search_df = (
    stop_search_text_filled.fillna(0.0, subset=numeric_cols_stop)
    if numeric_cols_stop
    else stop_search_text_filled
)

police_stop_search_df.describe().show()

#### 3.5.4. Police Street Data Check

##### 3.5.4.1. Police Street Schema Check

In [None]:
# Label matches the frame being inspected (street-level data).
print("Police Street Schema:")
police_street_df.printSchema()

##### 3.5.4.2. Police Street Data Rows Check

In [None]:
print("First 5 Records")
police_street_df.orderBy("Month").show(5)

print("Last 5 Records")
police_street_df.orderBy("Month").tail(5)

##### 3.5.4.3. Police Street Values Check

In [None]:
police_street_df.describe().show()

##### 3.5.4.4. Fill Missing Values (Including Coordinates) with Placeholders


In [None]:
# Pass 1: fill nulls with the literal text "None"; with a string fill value,
# PySpark's fillna leaves non-string columns untouched.
street_text_filled = police_street_df.fillna("None")

# Pass 2: identify numeric columns via their Spark data type.
street_fields = street_text_filled.schema.fields
numeric_cols_street = [
    field.name for field in street_fields if isinstance(field.dataType, NumericType)
]

# Pass 3: replace nulls in numeric columns with 0.0, if there are any such columns.
police_street_df = street_text_filled
if len(numeric_cols_street) > 0:
    police_street_df = street_text_filled.fillna(0.0, subset=numeric_cols_street)

police_street_df.describe().show()

## 4. Data Transformation

### 4.0. Create Folders for Cleaned Data

In [None]:
# Output folders for the transformed datasets; makedirs also creates the
# shared "Datasets/Cleaned-Data" parent on first use.
for cleaned_dir in (
    "Datasets/Cleaned-Data/Police-Data",
    "Datasets/Cleaned-Data/Income-Data",
    "Datasets/Cleaned-Data/Population-Data",
    "Datasets/Cleaned-Data/Map-Data",
):
    os.makedirs(cleaned_dir, exist_ok=True)

### 4.1. Transform Cleaned Income Dataset

In [None]:
total_income_df.show(5)

#### 4.1.1. Select Only the Required Columns from total_income_df


In [None]:
# Resolve the income column by its textual prefix instead of spelling out the
# currency symbol: the symbol is easily corrupted by an encoding mismatch
# between this notebook and the CSV, which would make an exact-name select fail.
income_value_cols = [
    name for name in total_income_df.columns
    if name.strip().startswith("Total annual income")
]
if len(income_value_cols) != 1:
    raise ValueError(
        "Expected exactly one 'Total annual income' column, "
        f"found {income_value_cols} in {total_income_df.columns}"
    )

selected_income_df = total_income_df.select(
    "MSOA code",
    "MSOA name",
    income_value_cols[0]
)

selected_income_df.show(5)

#### 4.1.2. Standardize Column Names

In [None]:
def simplify_column_names(df):
    """Return ``df`` with column names lower-cased and spaces replaced by underscores.

    The renaming is positional: the new names are passed to ``df.toDF`` in
    the same order as ``df.columns``.
    """
    renamed = []
    for original in df.columns:
        renamed.append(original.lower().replace(" ", "_"))
    return df.toDF(*renamed)

selected_income_df = simplify_column_names(selected_income_df)
selected_income_df.show(5)

#### 4.1.3. Verify Income Data Schema

In [None]:
selected_income_df.printSchema()

#### 4.1.4. Save the Cleaned Income Data

In [None]:
selected_income_df.coalesce(1).write.mode("overwrite").option("header", "true").csv("Datasets/Cleaned-Data/Income-Data/")

### 4.2. Transform Cleaned Population Dataset

#### 4.2.1. Display the Population DataFrames

In [None]:
population_2022_df.show(5)
population_2023_df.show(5)
population_2024_df.show(5)

#### 4.2.2. Select Required Columns from Population Data

In [None]:
def select_population_columns(population_df):
    """Keep only the ``LSOA 2021 Code``, ``LSOA 2021 Name`` and ``Total`` columns."""
    wanted = ("LSOA 2021 Code", "LSOA 2021 Name", "Total")
    return population_df.select(*wanted)

selected_population_2022_df = select_population_columns(population_2022_df)
selected_population_2023_df = select_population_columns(population_2023_df)
selected_population_2024_df = select_population_columns(population_2024_df)

selected_population_2022_df.show(5)
selected_population_2023_df.show(5)
selected_population_2024_df.show(5)


#### 4.2.3. Standardize Population Column Names

In [None]:
def simplify_column_names(df):
    """Lower-case every column name of ``df`` and swap spaces for underscores.

    Returns the result of ``df.toDF`` called with the new names in their
    original order.
    """
    tidy_names = [name.lower() for name in df.columns]
    tidy_names = [name.replace(" ", "_") for name in tidy_names]
    return df.toDF(*tidy_names)

selected_population_2022_df = simplify_column_names(selected_population_2022_df)
selected_population_2022_df.show(5)

selected_population_2023_df = simplify_column_names(selected_population_2023_df)
selected_population_2023_df.show(5)

selected_population_2024_df = simplify_column_names(selected_population_2024_df)
selected_population_2024_df.show(5)

#### 4.2.4. Verify Population Data Schema

In [None]:
selected_population_2022_df.printSchema()
selected_population_2023_df.printSchema()
selected_population_2024_df.printSchema()

#### 4.2.5. Save Cleaned Population Data

In [None]:
# Save the cleaned population data: one single-part CSV folder per year.
population_outputs = (
    (selected_population_2022_df, "Datasets/Cleaned-Data/Population-Data/2022"),
    (selected_population_2023_df, "Datasets/Cleaned-Data/Population-Data/2023"),
    (selected_population_2024_df, "Datasets/Cleaned-Data/Population-Data/2024"),
)
for yearly_frame, yearly_dir in population_outputs:
    yearly_frame.coalesce(1).write.mode("overwrite").option("header", "true").csv(yearly_dir)

### 4.3. Transform Cleaned Map Dataset

#### 4.3.1. Display the Map DataFrames

In [None]:
map_lsoa_df.show(5)
map_msoa_df.show(5)

#### 4.3.2. Select Required Columns from LSOA Map Data

In [None]:
def select_lsoa_columns(map_lsoa_df):
    """Keep only the ``LSOA21CD``, ``LSOA21NM``, ``LAT`` and ``LONG`` columns of the LSOA map frame."""
    wanted = ("LSOA21CD", "LSOA21NM", "LAT", "LONG")
    return map_lsoa_df.select(*wanted)

selected_map_lsoa_df = select_lsoa_columns(map_lsoa_df)

selected_map_lsoa_df.show(5)

#### 4.3.3. Select Required Columns from MSOA Map Data

In [None]:
def select_msoa_columns(map_msoa_df):
    """Keep only the ``MSOA21CD``, ``MSOA21NM``, ``LAT`` and ``LONG`` columns of the MSOA map frame."""
    wanted = ("MSOA21CD", "MSOA21NM", "LAT", "LONG")
    return map_msoa_df.select(*wanted)

selected_map_msoa_df = select_msoa_columns(map_msoa_df)

selected_map_msoa_df.show(5)

#### 4.3.4. Standardize Map Column Names

In [None]:
def simplify_column_names(df):
    """Produce a copy of ``df`` whose column names are lower-case with underscores for spaces."""
    def _tidy(name):
        # Lower-case first, then replace each space with an underscore.
        lowered = name.lower()
        return lowered.replace(" ", "_")

    return df.toDF(*[_tidy(name) for name in df.columns])

selected_map_lsoa_df = simplify_column_names(selected_map_lsoa_df)
selected_map_lsoa_df.show(5)

selected_map_msoa_df = simplify_column_names(selected_map_msoa_df)
selected_map_msoa_df.show(5)


#### 4.3.5. Verify Map Data Schema

In [None]:
selected_map_lsoa_df.printSchema()
selected_map_msoa_df.printSchema()

#### 4.3.6. Save Cleaned Map Data

In [None]:
# Save the cleaned map data: one single-part CSV folder per geography level.
map_outputs = (
    (selected_map_lsoa_df, "Datasets/Cleaned-Data/Map-Data/lsoa"),
    (selected_map_msoa_df, "Datasets/Cleaned-Data/Map-Data/msoa"),
)
for level_frame, level_dir in map_outputs:
    level_frame.coalesce(1).write.mode("overwrite").option("header", "true").csv(level_dir)
    

### 4.4. Transform Cleaned Police Dataset

In [None]:
police_outcomes_df.show(5)
police_stop_search_df.show(5)
police_street_df.show(5)

#### 4.4.1. Standardize Police Data Column Names

In [None]:
def simplify_column_names(df):
    """Rename the columns of ``df`` to lower-case names with spaces replaced by underscores.

    Column order is preserved; the renamed frame comes from ``df.toDF``.
    """
    current = list(df.columns)
    normalized = [current[i].lower().replace(" ", "_") for i in range(len(current))]
    return df.toDF(*normalized)

police_outcomes_df = simplify_column_names(police_outcomes_df)
police_outcomes_df.show(5)

police_stop_search_df = simplify_column_names(police_stop_search_df)
police_stop_search_df.show(5)

police_street_df = simplify_column_names(police_street_df)
police_street_df.show(5)

#### 4.4.2. Save Police Data as CSV (Commented Out - Using Parquet Instead)

In [None]:
# Disabled: CSV export of the cleaned police data (stop-search, outcomes, street).
# police_stop_search_df.coalesce(1).write.mode("overwrite").option("header", "true").csv("Datasets/Cleaned-Data/Police-Data/stop-search")
# police_outcomes_df.coalesce(1).write.mode("overwrite").option("header", "true").csv("Datasets/Cleaned-Data/Police-Data/outcomes")
# police_street_df.coalesce(1).write.mode("overwrite").option("header", "true").csv("Datasets/Cleaned-Data/Police-Data/street")

#### 4.4.3. Save Police Data as Parquet with Snappy Compression

In [None]:
# Write each police frame as snappy-compressed Parquet, replacing earlier output.
# Order: stop & search, outcomes, street.
police_parquet_targets = (
    (police_stop_search_df, "Datasets/Cleaned-Data/Police-Data/stop-search-parquet"),
    (police_outcomes_df, "Datasets/Cleaned-Data/Police-Data/outcomes-parquet"),
    (police_street_df, "Datasets/Cleaned-Data/Police-Data/street-parquet"),
)
for parquet_frame, parquet_dir in police_parquet_targets:
    writer = parquet_frame.write.mode("overwrite").option("compression", "snappy")
    writer.parquet(parquet_dir)


#### 4.4.4. Save Police Data as Parquet with Snappy Compression

## 5. Data Querying