In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import uuid

# Input and output CSV locations: raw_path is the combined raw crime export
# read by this cell; cleaned_path is where the cleaned subset is written.
raw_path, cleaned_path = (
    Path("C:/Users/rohit/OneDrive/Desktop/Uni/Dissertation/cleaned_dataset/combined_city_of_london_crime.csv"),
    Path("C:/Users/rohit/OneDrive/Desktop/Uni/Dissertation/cleaned_dataset/cleaned_city_of_london_crime_for_spatial_analysis.csv"),
)
# ----------------------------------------

# Load the raw CSV and work out which columns hold the fields used below.
df = pd.read_csv(raw_path)
print(f"Total rows loaded: {len(df)}")

# Header names are matched case-insensitively and ignoring surrounding
# whitespace: lower_map maps each lower-cased name to the stripped original.
df = df.rename(columns=str.strip)
lower_map = dict(zip(map(str.lower, df.columns), df.columns))

# Each *_col is the actual column name, or None when the file has no such column.
lon_col, lat_col, loc_col, month_col = (
    lower_map.get(key) for key in ("longitude", "latitude", "location", "month")
)
# The crime ID header is accepted with either a space or an underscore.
crime_id_col = next(
    (lower_map[key] for key in ("crime id", "crime_id") if key in lower_map),
    None,
)

# Nothing below can work without coordinates, so stop early if they are absent.
if None in (lon_col, lat_col):
    raise ValueError("Longitude/Latitude columns not found.")

# Keep only rows that can be placed on a map: both coordinates must parse as
# numbers, and the location text (when that column exists) must not equal
# "no location" once surrounding whitespace and letter case are ignored.
df[lon_col] = pd.to_numeric(df[lon_col], errors="coerce")  # unparsable -> NaN
df[lat_col] = pd.to_numeric(df[lat_col], errors="coerce")
keep_row = df[lon_col].notna() & df[lat_col].notna()
if loc_col is not None:
    location_text = df[loc_col].fillna("").str.strip().str.lower()
    keep_row &= location_text != "no location"
df = df[keep_row].copy()

# Give every row with a blank or missing crime ID a fresh random UUID before
# de-duplicating, so that rows lacking an ID are not treated as duplicates of
# one another when duplicates are dropped on the ID column.
if crime_id_col is not None:
    id_text = df[crime_id_col].astype(str).str.strip()
    null_mask = df[crime_id_col].isna() | id_text.eq("")
    n_missing = int(null_mask.sum())
    if n_missing > 0:
        df.loc[null_mask, crime_id_col] = [str(uuid.uuid4()) for _ in range(n_missing)]
    # Among rows sharing a crime ID, only the first occurrence is kept.
    df = df.drop_duplicates(subset=[crime_id_col], keep="first")

# Convert the month column (when present) to datetimes; values that cannot be
# parsed become NaT instead of raising.
if month_col is not None:
    parsed_month = pd.to_datetime(df[month_col], errors="coerce")
    df[month_col] = parsed_month

# Outlier handling in two steps: first drop rows outside the 1st-99th
# percentile range of either coordinate, then keep only the points that lie
# inside a fixed bounding box (bounds inclusive in both steps).
# NOTE(review): the percentile trim runs before the bounding box, so when the
# data is already clean it still discards the outermost valid points —
# confirm this is intended.
lon_q1, lon_q99 = df[lon_col].quantile([0.01, 0.99])
lat_q1, lat_q99 = df[lat_col].quantile([0.01, 0.99])
df_trim = df[df[lon_col].between(lon_q1, lon_q99) & df[lat_col].between(lat_q1, lat_q99)].copy()

# Bounding box in decimal degrees; presumably the City of London extent —
# TODO confirm the exact limits against the study area.
bbox = dict(lat_min=51.507, lat_max=51.525, lon_min=-0.112, lon_max=-0.064)
in_bbox = (
    df_trim[lon_col].between(bbox["lon_min"], bbox["lon_max"])
    & df_trim[lat_col].between(bbox["lat_min"], bbox["lat_max"])
)
df_city = df_trim[in_bbox].copy()

if df_city.empty:
    # Best-effort fallback: continue with the percentile-trimmed data rather
    # than an empty frame, but say so, because the saved file would otherwise
    # look like a bounding-box-filtered result when it is not.
    print("Warning: no rows fall inside the bounding box; using the percentile-trimmed data instead.")
    df_city = df_trim.copy()

# Report how many rows survived the cleaning steps.
print(f"Total rows after cleaning: {len(df_city)}")

# Write the result without the index column, creating the destination folder
# (and any missing parents) first.
output_dir = cleaned_path.parent
output_dir.mkdir(parents=True, exist_ok=True)
df_city.to_csv(cleaned_path, index=False)
print(f"Cleaned file saved to: {cleaned_path}")


Total rows loaded: 19172
Total rows after cleaning: 16638
Cleaned file saved to: C:\Users\rohit\OneDrive\Desktop\Uni\Dissertation\cleaned_dataset\cleaned_city_of_london_crime_for_spatial_analysis.csv
