In [14]:
import pandas as pd
import numpy as np
import ast
from sklearn.cluster import KMeans

def _parse_lat_lon(raw_location):
    """Return ``(latitude, longitude)`` read from a dict-literal string.

    Anything that is not a parseable Python literal, or that does not parse
    to a dict, yields ``(None, None)`` so the row can be dropped later.
    """
    try:
        parsed = ast.literal_eval(raw_location)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed text or a non-string cell (e.g. NaN): treat as "no location".
        return None, None
    if not isinstance(parsed, dict):
        return None, None
    return parsed.get("latitude"), parsed.get("longitude")


def _hour_from_hhmm(times):
    """Return the hour taken from HHMM-style values; NaN where unparseable.

    Values are left-padded to four characters and the first two are read as
    the hour, so ``5`` -> 0 and ``2330`` -> 23. A trailing ``.0`` (a column
    that pandas loaded as float) is removed first so ``30.0`` is not read as
    hour 30.
    """
    text = times.astype(str).str.strip().str.replace(r"\.0+$", "", regex=True)
    return pd.to_numeric(text.str.zfill(4).str[:2], errors="coerce")


def clean_la_dataset_with_fixed_zones(
    csv_path: str,
    n_zones: int = 5,
    output_path: str = "../dataset/clean_la.csv",
    random_state: int = 42,
) -> pd.DataFrame:
    """Clean a raw crime CSV, derive features, and write the result to disk.

    Steps: keep the relevant columns, derive hour / weekday / weekend flags,
    extract numeric coordinates from the ``Location`` dict literal, cluster
    the coordinates into ``n_zones`` KMeans zones, add simulated weather
    columns, integer-encode premise and crime type, and save the final
    feature table as CSV.

    Args:
        csv_path: Path of the raw CSV. Column names are stripped of
            surrounding whitespace before use.
        n_zones: Number of KMeans clusters (default 5, chosen to match the
            Chicago zone count).
        output_path: Where the cleaned CSV is written.
        random_state: Seed for KMeans and for the simulated temperature and
            rain columns. A private generator is used, so the global NumPy
            random state is left untouched.

    Returns:
        The cleaned feature DataFrame that was written to ``output_path``.

    Raises:
        KeyError: If a required column is missing from the CSV.

    Rows whose time or coordinates cannot be parsed, or whose latitude or
    longitude equals 0, are dropped.
    """
    df = pd.read_csv(csv_path)
    df = df.rename(columns=str.strip)

    # Keep only relevant columns
    cols = [
        "Date Occurred", "Time Occurred", "Area Name", "Crime Code Description",
        "Premise Description", "Weapon Description", "Status Description",
        "Location",
    ]
    df = df[cols].copy()

    # Parse date and time
    df["Date Occurred"] = pd.to_datetime(df["Date Occurred"], errors="coerce")
    df["Hour"] = _hour_from_hhmm(df["Time Occurred"])
    df["DayOfWeek"] = df["Date Occurred"].dt.day_name()
    df["IsWeekend"] = df["DayOfWeek"].isin(["Saturday", "Sunday"]).astype(int)

    # Extract latitude and longitude, parsing each Location value only once.
    pairs = [_parse_lat_lon(value) for value in df["Location"]]
    raw_coords = pd.DataFrame(
        pairs, columns=["Latitude", "Longitude"], index=df.index
    )
    # The dict may hold coordinates as strings; convert so the zero filter
    # below and the clustering operate on real numbers.
    df["Latitude"] = pd.to_numeric(raw_coords["Latitude"], errors="coerce")
    df["Longitude"] = pd.to_numeric(raw_coords["Longitude"], errors="coerce")

    # Drop rows with an unparseable time or invalid coordinates
    df = df.dropna(subset=["Hour", "Latitude", "Longitude"])
    df = df[(df["Latitude"] != 0) & (df["Longitude"] != 0)].copy()
    df["Hour"] = df["Hour"].astype(int)

    # Create zones using KMeans (same number as Chicago)
    coords = df[["Latitude", "Longitude"]].values
    kmeans = KMeans(n_clusters=n_zones, random_state=random_state, n_init=10)
    df["Zone"] = kmeans.fit_predict(coords)

    # Light condition: hours 6 through 18 inclusive count as day
    df["Light_Condition"] = np.where(df["Hour"].between(6, 18), "Day", "Night")

    # Temperature and rain are simulated, not observed. A local RandomState
    # yields the same stream as np.random.seed(random_state) would, without
    # reseeding the process-wide generator.
    rng = np.random.RandomState(random_state)
    noise = rng.normal(0, 3, len(df))
    df["Temperature"] = 15 + 10 * np.sin(df["Hour"] * np.pi / 12) + noise

    # Rain (10% chance)
    df["Rain"] = rng.choice([0, 1], size=len(df), p=[0.9, 0.1])

    # Encode location and crime types (missing values are encoded as -1)
    df["Location_Type_Label"] = pd.factorize(df["Premise Description"])[0]
    # Constant placeholder columns; this function never derives them from
    # the data. Presumably kept for schema parity with the Chicago dataset.
    df["Arrest"] = 0
    df["Domestic"] = 0
    df["Crime_Type"] = df["Crime Code Description"]
    df["Crime_Type_Label"] = pd.factorize(df["Crime_Type"])[0]

    # Select final features
    cleaned = df[[
        "Crime_Type", "Latitude", "Longitude", "Zone", "Hour", "DayOfWeek",
        "IsWeekend", "Light_Condition", "Temperature", "Rain",
        "Location_Type_Label", "Arrest", "Domestic", "Crime_Type_Label",
    ]].copy()

    cleaned.to_csv(output_path, index=False)
    # Report the path actually written rather than a hard-coded file name.
    print(f"Cleaned dataset saved as {output_path}")
    print("Shape:", cleaned.shape)
    print("Unique crime types:", cleaned["Crime_Type"].nunique())
    print("Zones created:", cleaned["Zone"].nunique())

    return cleaned

# Example usage: run the cleaner on the raw CSV export. The call is left as a
# bare expression; this file appears to be a notebook cell, where that makes
# the returned DataFrame the cell's displayed result.
# NOTE(review): the input file is named SanFrancisco.csv although the function
# name and its output file refer to Los Angeles — confirm this is the intended
# input.
clean_la_dataset_with_fixed_zones("../dataset/SanFrancisco.csv")


  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight


Cleaned dataset saved as clean_losangeles.csv
Shape: (1993259, 14)
Unique crime types: 140
Zones created: 5


Unnamed: 0,Crime_Type,Latitude,Longitude,Zone,Hour,DayOfWeek,IsWeekend,Light_Condition,Temperature,Rain,Location_Type_Label,Arrest,Domestic,Crime_Type_Label
0,VEHICLE - STOLEN,34.0454,-118.3157,4,23,Friday,0,Night,13.901952,0,0,0,0,0
1,VEHICLE - STOLEN,33.9572,-118.2717,4,18,Wednesday,0,Day,4.585207,0,0,0,0,0
2,VEHICLE - STOLEN,34.1211,-118.2048,4,22,Wednesday,0,Night,11.943066,0,1,0,0,0
3,VEHICLE - STOLEN,34.241,-118.3987,3,16,Sunday,1,Day,10.908836,0,0,0,0,0
4,VEHICLE - STOLEN,34.3147,-118.4589,3,16,Wednesday,0,Day,5.637286,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1993254,SHOPLIFTING - PETTY THEFT ($950 & UNDER),34.1883,-118.6274,3,12,Friday,0,Day,9.909550,0,18,0,0,50
1993255,KIDNAPPING,33.9449,-118.2367,4,19,Tuesday,0,Night,11.239028,0,0,0,0,58
1993256,OTHER MISCELLANEOUS CRIME,34.1244,-118.1985,4,21,Friday,0,Night,7.986678,0,0,0,0,4
1993257,"EMBEZZLEMENT, GRAND THEFT ($950.01 & OVER)",34.2227,-118.5361,3,16,Tuesday,0,Day,8.310842,0,5,0,0,5
