In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

In [3]:
# Read the postal-code reference table and open a lazy, chunked reader over
# the culture dataset (1,000,000 rows per chunk, ';'-separated).
df = pd.read_csv("../Data/latlongCP.csv", sep=",", low_memory=False)
chunks = pd.read_csv('../Data/Culture.csv', chunksize=1000000, low_memory=False, sep=";")

# Add float copies of the coordinates (still in degrees), then discard
# incomplete rows: first those missing a coordinate or postal code, then
# any row that has a missing value in any remaining column.
df = (
    df.assign(
        lat=df["latitude"].astype(float),
        lon=df["longitude"].astype(float),
    )
    .dropna(subset=["lat", "lon", "code_postal"])
    .dropna()
)

# The haversine metric expects [lat, lon] pairs expressed in radians.
coords_rad = np.deg2rad(df[["lat", "lon"]].to_numpy())

# Index the reference points for fast nearest-neighbor lookups.
tree = BallTree(coords_rad, metric="haversine")

def latlon_to_zip(lat, lon):
    """Return the postal code of the reference point nearest to (lat, lon).

    Parameters
    ----------
    lat, lon : float
        Coordinates in degrees; they are converted to radians before the
        haversine nearest-neighbor query against the module-level ``tree``.

    Returns
    -------
    str
        The ``code_postal`` value of the nearest row of the module-level
        ``df``, converted to text and left-padded with zeros to at least
        five characters.
    """
    _, idx = tree.query(np.radians([[lat, lon]]), k=1)
    # Select the column before the row: `df.iloc[i]["code_postal"]` would
    # first build a Series for the whole row, which is slower and, when every
    # column is numeric, upcasts an integer postal code to float ("2160.0").
    postal = df["code_postal"].iloc[idx[0, 0]]
    return str(postal).zfill(5)

# Map every culture record to its nearest postal code, one chunk at a time,
# then count records per postal code and save the result.
chunk_list = []

# Postal codes in the same row order as the points stored in `tree`
# (both come from `df`), so a neighbor index returned by the tree can be
# used directly as a position in this array.
postal_codes = df["code_postal"].to_numpy()

for chunk in chunks:
    print("Processing a new chunk...")
    coords = chunk[['Latitude', 'Longitude']].dropna()
    if coords.empty:
        # Nothing to map in this chunk; also avoids querying the tree with
        # an empty array.
        continue

    # One batched nearest-neighbor query per chunk instead of one query per
    # row: same neighbors, without the per-call overhead of a row-wise apply.
    _, nearest = tree.query(np.radians(coords.to_numpy(dtype=float)), k=1)
    zips = (
        pd.Series(postal_codes[nearest[:, 0]], index=coords.index)
        .astype(str)
        .str.zfill(5)
    )
    chunk_list.append(zips.to_frame("CodePostal"))

# Concatenate all chunks into a single DataFrame. It gets its own name so the
# postal reference table `df` (read by latlon_to_zip) is not overwritten.
if chunk_list:
    culture_zips = pd.concat(chunk_list, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty result.
    culture_zips = pd.DataFrame({"CodePostal": pd.Series(dtype=str)})

# Count records per postal code over ALL chunks (not just the last one).
counts = culture_zips.groupby("CodePostal").size().reset_index(name="CultureCount")
print(counts.sort_values(by="CultureCount", ascending=False).head(10))

counts.to_csv('../DataCleaned/Culture_Cleaned.csv', index=False)

Processing a new chunk...
    CodePostal  CultureCount
119      92350          5700
44       32350          3496
1        02160          3217
25       22330          2979
87       71220          2308
88       71570          2290
62       48200          2174
82       66420          2156
111      86200          2139
83       67130          1985
