In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

In [40]:
# Load the CSV
df = pd.read_csv("Data/latlongCP.csv", sep=",",low_memory=False)
chunks = pd.read_csv('Data/Culture.csv', chunksize=1000000, low_memory=False, sep=";")

# Extract lat/lon as radians (for haversine distance)
df["lat"] = df["latitude"].astype(float)
df["lon"] = df["longitude"].astype(float)
df = df.dropna(subset=["lat", "lon", "code_postal"])
df = df.dropna()
coords_rad = np.radians(df[["lat", "lon"]].to_numpy())

# Build BallTree (fast nearest-neighbor search)
tree = BallTree(coords_rad, metric="haversine")

def latlon_to_zip(lat, lon):
    """Return closest postal code for given latitude, longitude."""
    print(lat, lon)
    dist, idx = tree.query(np.radians([[lat, lon]]), k=1)
    postal = df.iloc[idx[0][0]]["code_postal"]
    print("postal",postal)
    return str(postal)

chunk_list = []
for chunk in chunks:
    print("Processing a new chunk...")
    df1 = pd.DataFrame(chunk)
    df2 = df1[['Latitude','Longitude']].dropna()
    df3 = pd.DataFrame()
    ''' if needed x = 1130
    x_padded = str(x).zfill(6)
    print(x_padded)  # output: 001130'''

    df3['CodePostal'] = df2.apply(lambda row: latlon_to_zip(row['Latitude'], row['Longitude']), axis=1)
    chunk_list.append(df3)

# Concatenate all chunks into a single DataFrame
df = pd.concat(chunk_list, ignore_index=True)

# Display basic information about the dataset
print(df.info())
print(df.head())
df.to_csv('DataCleaned/Culture_Cleaned.csv', index=False)

Processing a new chunk...
45.9581098634323 5.35858969556051
postal 1130
49.3840096583855 3.32837377789723
postal 2160
49.2190971790064 -0.356054338461469
postal 14220
49.4193888621204 0.234937672855018
postal 76610
44.9244749808599 2.44411373788183
postal 15290
44.9309145630983 2.44475997902592
postal 15290
45.034927829231 3.09240608025448
postal 48200
45.6451700242156 0.153721739870912
postal 24490
45.6593673172358 0.160892646437427
postal 24490
45.6520213841466 0.162786148236133
postal 24490
45.6802214753235 -0.337861897938119
postal 17390
45.4464249388785 -0.432554511497704
postal 24490
46.131631485123 3.45640510915965
postal 71220
45.9367359081496 -0.962457996267473
postal 17480
46.1405298980339 -1.15338079133719
postal 17000
46.1598986015101 -1.14763868548343
postal 17000
46.1598310892013 -1.1500067416188
postal 17000
45.7448993483815 -0.635287492911286
postal 17390
47.1022310471162 2.41523167323581
postal 89110
47.0595139773684 2.39098584965009
postal 89110
45.0933377495691 1.935

In [43]:
df4 = pd.read_csv("DataCleaned/Culture_Cleaned.csv", sep=",",low_memory=False)
counts = df4.groupby("CodePostal").size().reset_index(name="Count")
print(counts.sort_values(by="Count", ascending=False).head(10))

counts.to_csv('DataCleaned/Culture_Cleaned.csv', index=False)

     CodePostal  Count
119       92350   5700
44        32350   3496
1          2160   3217
25        22330   2979
87        71220   2308
88        71570   2290
62        48200   2174
82        66420   2156
111       86200   2139
83        67130   1985


In [44]:
df4 = pd.read_csv("DataCleaned/Culture_Cleaned.csv", sep=",",low_memory=False)
print(df4.head(10))

   CodePostal  Count
0        1130    287
1        2160   3217
2        2460   1343
3        4000     23
4        4120    188
5        4170     98
6        4380    168
7        4510    222
8        4530    180
9        6110   1733


In [61]:
# Load the CSV
chunks = pd.read_csv('Data/LoyerT1T2.csv', chunksize=1000000, low_memory=False, sep=";", encoding="latin1")

chunk_list = []
for chunk in chunks:
    print("Processing a new chunk...")
    df1 = pd.DataFrame(chunk)
    df2 = df1[['DEP', 'loypredm2']].dropna().copy()
    df2 = df2.rename(columns={'DEP':'Département','loypredm2':'Loyerm2'})
    df2['Loyerm2'] = df2['Loyerm2'].astype(str).str.replace(',', '.').astype(float)
    chunk_list.append(df2)

# Concatenate all chunks into a single DataFrame
df = pd.concat(chunk_list, ignore_index=True)
df['Loyerm2'] = pd.to_numeric(df['Loyerm2'], errors='coerce')
mean = df.groupby("Département")["Loyerm2"].mean().reset_index(name="Loyerm2Mean")

# Display basic information about the dataset
print(mean.info())
print(mean.head())
mean.to_csv('DataCleaned/LoyerT1T2_Cleaned.csv', index=False)

Processing a new chunk...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Département  100 non-null    object 
 1   Loyerm2Mean  100 non-null    float64
dtypes: float64(1), object(1)
memory usage: 1.7+ KB
None
  Département  Loyerm2Mean
0          01    11.872627
1          02    11.225315
2          03     9.060860
3          04    11.546030
4          05    11.392891


In [65]:
# Load the CSV
chunks = pd.read_csv('Data/Education.csv', chunksize=1000000, low_memory=False, sep=";")

chunk_list = []
for chunk in chunks:
    print("Processing a new chunk...")
    df1 = pd.DataFrame(chunk)
    df2 = df1[['Adresse : code postal']].dropna().copy()
    df2 = df2.rename(columns={'Adresse : code postal':'CodePostal'})
    chunk_list.append(df2)

# Concatenate all chunks into a single DataFrame
df = pd.concat(chunk_list, ignore_index=True)
count = df.groupby("CodePostal").size().reset_index(name="Count")

# Display basic information about the dataset
print(count.info())
print(count.head())
print(count.sort_values(by="Count", ascending=False).head(10))
count.to_csv('DataCleaned/Education_Cleaned.csv', index=False)

Processing a new chunk...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8051 entries, 0 to 8050
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   CodePostal  8051 non-null   int64
 1   Count       8051 non-null   int64
dtypes: int64(2)
memory usage: 125.9 KB
None
   CodePostal  Count
0        1000     36
1        1001      2
2        1011      4
3        1090      9
4        1100     36
      CodePostal  Count
6137       75019    130
6133       75015    125
3779       51100    124
1364       21000    120
6138       75020    117
7896       97600    107
6131       75013    106
6136       75018     99
6134       75016     98
1641       25000     95
