In [1]:
import numpy as np
from sklearn.neighbors import BallTree
import pandas as pd

Lecture Fichier

In [2]:
input_file = 'C:/Users/Christian/Desktop/PA/Dataset/valeurs_foncieres_with_regions.csv'
df = pd.read_csv(input_file, delimiter=';', dtype={'code_postal': str, 'code_commune': str, 'code_departement': str})

In [3]:
df.head(10)

Unnamed: 0,id_mutation,date_mutation,nature_mutation,valeur_fonciere,adresse_numero,adresse_suffixe,adresse_code_voie,adresse_nom_voie,code_postal,code_commune,...,code_departement,type_local,surface_reelle_bati,surface_terrain,longitude,latitude,nombre_lots,num_dep,dep_name,region_name
0,2020-5,2020-01-09,Vente,72000.0,5367.0,,B107,LORETTE,1270,1108,...,1,Maison,35.0,381.0,5.350942,46.380918,0,1,Ain,Auvergne-Rhône-Alpes
1,2020-7,2020-01-06,Vente,180300.0,31.0,,0970,RUE COMTE DE LA TEYSSONNIERE,1000,1053,...,1,Maison,75.0,525.0,5.208899,46.19687,0,1,Ain,Auvergne-Rhône-Alpes
2,2020-12,2020-01-13,Vente,97000.0,16.0,,1770,RUE DES GRAVES,1000,1053,...,1,Appartement,92.0,,5.232261,46.199051,1,1,Ain,Auvergne-Rhône-Alpes
3,2020-13,2020-01-09,Vente,215000.0,2.0,,0140,IMP DU COLOMBIER,1250,1422,...,1,Maison,93.0,790.0,5.310961,46.140943,0,1,Ain,Auvergne-Rhône-Alpes
4,2020-15,2020-01-09,Vente,264450.0,5340.0,,0052,IMP GRAND CHAMP,1310,1385,...,1,Maison,111.0,1002.0,5.162395,46.189471,0,1,Ain,Auvergne-Rhône-Alpes
5,2020-17,2020-01-08,Vente,269000.0,5090.0,,B032,HAMEAU DES RIPPES,1240,1069,...,1,Maison,287.0,1100.0,5.274764,46.146702,0,1,Ain,Auvergne-Rhône-Alpes
6,2020-21,2020-01-09,Vente,99000.0,18.0,,1770,RUE DES GRAVES,1000,1053,...,1,Appartement,91.0,,5.232261,46.199051,1,1,Ain,Auvergne-Rhône-Alpes
7,2020-22,2020-01-10,Vente,235600.0,27.0,,0263,RUE DE LA BROCHETTE,1370,1241,...,1,Maison,97.0,797.0,5.347364,46.237743,0,1,Ain,Auvergne-Rhône-Alpes
8,2020-24,2020-01-13,Vente,165000.0,337.0,,0085,RTE DE CROYAT,1290,1365,...,1,Maison,87.0,1265.0,4.914381,46.26373,0,1,Ain,Auvergne-Rhône-Alpes
9,2020-26,2020-01-08,Vente,70000.0,5.0,,1320,RUE EDGAR QUINET,1000,1053,...,1,Appartement,45.0,,5.225508,46.204052,1,1,Ain,Auvergne-Rhône-Alpes


Nettoyer les lignes contenant des NaN dans les coordonnées

In [4]:
df = df.dropna(subset=['latitude', 'longitude'])

# Algo BallTree

In [5]:
ball_tree_models = {}
for region, group in df.groupby('region_name'):
    coords = group[['latitude', 'longitude']].values * np.pi / 180  # Conversion des degrés en radians pour haversine
    ball_tree_models[region] = BallTree(coords, metric='haversine')

In [6]:
def estimate_mean_price(lat, lon, region, surface_bati, type_local, k=10):
    if region not in ball_tree_models:
        return np.nan
    tree = ball_tree_models[region]
    coords = np.array([[lat, lon]]) * np.pi / 180
    dist, ind = tree.query(coords, k=k)
    indices = ind[0]
    nearest_sales = df.iloc[indices]

    filtered_sales = nearest_sales[(nearest_sales['type_local'] == type_local) & 
                                    (nearest_sales['surface_reelle_bati'] >= 0.8 * surface_bati) & 
                                    (nearest_sales['surface_reelle_bati'] <= 1.2 * surface_bati)]

    if len(filtered_sales) == 0:
        return np.nan
    
    mean_price = filtered_sales['valeur_fonciere'].mean()
    return mean_price

In [7]:
def compute_mean_distance(lat, lon, region, k=10):
    if region not in ball_tree_models:
        return np.nan
    tree = ball_tree_models[region]
    coords = np.array([[lat, lon]]) * np.pi / 180 
    dist, ind = tree.query(coords, k=k)
    mean_distance_km = np.mean((dist[0] * 6341) * 1000) if len(dist[0]) > 0 else np.nan
    return mean_distance_km

In [8]:
example_property = {'latitude': 46.388082, 'longitude': 5.347202, 'region': 'Auvergne-Rhône-Alpes', 'surface_bati': 50, 'type_local': 'Appartement'}
mean_price = estimate_mean_price(example_property['latitude'], 
                                                           example_property['longitude'], 
                                                           example_property['region'], 
                                                           example_property['surface_bati'], 
                                                           example_property['type_local'])
mean_distance = compute_mean_distance(example_property['latitude'], example_property['longitude'], example_property['region'])
print(f"Prix estimé moyen des 10 biens les plus proches : {mean_price} €")
print(f"Distance moyenne des 10 biens les plus proches : {mean_distance} mètres")

Prix estimé moyen des 10 biens les plus proches : 69666.66666666667 €
Distance moyenne des 10 biens les plus proches : 208.5147444702001 mètres


In [9]:
# Ajouter une nouvelle colonne pour les prix moyens du voisinage (10 biens les plus proches)
df['prix_moyen_voisinage'] = df.apply(
    lambda row: estimate_mean_price(row['latitude'], row['longitude'], row['region_name'], row['surface_reelle_bati'], row['type_local']), axis=1)

In [10]:
# Ajouter une nouvelle colonne pour la distance moyenne des 10 biens les plus proches
df['distance_moyen_voisinage'] = df.apply(
    lambda row: compute_mean_distance(row['latitude'], row['longitude'], row['region_name']), axis=1)

In [11]:
# Sauvegarder le DataFrame avec la nouvelle colonne
output_file = 'C:/Users/Christian/Desktop/PA/Dataset/valeurs_foncieres_with_mean_price_neighborhood.csv'
df.to_csv(output_file, sep=';', index=False)