In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Dataset year, kept as a string because it is interpolated into the input CSV
# path and into the names of the exported files.
YEAR = "2020"

# Import dataframe

In [3]:
# Read the cleaned sales for YEAR and split them by the "house_flat_land" value.
# That column is dropped from each subset, where it would be constant.
df = pd.read_csv(f"full_clean/full_clean_{YEAR}.csv")
df_house, df_flat, df_land = (
    df.loc[df["house_flat_land"] == kind].drop(columns=["house_flat_land"])
    for kind in ("house", "flat", "land")
)
del df  # the combined frame is not used past this point

# Merge JDNs

In [4]:
# Build one table of per-city indicators by joining every "jdn*" CSV on "city".
#
# os.listdir() returns names in arbitrary order, and the join below is a left
# join, so the first file read decides which cities are kept. Iterating in
# sorted order makes that choice (and the resulting column order) reproducible.
jdns = None
for jdn in sorted(os.listdir("jdns")):
    if not jdn.startswith("jdn"):
        continue
    df_jdn = pd.read_csv(f"jdns/{jdn}")
    df_jdn.drop("index", axis=1, inplace=True)
    # One row per city within a file, so the join cannot multiply rows.
    df_jdn.drop_duplicates(subset=["city"], keep="first", inplace=True)
    if jdns is None:
        jdns = df_jdn
    else:
        # NOTE(review): how="left" discards cities absent from the first file;
        # confirm this is intended rather than how="outer".
        jdns = pd.merge(jdns, df_jdn, on="city", how="left")

# Fail here with a clear message instead of later on a None object.
if jdns is None:
    raise FileNotFoundError('no file starting with "jdn" found in the "jdns" directory')

In [5]:
# Normalise the "city" join key: lower-case, hyphens and apostrophes become
# spaces, parentheses are removed.
jdns["city"] = jdns["city"].str.lower()
for old, new in [("-", " "), ("(", ""), (")", ""), ("'", " ")]:
    jdns["city"] = jdns["city"].str.replace(old, new, regex=False)

# Names that differed only by case or punctuation are identical after the
# normalisation above. "city" is the key of a later left merge, where a repeated
# key would duplicate sales rows, so keep a single row per normalised name.
jdns = jdns.drop_duplicates(subset=["city"], keep="first")

In [6]:
def drop(df_in, df_name):
    """Return a filtered copy of ``df_in``; the input frame is not modified.

    Processing order:
      1. Drop "own_cars(%)" and "part_logement_vacant(%)"; additionally drop
         "land_size" for "flat", or "rooms_number", "built_surface" and
         "address_number" for "land".
      2. Drop every row that still contains a missing value.
      3. Drop rows whose "value_euros" is below 200 or above 10,000,000.
      4. For "house" and "flat": drop rows with "rooms_number" >= 20.
      5. For "house": drop rows with "land_size" >= 10000.

    Rows are removed by index label and the index is not reset.
    """
    unused = ["own_cars(%)", "part_logement_vacant(%)"]
    if df_name == "flat":
        unused.append("land_size")
    elif df_name == "land":
        unused += ["rooms_number", "built_surface", "address_number"]

    cleaned = df_in.drop(columns=unused).dropna(axis=0)

    price = cleaned["value_euros"]
    bad_price = (price < 200) | (price > 10000000)
    cleaned = cleaned.drop(index=cleaned.index[bad_price.to_numpy()])

    if df_name in ("house", "flat"):
        too_many_rooms = cleaned["rooms_number"] >= 20
        cleaned = cleaned.drop(index=cleaned.index[too_many_rooms.to_numpy()])

    if df_name == "house":
        too_much_land = cleaned["land_size"] >= 10000
        cleaned = cleaned.drop(index=cleaned.index[too_much_land.to_numpy()])

    return cleaned

In [7]:
def preprocess(df, df_name):
    """Attach the per-city indicators, filter the result and de-duplicate it.

    ``df`` is left-joined with the module-level ``jdns`` table on "city", passed
    through ``drop`` for the given ``df_name``, and finally stripped of fully
    identical rows (the first occurrence is kept). A new DataFrame is returned.
    """
    with_city_stats = df.merge(jdns, on="city", how="left")
    cleaned = drop(with_city_stats, df_name)
    return cleaned.drop_duplicates(keep="first")

In [8]:
# Preprocess each property kind and rebind the three names to the results.
df_house, df_flat, df_land = (
    preprocess(frame, kind)
    for frame, kind in ((df_house, "house"), (df_flat, "flat"), (df_land, "land"))
)

# Add distance to school

In [9]:
# Load the school list and keep only the postcode and the coordinates.
# Column names are lower-cased before the selection, "code postal" is renamed to
# "zip_code", schools without coordinates are discarded, and the rows are
# sorted by zip_code.
schools = (
    pd.read_csv("school.csv")
    .rename(columns=str.lower)
    .loc[:, ["code postal", "longitude", "latitude"]]
    .rename(columns={"code postal": "zip_code"})
    .dropna(subset=["longitude", "latitude"], axis=0)
    .sort_values(by="zip_code", ascending=True)
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [10]:
def get_distance_lon_lat(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points, using the haversine formula.

    Despite the function name, latitude comes BEFORE longitude in the
    argument list.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.
        radius_km: radius of the sphere. The default, 6371, is the mean Earth
            radius in kilometres.

    Scalars and NumPy arrays are both accepted; arrays are broadcast.

    Returns:
        The distance in the unit of ``radius_km``, rounded to 2 decimals.
    """
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Rounding error can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would turn sqrt(1 - a) into NaN. Clamp it to the valid range.
    a = np.clip(a, 0.0, 1.0)
    res = radius_km * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
    return np.round(res, 2)

In [11]:
def get_min_distance(df, verbose=True):
    """Distance from each row of ``df`` to the closest school with the same zip code.

    Reads the module-level ``schools`` frame (columns "zip_code", "longitude",
    "latitude"). A row is only compared with the schools whose zip_code equals
    its own.

    Args:
        df: frame with "zip_code", "longitude" and "latitude" columns.
        verbose: when true (the default), print ``i`` and the row count every
            1000 rows.

    Returns:
        A list with one value per row of ``df``, in row order: the smallest
        value returned by ``get_distance_lon_lat`` for that row, or the
        placeholder 1e308 when no school shares the row's zip code.
    """
    no_school = 1e308

    school_zips = schools["zip_code"].values
    school_lons = schools["longitude"].values
    school_lats = schools["latitude"].values

    # Index the schools by zip code once, instead of scanning the whole school
    # array for every row. Grouping by key also means the result no longer
    # depends on `schools` being sorted by zip_code.
    positions_by_zip = {}
    for pos, code in enumerate(school_zips):
        positions_by_zip.setdefault(code, []).append(pos)
    positions_by_zip = {code: np.asarray(found) for code, found in positions_by_zip.items()}

    n_rows = df.shape[0]
    rows = zip(df["zip_code"].values, df["longitude"].values, df["latitude"].values)

    min_distances = []
    for i, (zip_code, longitude, latitude) in enumerate(rows):
        if verbose and i % 1000 == 0:
            print(i, n_rows)

        positions = positions_by_zip.get(zip_code)
        if positions is None:
            min_distances.append(no_school)
            continue

        # One vectorised call covers every school in this zip code.
        distances = get_distance_lon_lat(
            latitude, longitude, school_lats[positions], school_lons[positions]
        )
        # NaN distances never win the comparison, as with a plain `d < best` test.
        distances = distances[~np.isnan(distances)]
        min_distances.append(distances.min() if distances.size else no_school)
    return min_distances

In [12]:
# Number of rows in each table: (house, flat, land).
len(df_house), len(df_flat), len(df_land)

(84603, 46510, 12116)

In [13]:
# Add the "dist_to_school" column to each of the three tables.
for frame in (df_house, df_flat, df_land):
    frame["dist_to_school"] = get_min_distance(frame)

0 84603
1000 84603
2000 84603
3000 84603
4000 84603
5000 84603
6000 84603
7000 84603
8000 84603
9000 84603
10000 84603
11000 84603
12000 84603
13000 84603
14000 84603
15000 84603
16000 84603
17000 84603
18000 84603
19000 84603
20000 84603
21000 84603
22000 84603
23000 84603
24000 84603
25000 84603
26000 84603
27000 84603
28000 84603
29000 84603
30000 84603
31000 84603
32000 84603
33000 84603
34000 84603
35000 84603
36000 84603
37000 84603
38000 84603
39000 84603
40000 84603
41000 84603
42000 84603
43000 84603
44000 84603
45000 84603
46000 84603
47000 84603
48000 84603
49000 84603
50000 84603
51000 84603
52000 84603
53000 84603
54000 84603
55000 84603
56000 84603
57000 84603
58000 84603
59000 84603
60000 84603
61000 84603
62000 84603
63000 84603
64000 84603
65000 84603
66000 84603
67000 84603
68000 84603
69000 84603
70000 84603
71000 84603
72000 84603
73000 84603
74000 84603
75000 84603
76000 84603
77000 84603
78000 84603
79000 84603
80000 84603
81000 84603
82000 84603
83000 84603
84000

In [14]:
def drop_infinite_distance(df):
    """Remove, in place, the rows whose "dist_to_school" equals 1e308.

    ``df`` itself is modified and nothing is returned.
    """
    is_placeholder = df["dist_to_school"].eq(1e308)
    df.drop(index=df.index[is_placeholder.to_numpy()], inplace=True)

In [15]:
# Remove the rows with a placeholder distance from each table (in place).
for frame in (df_house, df_flat, df_land):
    drop_infinite_distance(frame)

# Export

In [16]:
# Create the output directory if needed. exist_ok=True replaces the separate
# existence check, so there is no gap between checking and creating.
os.makedirs("real_estate", exist_ok=True)

In [17]:
# Write one CSV per property kind; the row index is not exported.
for frame, path in (
    (df_house, f"real_estate/real_estate_house{YEAR}.csv"),
    (df_flat, f"real_estate/real_estate_flat{YEAR}.csv"),
    (df_land, f"real_estate/real_estate_land{YEAR}.csv"),
):
    frame.to_csv(path, index=False)