## Import of data related to new housing (post 2021), with DPE (Energetic Performance Diagnosis) and GES (Greenhouse Gas) scores.

### Libs imports

In [31]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv

In [32]:
# Loading environment variables
load_dotenv()
API_KEY = os.getenv('ADEME_API_KEY')

### Connecting to ADEME's API (and testing the response's status code)

In [33]:
url=  'https://data.ademe.fr/data-fair/api/v1/datasets/dpe03existant/lines' # database url
headers = {
        'accept': 'application/json',
        'x-apikey': API_KEY
        }
params = {
        'select': '_geopoint,adresse_ban,annee_construction,numero_dpe,etiquette_dpe,etiquette_ges,conso_5_usages_par_m2_ef,emission_ges_5_usages_par_m2,categorie_enr',
        'qs': 'code_departement_ban:75'
        }
response = requests.get(url, headers=headers, params=params)
response.status_code

200

### Checking data we have connected to

In [34]:
pd.DataFrame(response.json()["results"])

Unnamed: 0,categorie_enr,adresse_ban,conso_5_usages_par_m2_ef,etiquette_ges,annee_construction,_geopoint,emission_ges_5_usages_par_m2,numero_dpe,etiquette_dpe,_score
0,réseau de chaleur ou de froid vertueux,50 Rue du Disque 75013 Paris,238.0,D,1948.0,"48.82458102161721,2.363825947937033",36.0,2275E0252200Y,D,
1,,74 Rue de Sèvres 75007 Paris,206.7,D,1948.0,"48.84762702994544,2.318554043655058",45.7,2275E1242094U,D,
2,,7 Rue Pierre Sémard 75009 Paris,125.8,B,1930.0,"48.877989001472606,2.347287007868445",8.0,2275E0392026W,D,
3,,19 Rue Antoine Chantin 75014 Paris,93.0,B,1987.0,"48.82729698043097,2.321912996369212",6.0,2275E0464107F,D,
4,réseau de chaleur ou de froid vertueux,12 Avenue de la Porte de Clignancourt 75018 Paris,104.0,C,,"48.89892902627486,2.3444450656734404",15.0,2275E1334975D,C,
5,,101 Rue du Chemin Vert 75011 Paris,90.0,A,1947.0,"48.861080983623,2.380873994245739",6.0,2275E0647807P,C,
6,,29 Avenue de la Motte-Picquet 75007 Paris,167.0,D,1947.0,"48.8551579728205,2.307180967430216",37.0,2275E0479391B,D,
7,,70b Avenue de Clichy 75017 Paris,288.0,D,1948.0,"48.887955965292726,2.325008019561719",53.0,2275E0348574Q,E,
8,,27 Rue Marbeuf 75008 Paris,203.0,D,,"48.869091982871474,2.304235983873164",37.0,2275E1153202W,E,
9,,9b Rue de Valence 75005 Paris,190.0,C,1947.0,"48.83768798913495,2.3495429633026275",28.0,2275E0725539H,E,


### Main function importing all data from ADEME's database
#### (segmenting API calls according to districts, construction periods and diagnostician's time of visit, to avoid reaching the 10000 entries' limit of the ADEME's API)

In [None]:
def get_all_dpe_data():
    # Initialize empty DataFrame with desired columns
    columns = ['_geopoint', 'adresse_ban', 'annee_construction', 'numero_dpe', 'etiquette_dpe', 'etiquette_ges', 
              'conso_5_usages_par_m2_ef', 'emission_ges_5_usages_par_m2', 'categorie_enr', 'date_visite_diagnostiqueur']
    final_df = pd.DataFrame(columns=columns)
    
    # Postal codes of the 20 Parisian districts
    postal_codes = [f'750{str(i).zfill(2)}' for i in range(1, 21)]
    
    # Building periods (typical for Paris)
    construction_periods = [
        ("0", "1900"),     # Pré-1900
        ("1901", "1945"),  # Pré-guerre
        ("1946", "1970"),  # Reconstruction
        ("1971", "1990"),  # Fin du XXe siècle
        ("1991", "2010"),  # Début du XXIe siècle
        ("2011", "2023")   # Construction récente
    ]
    
    # Diagnostician's time of visit
    years = list(range(2021, 2026))  # From 2021 to 2025
    months = list(range(1, 13))      # 12 months
    
    total_segments = len(postal_codes) * len(construction_periods) * len(years) * len(months)
    segment_count = 0
    
    # Requesting the API (segmenting the calls)

    for code in postal_codes:
        for start_year, end_year in construction_periods:
            for year in years:
                for month in months:
                    segment_count += 1
                    month_str = str(month).zfill(2)
                    
                    print(f"\nFetching segment {segment_count}/{total_segments}")
                    print(f"Postal code: {code}, period: {start_year}-{end_year}, visit date: {year}-{month_str}")
                    
                    batch_size = 5000
                    offset = 0
                    
                    while True:
                        url = 'https://data.ademe.fr/data-fair/api/v1/datasets/dpe03existant/lines'
                        headers = {
                                'accept': 'application/json',
                                'x-apikey': API_KEY
                        }
                        params = {
                            'size': batch_size,
                            'skip': offset,
                            'select': '_geopoint,adresse_ban,annee_construction,numero_dpe,etiquette_dpe,etiquette_ges,conso_5_usages_par_m2_ef,emission_ges_5_usages_par_m2,categorie_enr,date_visite_diagnostiqueur',
                            'qs': f'code_postal_ban:{code} AND annee_construction:>={start_year} AND annee_construction:<={end_year} AND date_visite_diagnostiqueur:>={year}-{month_str}-01 AND date_visite_diagnostiqueur:<={year}-{month_str}-31'
                        }
                        
                        response = requests.get(url, headers=headers, params=params)
                        
                        if response.status_code != 200:
                            print(f"HTTP Error: {response.status_code}")
                            print(f"Response: {response.text}")
                            break
                            
                        try:
                            data = response.json()
                            batch = data.get("results", [])
                            
                            if not batch:  # If no results
                                print(f"No results for this segment")
                                break
                            
                            # Converting to DataFrame and adding to the final DataFrame
                            batch_df = pd.DataFrame(batch)
                            final_df = pd.concat([final_df, batch_df], ignore_index=True)
                            
                            total_for_segment = data.get('total', 0)
                            print(f"Total records so far: {len(final_df)}")
                            print(f"Records for this segment: {offset + len(batch)}/{total_for_segment}")
                            
                            if len(batch) < batch_size or offset + len(batch) >= total_for_segment:
                                print(f"All data retrieved for this segment")
                                break
                                
                            offset += len(batch)
                            
                            # Limit checking
                            if offset + batch_size > 10000:
                                print(f"Reached API limit for this segment, moving to next")
                                break
                                
                        except json.JSONDecodeError as e:
                            print(f"JSON decoding error for this segment, offset {offset}")
                            print(f"Response: {response.text[:200]}...")
                            break
    
    # Temporary backup every 50000 records to avoid restarting from scratch in case of problem
    if len(final_df) > 0 and len(final_df) % 50000 < 5000:
        temp_filename = f"dpe_paris_temp_{len(final_df)}.csv"
        final_df.to_csv(temp_filename, index=False)
        print(f"Intermediate save: {temp_filename}")
    
    print(f"\nFetching completed. Total: {len(final_df)} recordings")
    return final_df

# Running the main function
df = get_all_dpe_data()
print(f"\nFinal DataFrame's shape: {df.shape}")


Fetching segment 1/7200
Postal code: 75001, period: 0-1900, visit date: 2021-01
No results for this segment

Fetching segment 2/7200
Postal code: 75001, period: 0-1900, visit date: 2021-02
No results for this segment

Fetching segment 3/7200
Postal code: 75001, period: 0-1900, visit date: 2021-03
No results for this segment

Fetching segment 4/7200
Postal code: 75001, period: 0-1900, visit date: 2021-04
No results for this segment

Fetching segment 5/7200
Postal code: 75001, period: 0-1900, visit date: 2021-05
No results for this segment

Fetching segment 6/7200
Postal code: 75001, period: 0-1900, visit date: 2021-06
No results for this segment

Fetching segment 7/7200
Postal code: 75001, period: 0-1900, visit date: 2021-07
Total records so far: 1
Records for this segment: 1/1
All data retrieved for this segment

Fetching segment 8/7200
Postal code: 75001, period: 0-1900, visit date: 2021-08
Total records so far: 3
Records for this segment: 2/2
All data retrieved for this segment

Fet

### Checking imported data

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354644 entries, 0 to 354643
Data columns (total 11 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   _geopoint                     354644 non-null  object
 1   adresse_ban                   354450 non-null  object
 2   annee_construction            354644 non-null  object
 3   numero_dpe                    354644 non-null  object
 4   etiquette_dpe                 354644 non-null  object
 5   etiquette_ges                 354644 non-null  object
 6   conso_5_usages_par_m2_ef      354643 non-null  object
 7   emission_ges_5_usages_par_m2  354643 non-null  object
 8   categorie_enr                 72462 non-null   object
 9   date_visite_diagnostiqueur    354644 non-null  object
 10  _score                        0 non-null       object
dtypes: object(11)
memory usage: 29.8+ MB


In [37]:
df.head()

Unnamed: 0,_geopoint,adresse_ban,annee_construction,numero_dpe,etiquette_dpe,etiquette_ges,conso_5_usages_par_m2_ef,emission_ges_5_usages_par_m2,categorie_enr,date_visite_diagnostiqueur,_score
0,"48.86595400777677,2.3354320257122905",Rue Sainte-Anne 75001 Paris,1900,2275E0012511D,E,B,143.0,10.0,,2021-07-07,
1,"48.860345970770695,2.343468977763066",3 Rue du Roule 75001 Paris,1850,2175E0136723M,F,C,152.9,11.5,,2021-08-02,
2,"48.864780089917204,2.3411900768347564",5 Rue Hérold 75001 Paris,1850,2175E0230740N,F,C,153.9,11.0,,2021-08-29,
3,"48.86435902854955,2.342206030505405",5 Rue Coq Héron 75001 Paris,1850,2175E1032152B,E,D,197.0,35.9,,2021-09-13,
4,"48.865052440555104,2.3295030329353708",3 Rue d’Alger 75001 Paris,1830,2175E0320904J,D,B,106.3,8.0,,2021-09-13,


### Exporting data to a .csv

In [38]:
df.to_csv("dpe_logements_existants_21_ademe.csv")