## Import of data related to vehicles circulating in Paris from 2014 to 2022 (based on their type, ecological index, fuel type).

### Libs imports.

In [43]:
import requests
import pandas as pd
import time

### Connecting to DiDo's API (and testing the response's status code).

In [44]:
code_commune = 75101
url= 'https://data.statistiques.developpement-durable.gouv.fr/dido/api/v1/datafiles/37dd7056-6c4d-44e0-a720-32d4064f9a26/rows' # database url
params = {
    "millesime": "2023-05",
    "page": 1,
    "pageSize": 20  
    }

response = requests.get(url, params=params)
print(response.status_code)
# print(response.json()["message"])  # In case of error (response != 200), gives the error message
# print(response.json()["errors"])  # In case of error (response != 200), gives all the types of errors

200


### Checking data we have connected to.

In [45]:
pd.DataFrame(response.json()["data"])

Unnamed: 0,CLASSE_VEHICULE,CATEGORIE_VEHICULE,CARBURANT,CRITAIR,PARC_2011,PARC_2012,PARC_2013,PARC_2014,PARC_2015,PARC_2016,PARC_2017,PARC_2018,PARC_2019,PARC_2020,PARC_2021,PARC_2022,COMMUNE_CODE,COMMUNE_LIBELLE
0,vp,Véhicule particulier,Diesel,Crit'Air 3,1469,1537,1573,1573,1652,1638,1563,1516,1410,1326,1230,1136,13110,Trets
1,vp,Véhicule particulier,Diesel,Crit'Air 4,1176,1161,1122,1090,1030,1001,927,834,778,641,551,463,13110,Trets
2,vp,Véhicule particulier,Diesel,Crit'Air 5,477,431,389,378,355,286,262,226,201,169,147,116,13110,Trets
3,vp,Véhicule particulier,Diesel,Non classé,828,703,607,527,457,377,300,256,189,129,96,73,13110,Trets
4,vp,Véhicule particulier,Diesel HNR,Crit'Air 2,0,0,1,6,7,7,8,6,7,9,14,24,13110,Trets
5,vp,Véhicule particulier,Hybride rechargeable,Crit'Air 1,3,4,3,3,4,6,8,9,16,14,25,49,13110,Trets
6,vp,Véhicule particulier,Electrique et hydrogène,Crit'Air E,0,0,0,3,5,7,12,13,18,24,68,146,13110,Trets
7,vp,Véhicule particulier,Essence,Crit'Air 1,44,141,222,297,405,525,684,925,1148,1372,1587,1732,13110,Trets
8,vp,Véhicule particulier,Essence,Crit'Air 2,477,488,483,488,463,460,457,464,448,449,442,424,13110,Trets
9,vp,Véhicule particulier,Essence,Crit'Air 3,946,898,838,821,791,771,690,656,616,525,439,422,13110,Trets


### Checking the columns names and type of data to get a better idea of what we are dealing with.

In [46]:
url = "https://data.statistiques.developpement-durable.gouv.fr/dido/api/v1/datafiles/37dd7056-6c4d-44e0-a720-32d4064f9a26/rows"
params = {
    "millesime": "2023-05",
    "page": 1,
    "pageSize": 100
}
response = requests.get(url, params=params)
if response.status_code != 200:
    print("HTTP error:", response.status_code)
    print(response.text)
else:
    data = response.json()
    batch = data.get("data", [])
    if batch:
        batch_df = pd.DataFrame(batch)
        print("Columns:", batch_df.columns)
        print(batch_df.head())
        print(batch_df.dtypes)
        if 'COMMUNE_CODE' in batch_df.columns:
            batch_df['COMMUNE_CODE'] = batch_df['COMMUNE_CODE'].astype(str)
            print("Postal codes in the batch:", batch_df['COMMUNE_CODE'].unique())
            batch_df = batch_df[batch_df['COMMUNE_CODE'].isin([f"751{str(i).zfill(2)}" for i in range(1, 21)])]
            print("After filtering Paris:", batch_df)
        else:
            print("Column 'COMMUNE_CODE' missing!")
    else:
        print("No data retreived...")

Columns: Index(['CLASSE_VEHICULE', 'CATEGORIE_VEHICULE', 'CARBURANT', 'CRITAIR',
       'PARC_2011', 'PARC_2012', 'PARC_2013', 'PARC_2014', 'PARC_2015',
       'PARC_2016', 'PARC_2017', 'PARC_2018', 'PARC_2019', 'PARC_2020',
       'PARC_2021', 'PARC_2022', 'COMMUNE_CODE', 'COMMUNE_LIBELLE'],
      dtype='object')
  CLASSE_VEHICULE    CATEGORIE_VEHICULE   CARBURANT     CRITAIR  PARC_2011  \
0              vp  Véhicule particulier      Diesel  Crit'Air 3       1469   
1              vp  Véhicule particulier      Diesel  Crit'Air 4       1176   
2              vp  Véhicule particulier      Diesel  Crit'Air 5        477   
3              vp  Véhicule particulier      Diesel  Non classé        828   
4              vp  Véhicule particulier  Diesel HNR  Crit'Air 2          0   

   PARC_2012  PARC_2013  PARC_2014  PARC_2015  PARC_2016  PARC_2017  \
0       1537       1573       1573       1652       1638       1563   
1       1161       1122       1090       1030       1001        927   
2 

### Code to explore and gradually pinpoint where Paris data is in the global dataset (finding the pages including codes starting with "75").

In [47]:
def find_paris_start_page():
    page = 4180
    pageSize = 100
    found = False
    while not found and page < 4500:
        url = "https://data.statistiques.developpement-durable.gouv.fr/dido/api/v1/datafiles/37dd7056-6c4d-44e0-a720-32d4064f9a26/rows"
        params = {
            "millesime": "2023-05",
            "page": page,
            "pageSize": pageSize
        }
        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"HTTP error: {response.status_code}.")
            break
        data = response.json()
        batch = data.get("data", [])
        if not batch:
            print("End of data.")
            break
        batch_df = pd.DataFrame(batch)
        batch_df['COMMUNE_CODE'] = batch_df['COMMUNE_CODE'].astype(str)
        print(f"Distinct postal codes on page {page}: {batch_df['COMMUNE_CODE'].unique()}")
        # Stopping the process if we find a code starting with "75"
        if any(code.startswith('75') for code in batch_df['COMMUNE_CODE']):
            print(f"Found a code starting with 75 on page {page}!")
            found = True
        else:
            page += 1  # Page increment (starting very large and then lowering as low as 1 when getting closer to Paris code)

find_paris_start_page()

Distinct postal codes on page 4180: ['76677' '76678' '76679' '76680' '76681' '76682' '76683' '76391' '74282']
Distinct postal codes on page 4181: ['74282' '74283' '74284' '74285' '74286' '74287' '74288']
Distinct postal codes on page 4182: ['74289' '74290' '74291' '74292' '74293' '74294' '74295']
Distinct postal codes on page 4183: ['74295' '74296' '74297' '74298' '74299' '74301' '74302' '74303']
Distinct postal codes on page 4184: ['74303' '74304' '74305' '74306' '74307' '74308' '74309']
Distinct postal codes on page 4185: ['74309' '74310' '74311' '74312' '74313' '74314' '74315']
Distinct postal codes on page 4186: ['74315' '75XXX' '75056' '75101' '75102' '75103' '75104' '75105']
Found a code starting with 75 on page 4186!


### Main function importing all needed data from DiDo's database.
#### (segmenting API calls according to the 100 results per page limit of the DiDo's API, and starting at page 4186 as found in the previous step).

In [48]:
def get_paris_data_fastest():
    columns = ['CLASSE_VEHICULE', 'CATEGORIE_VEHICULE', 'CARBURANT', 'CRITAIR', 'PARC_2014', 'PARC_2015', 'PARC_2016', 'PARC_2017', 'PARC_2018', 'PARC_2019', 'PARC_2020', 'PARC_2021', 'PARC_2022', 'COMMUNE_CODE', 'COMMUNE_LIBELLE']
    final_df = pd.DataFrame(columns=columns)
    batch_size = 100  # Results per page (100 = max allowed by API)
    page = 4186  # Starting page for Paris as found in the previous step
    # total = None
    paris_codes = [f"751{str(i).zfill(2)}" for i in range(1, 21)]
    found_paris = False

    while True:
        url = "https://data.statistiques.developpement-durable.gouv.fr/dido/api/v1/datafiles/37dd7056-6c4d-44e0-a720-32d4064f9a26/rows"
        params = {
            "millesime": "2023-05",
            "page": page,
            "pageSize": batch_size
        }
        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"HTTP error : {response.status_code}.")
            print(response.text)
            break
        data = response.json()
        batch = data.get("data", [])
        if not batch:
            print("End of data.")
            break
        batch_df = pd.DataFrame(batch)
        batch_df['COMMUNE_CODE'] = batch_df['COMMUNE_CODE'].astype(str)
        # Filtering Paris
        batch_df_paris = batch_df[batch_df['COMMUNE_CODE'].isin(paris_codes)]
        if not batch_df_paris.empty:
            found_paris = True
            final_df = pd.concat([final_df, batch_df_paris], ignore_index=True)
            print(f"Page {page}: {len(batch_df_paris)} Paris recordings. Cumulated total: {len(final_df)}.")
        else:
            print(f"Page {page}: 0 Paris recordings.")
        if len(batch) < batch_size:
            print("Last page reached.")
            break
        if found_paris and batch_df_paris.empty:
            break
        page += 1
        time.sleep(0.1)

    # Getting rid of the unwanted columns dealing with out-of-range years (2011, 2012, 2013)
    final_columns = [
    'CLASSE_VEHICULE', 'CATEGORIE_VEHICULE', 'CARBURANT', 'CRITAIR',
    'PARC_2014', 'PARC_2015', 'PARC_2016', 'PARC_2017', 'PARC_2018',
    'PARC_2019', 'PARC_2020', 'PARC_2021', 'PARC_2022',
    'COMMUNE_CODE', 'COMMUNE_LIBELLE'
    ]
    final_df = final_df[final_columns]
    print(f"Data retrieved. Paris total data: {len(final_df)} lines.")
    return final_df

# calling the function
df = get_paris_data_fastest()

Page 4186: 70 Paris recordings. Cumulated total: 70.
Page 4187: 100 Paris recordings. Cumulated total: 170.
Page 4188: 100 Paris recordings. Cumulated total: 270.
Page 4189: 100 Paris recordings. Cumulated total: 370.
Page 4190: 5 Paris recordings. Cumulated total: 375.
Page 4191: 0 Paris recordings.
Data retrieved. Paris total data: 375 lines.


### Checking imported data

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   CLASSE_VEHICULE     375 non-null    object
 1   CATEGORIE_VEHICULE  375 non-null    object
 2   CARBURANT           375 non-null    object
 3   CRITAIR             375 non-null    object
 4   PARC_2014           375 non-null    object
 5   PARC_2015           375 non-null    object
 6   PARC_2016           375 non-null    object
 7   PARC_2017           375 non-null    object
 8   PARC_2018           375 non-null    object
 9   PARC_2019           375 non-null    object
 10  PARC_2020           375 non-null    object
 11  PARC_2021           375 non-null    object
 12  PARC_2022           375 non-null    object
 13  COMMUNE_CODE        375 non-null    object
 14  COMMUNE_LIBELLE     375 non-null    object
dtypes: object(15)
memory usage: 44.1+ KB


In [50]:
df.head()

Unnamed: 0,CLASSE_VEHICULE,CATEGORIE_VEHICULE,CARBURANT,CRITAIR,PARC_2014,PARC_2015,PARC_2016,PARC_2017,PARC_2018,PARC_2019,PARC_2020,PARC_2021,PARC_2022,COMMUNE_CODE,COMMUNE_LIBELLE
0,vp,Véhicule particulier,Diesel,Crit'Air 2,2400,2770,2792,2957,2956,2901,2579,2351,1938,75101,Paris 1er Arrondissement
1,vp,Véhicule particulier,Diesel,Crit'Air 3,1000,815,703,645,559,491,413,370,318,75101,Paris 1er Arrondissement
2,vp,Véhicule particulier,Diesel,Crit'Air 4,419,383,355,312,292,256,205,172,148,75101,Paris 1er Arrondissement
3,vp,Véhicule particulier,Diesel,Crit'Air 5,104,104,88,75,58,42,40,35,29,75101,Paris 1er Arrondissement
4,vp,Véhicule particulier,Diesel,Non classé,97,77,69,59,49,37,32,28,26,75101,Paris 1er Arrondissement


### Exporting data to a .csv

In [52]:
df.to_csv("parc_vehicules_au_niveau_communal_Statistiques_Developpement_Durable.csv", sep=";", index=False)