# DPE

Données issues de https://www.data.gouv.fr/s/resources/base-des-diagnostics-de-performance-energetique-dpe/20160427-112416/DPE_GreenTech_noduplicates_yearonly.zip


Type de fichiers : CSV 

Nombre d'enregistrements : 1 653 194 

Champs : 
* code_postal = "code postal" : 5 caractères 
* tr002_type_batiment_id = "type de bâtiment" : maison (code 1) ; appartement (code 2) 
* annee_construction = "année de construction" : 4 caractères 
* surface_habitable = "surface habitable" : en m² 
* consommation_energie = "consommation énergie" : Consommation tous usages en kWh/m² 
* date_reception_dpe = "année du DPE" : 4 caractères 
* nom_methode_dpe = "méthode utilisée" : il y a plusieurs méthodes agréées pour calculer et rendre compte du DPE. 
* tr001_modele_dpe_id = "type de DPE" : caractérise s'il s'agit d'une vente, d'une location, du neuf, ... et comment le DPE a été calculé. Voir tableau ci-dessous 
* tr006_type_usage_id = code usage (cf champ suivant) 
* description = usage : chauffage dans pratiquement tous les cas de cette extraction 
* tr004_type_energie_id = code énergie 
* description = "énergie" : énergie correspondant à l'usage (chauffage) : liste de valeurs (Bois, Biomasse, électricité, gaz, autre...) 
* consommation_energie_finale = "consommation énergie finale" : Consommation pour l'usage uniquement (chauffage) (kWh)

In [1]:
import antigravity


In [2]:
import pandas as pd
import numpy as np
import re

## /!\ Code Postal

La colonne code postal contient parfois des adresses complètes ! 

Cela empêche le bon calcul des moyennes, etc.

On est obligés de valider le code postal via regex : toute valeur qui n'est pas exactement composée de 5 chiffres est remplacée par '00000'.

In [3]:
# Columns loaded from the DPE CSV export: the postal code plus the four
# numeric fields that are averaged per postal code further down.
# An earlier, wider selection (which also listed 'tr002_type_batiment_id',
# 'tr006_type_usage_id' and 'description.1') was assigned and then
# immediately overwritten; that dead assignment has been removed.
usecols = [
    'code_postal',
    'annee_construction',
    'surface_habitable',
    'consommation_energie',
    'consommation_energie_finale',
]



# fonction converter pour extraire le code postal

def get_code_postal(s):
    """Return *s* when it is a bare 5-digit postal code, else ``'00000'``.

    The value is accepted only if it consists of exactly five digits from
    its first character to its end (a single trailing newline is tolerated
    and not returned). Anything else, such as a full address, an empty
    string or a non-string value, yields the sentinel ``'00000'``.

    Args:
        s: Raw cell content of the postal-code column.

    Returns:
        The 5-digit postal code as a string, or ``'00000'``.
    """
    try:
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        cp = re.match(r'\d{5}$', s)
    except TypeError:
        # Not a string at all (None, a number, bytes): treat as invalid.
        return '00000'
    if cp is not None:
        return cp.group(0)
    return '00000'

# here we go 

# Load only the selected columns of the DPE export.
# The converter is keyed by column name: a positional key would silently
# target another column whenever 'code_postal' is not at that position,
# leaving the postal codes unvalidated.
df = pd.read_csv('data/dpe/DPE_GreenTech_noduplicates_yearonly.csv',
                 usecols=usecols, dtype={'surface_habitable': float},
                 sep=',', converters={'code_postal': get_code_postal})
df.head(3)

Unnamed: 0,code_postal,annee_construction,surface_habitable,consommation_energie,consommation_energie_finale
0,80800,2001,99.0,200.0,7133.95
1,87500,2006,101.0,131.56,5550.0
2,87400,1999,110.0,184.26,5656.45


In [3]:
# Dimensions of the loaded frame, as (rows, columns).
df.shape

(1653194, 4)

## Moyennes 

In [4]:
# Mean and row count of each numeric field, grouped by postal code.
dfAvg = df
colsAvg = ['annee_construction', 'surface_habitable',
           'consommation_energie', 'consommation_energie_finale']
colsGroup = ['code_postal']

# Aggregate exactly the columns listed in colsAvg, in that order. Without
# this selection the result depends on whichever columns the frame happens
# to hold, and the positional renaming applied to `avg` later on would then
# label the wrong columns.
avg = dfAvg.groupby(colsGroup)[colsAvg].agg(['mean', 'count'])
avg.head(3)

Unnamed: 0_level_0,annee_construction,annee_construction,surface_habitable,surface_habitable,consommation_energie,consommation_energie,consommation_energie_finale,consommation_energie_finale
Unnamed: 0_level_1,mean,count,mean,count,mean,count,mean,count
code_postal,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,1962.511991,6797,85.870122,6797,239.899756,6797,11907.640097,6797
1000,1893.678131,1541,80.199377,1541,229.300071,1541,12169.950299,1541
1090,1914.331461,178,114.225,178,228.726348,178,12771.357528,178


In [5]:
# Move the group key out of the index so it becomes an ordinary column again.
avg = avg.reset_index()

In [6]:
# Spot-check: show the aggregated row for postal code '21200'.
avg[avg['code_postal'] == '21200']

Unnamed: 0_level_0,code_postal,annee_construction,annee_construction,surface_habitable,surface_habitable,consommation_energie,consommation_energie,consommation_energie_finale,consommation_energie_finale
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,mean,count,mean,count,mean,count
1195,21200,1959.261905,546,93.147363,546,218.985989,546,12012.525934,546


In [7]:
# Spot-check: show the aggregated row for postal code '21000'.
avg[avg['code_postal'] == '21000']

Unnamed: 0_level_0,code_postal,annee_construction,annee_construction,surface_habitable,surface_habitable,consommation_energie,consommation_energie,consommation_energie_finale,consommation_energie_finale
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,mean,count,mean,count,mean,count
1184,21000,1924.9079,5342,64.6502,5342,234.000541,5342,15588.397074,5342


In [8]:
# Flatten the nested column index by keeping only its first level (the field
# name). The 'mean' and 'count' columns of a field therefore share the same
# name until the columns are renamed.
avg.columns = avg.columns.get_level_values(0)
avg.head(3)

Unnamed: 0,code_postal,annee_construction,annee_construction.1,surface_habitable,surface_habitable.1,consommation_energie,consommation_energie.1,consommation_energie_finale,consommation_energie_finale.1
0,0,1962.511991,6797,85.870122,6797,239.899756,6797,11907.640097,6797
1,1000,1893.678131,1541,80.199377,1541,229.300071,1541,12169.950299,1541
2,1090,1914.331461,178,114.225,178,228.726348,178,12771.357528,178


In [9]:
# Short, unique column names assigned by position: for each field the
# '_m' column holds the mean and the '_c' column holds the count.
short_names = [
    'code_postal',
    'annee_m', 'annee_c',
    'surface_m', 'surface_c',
    'conso_m', 'conso_c',
    'conso_tot_m', 'conso_tot_c',
]
avg.columns = short_names
avg.head(3)

Unnamed: 0,code_postal,annee_m,annee_c,surface_m,surface_c,conso_m,conso_c,conso_tot_m,conso_tot_c
0,0,1962.511991,6797,85.870122,6797,239.899756,6797,11907.640097,6797
1,1000,1893.678131,1541,80.199377,1541,229.300071,1541,12169.950299,1541
2,1090,1914.331461,178,114.225,178,228.726348,178,12771.357528,178


## to SQL

In [10]:
import pymysql.cursors

# Open the connection to the local 'energie' MySQL database; cursors created
# from it return each row as a dict.
# `password` / `database` are the current keyword names; `passwd` / `db` are
# deprecated aliases of them.
# NOTE(review): the credentials are hard-coded; move them out of the
# notebook before sharing it.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='root',
                             database='energie',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

# No module-level cursor is created here: the insert step opens (and closes)
# its own cursor in a `with` block, so an extra one would never be used.

In [11]:
# Replace missing values with empty strings before the rows are converted to
# text for the SQL insert.
avg = avg.fillna('')

In [13]:
# Write one row per postal code into the `dpe_avg` table.
# Every value is sent as text, cut to a maximum number of characters (the
# cut keeps the leading characters only; it does not round).

sql = ("INSERT INTO dpe_avg "
       " ( cp, annee, conso_m2, nb_dpe, surface, conso_totale)"
       "VALUES (%s, %s, %s, %s, %s, %s)"
      )

# (source column in `avg`, maximum characters sent), in the same order as the
# columns named in the INSERT statement.
fields = [('code_postal', 6),
          ('annee_m', 4),
          ('conso_m', 10),
          ('conso_c', 10),  # count
          ('surface_m', 10),
          ('conso_tot_m', 10)]

rows = [tuple(str(r[name])[:width] for name, width in fields)
        for _, r in avg.iterrows()]

try:
    with connection.cursor() as cursor:
        # One batched call instead of one round trip per row.
        cursor.executemany(sql, rows)
    connection.commit()
except Exception:
    # Discard anything already sent so that re-running the cell after a
    # failure cannot commit a partial batch on top of a new one.
    connection.rollback()
    raise

In [31]:
# Scratch check: slicing a string with [:4] keeps its first four characters.
columns = 'consommation_energie_finale'
columns[:4]

'cons'