# Collectivités 20K - 50K

Ce notebook sert à préparer les données des collectivités entre 20K et 50K habitants.

On va combiner des informations de ces sources :
* INSEE - données de population des communes : https://www.insee.fr/fr/statistiques/2521169#consulter
* ORE - : 
* NosDonnées - latitude et longitude :  http://www.nosdonnees.fr/wiki/index.php/Fichier:EUCircos_Regions_departements_circonscriptions_communes_gps.csv.gz

In [1]:
import pandas as pd 
import glob
import numpy as np

## Données INSEE

In [2]:
# données population des communes
# source : https://www.insee.fr/fr/statistiques/2521169#consulter

pop = pd.read_csv('data/insee/base_cc.csv', encoding='utf-8', sep=',', 
                  low_memory=False, keep_default_na = False,
                 dtype={'CODGEO': str, 'LIBGEO': str, 'REG': str,
                         'DEP': str, 'P15_POP': np.int64, 
                          'P10_POP': str, 'SUPERF': str,
                          'P15_MEN': str,'P15_LOG': str, 'P15_EMPLT_SAL': str,
                          'ETTOT15': str, 'ETAZ15': str, 
                          'ETBE15': str, 'ETFZ15': str, 
                          'ETGU15': str, 'ETGZ15': str, 'ETOQ15': str,
                          'ETTEF115': str, 'ETTEFP1015': str})
pop = pop.rename(columns={'CODGEO': 'insee', 'LIBGEO': 'libelle', 'REG': 'code_region',
                         'DEP': 'code_dept', 'P15_POP': 'population', 
                          'P10_POP': 'population_2010', 'SUPERF': 'superficie',
                          'P15_MEN': 'menages','P15_LOG': 'logements', 'P15_EMPLT_SAL': 'salaries',
                          'ETTOT15': 'ets_actifs', 'ETAZ15': 'ets_agriculture', 
                          'ETBE15': 'ets_industrie', 'ETFZ15': 'ets_construction', 
                          'ETGU15': 'ets_com_serv', 'ETGZ15': 'ets_repar_auto', 'ETOQ15': 'ets_adm',
                          'ETTEF115': 'ets_moins_10', 'ETTEFP1015': 'ets_plus_10'})
pop.head(3)

Unnamed: 0,insee,libelle,code_region,code_dept,population,population_2010,superficie,menages,logements,salaries,ets_actifs,ets_agriculture,ets_industrie,ets_construction,ets_com_serv,ets_repar_auto,ets_adm,ets_moins_10,ets_plus_10
0,55039,Beaumont-en-Verdunois,44,55,0,0,8,0,0,0,1,0,0,0,0,0,1,0,0
1,55050,Bezonvaux,44,55,0,0,9,0,0,0,1,0,0,0,0,0,1,0,0
2,55139,Cumières-le-Mort-Homme,44,55,0,0,6,0,1,0,1,0,0,0,0,0,1,1,0


In [3]:
pop.shape

(36735, 19)

## Données ORE

In [4]:
# données ORE : récupérer quelques informations complémentaires

edf = pd.read_csv('data/ore/conso-elec-annuelle-par-secteur-dactivite-agregee-commune.csv',
                 sep=';', thousands=' ', dtype = str,
                  usecols=['Année', 'Code Commune', 'Code EPCI', 'Libellé EPCI', 'Forme Commune', 
                           'Géolocalisation Commune'])

edf = edf.rename(columns={'Code EPCI' : 'code_epci', 'Libellé EPCI': 'epci', 'Forme Commune': 'forme', 
                           'Géolocalisation Commune': 'geolocalisation'})

# Garder une seule année
edf = edf.loc[edf['Année'] == '2016']
edf = edf.drop(['Année'], axis=1)

edf = edf.drop_duplicates(subset=None, keep='first')
edf.head(3)

Unnamed: 0,Code Commune,code_epci,epci,forme,geolocalisation
0,57237,245701172,CC du Centre Mosellan,"{""type"": ""Polygon"", ""coordinates"": [[[6.801446...","49.0151448414, 6.78526254792"
1,57234,245700216,CA Sarreguemines Confluences,"{""type"": ""Polygon"", ""coordinates"": [[[7.104239...","49.1346513232, 7.11662409209"
2,57227,245700372,CA de Forbach Porte de France,"{""type"": ""Polygon"", ""coordinates"": [[[6.860939...","49.1915770388, 6.89272878712"


In [42]:
edf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36018 entries, 0 to 209548
Data columns (total 7 columns):
Code Commune       36018 non-null object
code_epci          36013 non-null object
epci               36013 non-null object
forme              36013 non-null object
geolocalisation    36013 non-null object
latitude           4195 non-null object
longitude          4194 non-null object
dtypes: object(7)
memory usage: 2.2+ MB


In [51]:
# éclater la géolocalisation en latitude et longitude 
for i,r in edf.iterrows():
    coord = str(r['geolocalisation']).split(',')
    if len(coord) == 2 :
        edf.set_value(i, 'latitude', coord[0])
        edf.set_value(i, 'longitude', coord[1])


In [None]:
d = ['44.9451018992', ' 1.69270404959']


In [52]:
edf.head(10)

Unnamed: 0,Code Commune,code_epci,epci,forme,geolocalisation,latitude,longitude
0,57237,245701172,CC du Centre Mosellan,"{""type"": ""Polygon"", ""coordinates"": [[[6.801446...","49.0151448414, 6.78526254792",49.0151448414,6.78526254792
1,57234,245700216,CA Sarreguemines Confluences,"{""type"": ""Polygon"", ""coordinates"": [[[7.104239...","49.1346513232, 7.11662409209",49.1346513232,7.11662409209
2,57227,245700372,CA de Forbach Porte de France,"{""type"": ""Polygon"", ""coordinates"": [[[6.860939...","49.1915770388, 6.89272878712",49.1915770388,6.89272878712
3,57222,245700372,CA de Forbach Porte de France,"{""type"": ""Polygon"", ""coordinates"": [[[6.916155...","49.1516826122, 6.90508521929",49.1516826122,6.90508521929
4,57221,245701222,CA du Val de Fensch,"{""type"": ""Polygon"", ""coordinates"": [[[6.116100...","49.3277706085, 6.12333123466",49.3277706085,6.12333123466
5,57210,200039956,CC de Sarrebourg - Moselle Sud,"{""type"": ""Polygon"", ""coordinates"": [[[6.965657...","48.8506840513, 7.00955127721",48.8506840513,7.00955127721
6,57209,245700133,CC du District Urbain de Faulquemont (Duf),"{""type"": ""Polygon"", ""coordinates"": [[[6.555812...","49.0353093589, 6.59362819134",49.0353093589,6.59362819134
7,57203,245700695,CC de Cattenom et Environs,"{""type"": ""Polygon"", ""coordinates"": [[[6.179635...","49.5016592908, 6.19437033493",49.5016592908,6.19437033493
8,57187,200011625,CC du Pays Boulageois,"{""type"": ""Polygon"", ""coordinates"": [[[6.467266...","49.2217779236, 6.48732957998",49.2217779236,6.48732957998
9,57186,245701412,CC du Bouzonvillois,"{""type"": ""Polygon"", ""coordinates"": [[[6.387903...","49.2797790177, 6.40046244093",49.2797790177,6.40046244093


In [53]:
pop2 = pd.merge(pop, edf, how='left', left_on='insee', right_on='Code Commune')  
pop2 = pop2.drop(['Code Commune'], axis=1)
pop2.head(3)

Unnamed: 0,insee,libelle,code_region,code_dept,population,population_2010,superficie,menages,logements,salaries,...,ets_repar_auto,ets_adm,ets_moins_10,ets_plus_10,code_epci,epci,forme,geolocalisation,latitude,longitude
0,55039,Beaumont-en-Verdunois,44,55,0,0,8,0,0,0,...,0,1,0,0,,,,,,
1,55050,Bezonvaux,44,55,0,0,9,0,0,0,...,0,1,0,0,,,,,,
2,55139,Cumières-le-Mort-Homme,44,55,0,0,6,0,1,0,...,0,1,1,0,,,,,,


In [54]:
pop2 = pop2.fillna('')
pop2.shape

(36735, 25)

In [55]:
pop2 = pop2.drop_duplicates(subset=None, keep='first')
pop2.shape

(36735, 25)

## Population cible

In [56]:
# pour le concours on cible les collectivités entre 20K et 50K habitants :

pop_cible = pop2.loc[pop2['population'] > 19999]
pop_cible = pop_cible.loc[pop_cible['population'] < 50001]

#pop_cible.to_csv('dataviz/data/organismes_20_50.csv')

In [57]:
pop_cible.shape

(353, 25)

## NosDonnées 

http://www.nosdonnees.fr/wiki/index.php/Fichier:EUCircos_Regions_departements_circonscriptions_communes_gps.csv.gz

In [58]:
url = 'data/nosDonnees/eucircos_regions_departements_circonscriptions_communes_gps.csv'
usecols = ['EU_circo', 'nom_région', 'nom_département', 'codes_postaux', 'code_insee']
lalo = pd.read_csv(url, usecols=usecols, sep=";", dtype=str,
                   thousands=' ', decimal=',', encoding='utf-8')
lalo.head()

Unnamed: 0,EU_circo,nom_région,nom_département,codes_postaux,code_insee
0,Sud-Est,Rhône-Alpes,Ain,1340,1024
1,Sud-Est,Rhône-Alpes,Ain,1270,1029
2,Sud-Est,Rhône-Alpes,Ain,1370,1038
3,Sud-Est,Rhône-Alpes,Ain,1340,1040
4,Sud-Est,Rhône-Alpes,Ain,1250,1245


In [59]:
lalo = lalo.rename(columns = {'nom_région':'region', 'nom_département': 'dept', 
                              'codes_postaux': 'cp', 'code_insee': 'insee'})
lalo.head()

Unnamed: 0,EU_circo,region,dept,cp,insee
0,Sud-Est,Rhône-Alpes,Ain,1340,1024
1,Sud-Est,Rhône-Alpes,Ain,1270,1029
2,Sud-Est,Rhône-Alpes,Ain,1370,1038
3,Sud-Est,Rhône-Alpes,Ain,1340,1040
4,Sud-Est,Rhône-Alpes,Ain,1250,1245


In [60]:
tout = pd.merge(pop_cible, lalo, how='left')
tout.head()

Unnamed: 0,insee,libelle,code_region,code_dept,population,population_2010,superficie,menages,logements,salaries,...,code_epci,epci,forme,geolocalisation,latitude,longitude,EU_circo,region,dept,cp
0,92022,Chaville,11,92,20001,18668,4,8641,9447,2952,...,200023356,CA Grand Paris Seine Ouest (Gpso),"{""type"": ""Polygon"", ""coordinates"": [[[2.205507...","48.8074453729, 2.19196553763",48.8074453729,2.19196553763,Île-de-France,Île-de-France,Hauts-de-Seine,92370
1,26058,Bourg-lès-Valence,84,26,20074,18623,20,8960,9680,6395,...,200040483,CA Valence-Romans Sud Rhône-Alpes,"{""type"": ""Polygon"", ""coordinates"": [[[4.881368...","44.9653177526, 4.89420757628",44.9653177526,4.89420757628,Sud-Est,Rhône-Alpes,Drôme,26500
2,88413,Saint-Dié-des-Vosges,44,88,20079,21447,46,9731,11843,9723,...,200042141,CC de Saint-Dié-des-Vosges,"{""type"": ""Polygon"", ""coordinates"": [[[6.916602...","48.2968400138, 6.93787860007",48.2968400138,6.93787860007,Est,Lorraine,Vosges,88100
3,94059,Le Plessis-Trévise,11,94,20102,19194,4,8074,8256,2507,...,249400086,CA Haut Val de Marne,"{""type"": ""Polygon"", ""coordinates"": [[[2.557404...","48.8062253343, 2.57508931375",48.8062253343,2.57508931375,Île-de-France,Île-de-France,Val-de-Marne,94420
4,45284,Saint-Jean-de-Braye,24,45,20123,19057,14,8681,9370,9740,...,244500468,CA Orléans Val de Loire (Agglo),"{""type"": ""Polygon"", ""coordinates"": [[[1.993520...","47.9177164808, 1.97312389078",47.9177164808,1.97312389078,Centre,Centre,Loiret,45800


In [61]:
tout = tout.fillna('')
tout.shape

(371, 29)

## to sql

In [62]:
import pymysql.cursors

connection = pymysql.connect( host='localhost',
                              user='root',
                              passwd='root',
                              db='energie',
                              charset='utf8mb4',
                              cursorclass=pymysql.cursors.DictCursor)

cursor = connection.cursor()

In [63]:
# Insertion BDD
# Problèmes avec les types. Pour simplifier, tout est passé en string / varchar 
# Il a été nécessaire de changer les NaN en ''

with connection.cursor() as cursor:
    
    sql = ("INSERT INTO `communes_20_50` "
           "( insee, cp, libelle, EU_circo, code_region, "
           "  region, code_dept, dept, population, population_2010, " 
           "  superficie, menages, logements, salaries, "
           " `ets_actifs`, `ets_agriculture`,  `ets_industrie`, `ets_construction`, "
           " `ets_com_serv`, `ets_repar_auto`, `ets_adm`, `ets_moins_10`,  "
           " `ets_plus_10`, `code_epci`, `epci`, `forme`, `geolocalisation`,"
           " latitude, longitude )"
           "VALUES ("
           " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,  %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,  %s, %s, %s, %s, %s, %s, %s, %s, %s)"
          )

    for i, r in tout.iterrows():
        cursor.execute(sql, 
                       (r['insee'], r['cp'], r['libelle'], r['EU_circo'], r['code_region'], 
                        r['region'], r['code_dept'], r['dept'], r['population'], r['population_2010'], 
                        r['superficie'], r['menages'], r['logements'], r['salaries'], 
                        r['ets_actifs'], r['ets_agriculture'], r['ets_industrie'], r['ets_construction'], 
                        r['ets_com_serv'], r['ets_repar_auto'], r['ets_adm'], r['ets_moins_10'], 
                        r['ets_plus_10'], r['code_epci'], r['epci'], r['forme'], r['geolocalisation'],
                        r['latitude'],r['longitude'])
                      )

connection.commit() 