<a href="https://colab.research.google.com/github/dataforgoodfr/batch8_mednum/blob/master/notebooks/acces_information.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Accès à l'information

## 1. Liste des médiathèques


In [20]:
# get mediatheques map
import pandas as pd
import numpy as np
from pathlib import Path

external_data = Path('../data/external/')
processed_data = Path('../data/processed/')
raw_data = Path('../data/raw/')
interim_data = Path('../data/interim/')

In [21]:
commune = pd.read_csv(raw_data/'table_insee_libcom_dep.csv')
del commune['Unnamed: 0']
commune

Unnamed: 0,CODE_INSEE,LIBCOM,DEP
0,01001,L'Abergement-Clémenciat,01
1,01002,L'Abergement-de-Varey,01
2,01004,Ambérieu-en-Bugey,01
3,01005,Ambérieux-en-Dombes,01
4,01006,Ambléon,01
...,...,...,...
35005,97613,M'Tsangamouji,976
35006,97614,Ouangani,976
35007,97615,Pamandzi,976
35008,97616,Sada,976


In [22]:
df = pd.read_csv(raw_data/'liste-donnees-mediatheques.csv', header=0, sep=';')
df.rename(columns={'Code postal': 'CODE_INSEE', 'Département': 'DEP'}, inplace=True)
print(df.shape)
df.head(2)

(844, 21)


Unnamed: 0,Nom structure,Adresse,CODE_INSEE,DEP,Ville,Année(s) subvention Région,Montant subvention 2000,Montant subvention 2001,Montant subvention 2002,Montant subvention 2003,...,Montant subvention 2005,Montant subvention 2006,Montant subvention 2007,Montant subvention 2008,Montant subvention 2009,Montant subvention 2010,Montant subvention 2011,Montant subvention 2012,Montant subvention 2013,wgs84
0,MÉDIATHÈQUE SAINT-JOHN PERSE,2 rue Édouard Poisson,93300,93,AUBERVILLIERS,,,,,,...,,,,,,,,,,"48.9104747,2.385008"
1,MÉDIATHÈQUE GEORGES PÉREC,8-10 rue du Marqis de Raies,91080,91,COURCOURONNES,,,,,,...,,,,,,,,,,"48.6293899,2.4135832"


In [23]:
df = df[['CODE_INSEE', 'Ville', 'DEP', 'Nom structure']]
df.rename(columns={'Ville': 'Ville_bibliotheques'}, inplace=True)
df['CODE_INSEE'] = df['CODE_INSEE'].astype(str)
df['DEP'] = df['DEP'].astype(str)
df

Unnamed: 0,CODE_INSEE,Ville_bibliotheques,DEP,Nom structure
0,93300,AUBERVILLIERS,93,MÉDIATHÈQUE SAINT-JOHN PERSE
1,91080,COURCOURONNES,91,MÉDIATHÈQUE GEORGES PÉREC
2,91310,LINAS,91,MÉDIATHÈQUE LES MUSES
3,91530,LE-VAL-SAINT-GERMAIN,91,MÉDIATHÈQUE MUNICIPALE
4,91800,BOUSSY-SAINT-ANTOINE,91,BIBLIOTHÈQUE LE GRENIER
...,...,...,...,...
839,78660,BOINVILLE-LE-GAILLARD,78,BIBLIOTHÈQUE MUNICIPALE
840,94000,CRETEIL,94,MÉDIATHÈQUE DE L'ABBAYE - NELSON MANDELA
841,78490,MERE,78,BIBLIOTHÈQUE FRANÇOIS QUESNAY
842,77860,QUINCY-VOISINS,77,MÉDIATHÈQUE MUNICIPALE


In [24]:
mediatheque = commune.merge(df, on=['CODE_INSEE', 'DEP'], how='left').reset_index(drop=True)
mediatheque['acces_information'] = mediatheque['Ville_bibliotheques'].apply(lambda x: 0 if pd.isna(x) else 1)
mediatheque = mediatheque[['CODE_INSEE', 'LIBCOM', 'DEP', 'acces_information']] 
mediatheque.drop_duplicates(inplace=True)
mediatheque

Unnamed: 0,CODE_INSEE,LIBCOM,DEP,acces_information
0,01001,L'Abergement-Clémenciat,01,0
1,01002,L'Abergement-de-Varey,01,0
2,01004,Ambérieu-en-Bugey,01,0
3,01005,Ambérieux-en-Dombes,01,0
4,01006,Ambléon,01,0
...,...,...,...,...
35114,97613,M'Tsangamouji,976,0
35115,97614,Ouangani,976,0
35116,97615,Pamandzi,976,0
35117,97616,Sada,976,0


In [28]:
mediatheque.to_csv(f'{interim_data}/data_mediatheques.csv', index=False)

## 2. Liste des centres sociaux

In [None]:
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib import request

import time


def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()        
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result    
    return timed

In [None]:
@timeit
def scrap_table_page(url='https://www.senacs.fr/structure/csx?page=', page_number=1):
    """
    Generate a request on senacs page for a given 
    page number to get the name and location of 
    each social center.
    """

    url_page = url + str(page_number)
    request_text = request.urlopen(url_page).read()
    page = BeautifulSoup(request_text, 'html.parser')

    table = page.find('table', {'class': 'table table-striped table-hover'}).findAll('td')

    # fill the values of id, name and city
    ids = [id.getText() for id in table[::4]]
    names = [name.getText() for name in table[1::4]]
    cities = [city.getText() for city in table[2::4]]

    df = pd.DataFrame({'id': ids, 
                      'nom': names,
                      'ville': cities}
                      )
    df['page'] = page_number

    return df


def stack_pages(pages=range(91)):
    """
    Concatenate all empruntis scrapped data per region.
    """

    data = pd.DataFrame()    # empty Dataframe to fill
    for page_number in pages:
        temp = scrap_table_page(page_number=page_number)
        data = pd.concat([data, temp], axis=0)

    return data

In [None]:
data = stack_pages()

In [None]:
data['code_postal'] = data['ville'].apply(lambda x: x.split('-')[0])
data['ville'] = data['ville'].apply(lambda x: x.split('-')[1])

In [None]:
data

Unnamed: 0,id,nom,ville,page,code_postal
0,91,CENTRE SOCIAL DES GRANDES BORNES,Goussainville,0,95190
1,92,CENTRE SOCIAL ARCHIPELIA,PARIS 20EME ARRONDISSEMENT,0,75020
2,93,Espace Socioculturel Val de Charente,Ruffec,0,16700
3,94,CENTRE SOCIAL Maison Des Habitants Champaret,Bourgoin,0,38300
4,95,CENTRE SOCIAL MAISON DE LA CHALLE,Éragny,0,95610
...,...,...,...,...,...
6,16264,CENTRE SOCIOCULTUREL MUNICIPAL DE SAINT-DIZIER,Saint,90,52100
7,16267,ESPACE DE VIE SOCIALE L.E.P.H.A.R.E,Aniche,90,59580
8,16351,ASSOCIATION FERME BEAUREPAIRE,Boulogne,90,62200
9,16355,OFFICE DE LA JEUNESSE,Bruay,90,62700


In [None]:
dir = '/content/drive/My Drive/Colab Notebooks/'
data.to_csv(f'{dir}data_centre_sociaux.csv', index=False)

## 3. Distance des communes à l'accès le plus proche


In [None]:
# copy geoson file with every 'Franceservices' locations

import os

if 'COLAB_GPU' in os.environ:  # this is always set on Colab, the value is 0 or 1 depending on whether a GPU is attached
    from google.colab import auth
    #auth.authenticate_user()

    !rm -rf France-services/ 
    !git clone https://github.com/cget-carto/France-services.git
    !mv France-services/data/france_services.geojson .

Cloning into 'France-services'...
remote: Enumerating objects: 260, done.[K
remote: Counting objects: 100% (260/260), done.[K
remote: Compressing objects: 100% (169/169), done.[K
remote: Total 260 (delta 108), reused 221 (delta 78), pack-reused 0[K
Receiving objects: 100% (260/260), 2.12 MiB | 11.94 MiB/s, done.
Resolving deltas: 100% (108/108), done.


In [None]:
import pandas as pd
import json

with open("france_services.geojson", "r") as read_file: 
    fs = json.load(read_file) 

In [None]:
data_fs = pd.DataFrame()
for n, temp in enumerate(fs['features']): 
 
    df_temp = pd.DataFrame.from_records([{'latitude': temp['properties']['LATITUDE'], 
                            'longitude': temp['properties']['LATITUDE'], 
                            'departement': temp['properties']['DEPARTEMENT'],
                            'insee_com': temp['properties']['insee_com'],
                            'code_postal': temp['properties']['code_postal'], 
                            'lib_france_services': temp['properties']['lib_france_services']
                            }])
    
    data_fs = pd.concat([data_fs, df_temp], axis=0)

In [None]:
data_fs

Unnamed: 0,latitude,longitude,departement,insee_com,code_postal,lib_france_services
0,46.105747,46.105747,1,01033,01200,Valserhône
0,44.031548,44.031548,12,12197,12170,Réquista « Le Bercail »
0,44.475185,44.475185,12,12138,12330,Conques-Marcillac
0,49.291400,49.291400,14,14514,14130,Terre d’Auge
0,49.126169,49.126169,14,14057,14370,Val ès Dunes
...,...,...,...,...,...,...
0,47.617932,47.617932,89,89368,89520,Saint-Sauveur-en-Puisaye
0,48.980007,48.980007,95,95219,95120,Ermont
0,47.181000,47.181000,25,25527,25410,Saint Vit
0,43.208769,43.208769,31,31375,31310,Volvestre – Antenne Montesquieu


### Calculer la distance entre le centre des communes et la prefecture, sous-prefecture ou france-services la plus proche en km. 

- scrapper prefecture / sous-prefecture https://fr.wikipedia.org/wiki/Liste_des_pr%C3%A9fectures_de_France
- obtenir coordonnées des communes: OK via codes postaux + librairie `pgecode`

In [None]:
#data['distance'] = dist.query_postal_code(data[''].values, data[''].values)

# calculer le france-service (ou prefecture, sous-prefecture etc.) le plus proche de chaque commune et bueno 

In [None]:
def df_crossjoin(df1, df2, **kwargs):
    """
    Make a cross join (cartesian product) between two dataframes by using a constant temporary key.
    Also sets a MultiIndex which is the cartesian product of the indices of the input dataframes.
    See: https://github.com/pydata/pandas/issues/5401
    :param df1 dataframe 1
    :param df1 dataframe 2
    :param kwargs keyword arguments that will be passed to pd.merge()
    :return cross join of df1 and df2
    """
    df1['_tmpkey'] = 1
    df2['_tmpkey'] = 1

    res = pd.merge(df1, df2, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1)
    res.index = pd.MultiIndex.from_product((df1.index, df2.index))

    df1.drop('_tmpkey', axis=1, inplace=True)
    df2.drop('_tmpkey', axis=1, inplace=True)

    return res

df = df_crossjoin(commune[['code_postal']], data_fs[['code_postal']], suffixes=('_communes', '_fs')).reset_index()
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,level_0,level_1,code_postal_communes,code_postal_fs
0,0,0,01500,01200
1,0,0,01500,12170
2,0,0,01500,12330
3,0,0,01500,14130
4,0,0,01500,14370
...,...,...,...,...
35586331,39191,0,98799,89520
35586332,39191,0,98799,95120
35586333,39191,0,98799,25410
35586334,39191,0,98799,31310


In [None]:
df.drop_duplicates()

Unnamed: 0,level_0,level_1,code_postal_communes,code_postal_fs
0,0,0,01500,01200
1,0,0,01500,12170
2,0,0,01500,12330
3,0,0,01500,14130
4,0,0,01500,14370
...,...,...,...,...
35586331,39191,0,98799,89520
35586332,39191,0,98799,95120
35586333,39191,0,98799,25410
35586334,39191,0,98799,31310


In [None]:
df[:len(data_fs)].groupby('code_postal_communes').apply(lambda row: dist.query_postal_code(row['code_postal_communes'], row['code_postal_fs']))

code_postal_communes
01500    []
dtype: object

In [None]:
df[:len(data_fs)].apply(lambda row: dist.query_postal_code(row['code_postal_communes'], row['code_postal_fs']), axis=1)

0       42.623548
1      301.434162
2      275.230403
3      527.270586
4      544.313797
          ...    
903    238.162672
904    409.646169
905    143.125315
906    446.226215
907    431.109707
Length: 908, dtype: float64

In [26]:
#df['distance_commune_france_service'] = df.apply(lambda row: dist.query_postal_code(row['code_postal_communes'], row['code_postal_fs']), axis=1)

In [None]:
# take the minimum per commune and OK
# df.groupby('code_postal_communes').agg({'distance_commune_france_service': ['mean', 'min']})

In [27]:
#df