In [51]:
import glob
import os
import tarfile
import tempfile
import zipfile
from urllib.request import Request, urlopen, urlretrieve

import bs4 as BeautifulSoup
import progressbar

In [2]:
# Header dictionary passed to urllib `Request` objects so that calls identify
# themselves with a browser-like User-Agent.
casual_header = {"User-Agent": "Mozilla/5.0"}

# Root URL under which every BOAMP archive used in this notebook is fetched.
base_url = "https://echanges.dila.gouv.fr/OPENDATA/BOAMP/"

In [3]:
def clear_link(link):
    """Extract the ``BOAMP….taz`` file name embedded in ``link``.

    Parameters
    ----------
    link : object
        Anything whose ``str()`` form may contain an archive name, e.g. an
        ``<a>`` tag taken from a directory listing.

    Returns
    -------
    str
        The substring running from the first ``'BOAMP'`` up to and including
        the first ``'.taz'`` that follows it, or ``''`` when either marker is
        missing.
    """
    text = str(link)

    start = text.find('BOAMP')
    if start == -1:
        # Without this guard the -1 would be used as a slice index and a
        # meaningless fragment of the input would be returned.
        return ''

    # Look for the extension only after the prefix so an unrelated '.taz'
    # earlier in the string cannot produce an empty or truncated result.
    end = text.find('.taz', start)
    if end == -1:
        return ''

    return text[start:end + len('.taz')]

In [4]:
def map(n, xStart, yStart, xTarget, yTarget):
    """Linearly rescale ``n`` from one interval to another.

    ``xStart`` maps to ``xTarget`` and ``yStart`` maps to ``yTarget``; values
    in between (or outside) are interpolated (or extrapolated) linearly.

    NOTE(review): this function shadows the built-in ``map`` for every cell
    executed after it; the name is kept so existing callers keep working.

    Parameters
    ----------
    n : number
        Value to rescale.
    xStart, yStart : number
        Bounds of the source interval.
    xTarget, yTarget : number
        Bounds of the destination interval.

    Returns
    -------
    number
        The rescaled value.

    Raises
    ------
    ZeroDivisionError
        If the source interval is empty (``xStart == yStart``).
    """
    # Offset by xStart and divide by the interval *width*; without that the
    # result is only correct when the source interval starts at 0.
    return xTarget + (n - xStart) / (yStart - xStart) * (yTarget - xTarget)

In [55]:
def extract_archives(folder, year):
    """Unpack the archives previously downloaded into ``folder`` for ``year``.

    Two on-disk layouts are handled, selected by ``year``:

    * 2017 and 2018: every ``*.taz`` file directly inside ``folder`` is opened
      as a ZIP container and unpacked into a scratch directory; each file
      found at the top level of that scratch directory is then opened with
      ``tarfile`` and unpacked into ``folder``. Once every container has been
      processed the ``*.taz`` files are deleted.
    * 2005 to 2016: every ``*.zip`` file located one level down
      (``folder/<subfolder>/``) is unpacked next to itself. Directory entries
      are skipped and the stored directory structure is dropped, so members
      sharing a base name overwrite each other.

    Any other ``year`` is a no-op.

    Parameters
    ----------
    folder : str
        Directory holding the downloaded archives; extraction happens in place.
    year : int
        Year the archives belong to; only used to pick the layout.

    Raises
    ------
    zipfile.BadZipFile, tarfile.TarError
        If a file selected for extraction is not a valid archive.
    """
    # glob metacharacters in the folder name must not be interpreted.
    root = glob.escape(folder)

    if year in (2017, 2018):
        # The archives come from the network: use the restrictive "data"
        # extraction filter when this Python version provides it.
        tar_options = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

        containers = sorted(glob.glob(os.path.join(root, '*.taz')))
        for container in containers:
            # A fresh scratch directory per container: it is removed even if
            # extraction fails, and never collides with an existing 'tmp'.
            with tempfile.TemporaryDirectory(dir=folder) as staging:
                with zipfile.ZipFile(container, 'r') as outer:
                    outer.extractall(staging)

                for inner in sorted(glob.glob(os.path.join(glob.escape(staging), '*'))):
                    if not os.path.isfile(inner):
                        continue
                    with tarfile.open(inner) as tar:
                        tar.extractall('{}/'.format(folder), **tar_options)

        # Only drop the downloads once everything was unpacked successfully.
        for container in containers:
            os.remove(container)

    elif year in range(2005, 2017):
        # Restricting the pattern to '*.zip' keeps a second run from tripping
        # over the files extracted by the first one.
        for archive in sorted(glob.glob(os.path.join(root, '*', '*.zip'))):
            target = os.path.dirname(archive)
            with zipfile.ZipFile(archive) as bundle:
                for member in bundle.infolist():
                    if member.filename.endswith('/'):
                        continue  # directory entry, nothing to write
                    # Flatten: keep only the file name, not the stored path.
                    member.filename = os.path.basename(member.filename)
                    bundle.extract(member, target)

In [6]:
# Year -> name of the versioned directory used in the "FluxHistorique" URL
# built by loadnsave(). Listed from the most recent year down to 2005.
api_dic = {
    **dict.fromkeys((2016, 2015), 'Boamp_v230'),
    **dict.fromkeys(range(2014, 2006, -1), 'Boamp_v110'),
    **dict.fromkeys((2006, 2005), 'Boamp_v010'),
}

# The three dictionaries below map a short key to the remote sub-folder name.
# download_data() iterates `very_old` for 2005-2006, `old` for 2007-2014 and
# `recent` for 2015-2016.
very_old = {name: name for name in ('mpa', 'mpb', 'mpc')}

old = {**very_old, 'mapa': 'mapa'}

recent = dict(
    jao='BOAMP-J-AO',
    jicaa='BOAMP-J-IC-AA',
    nao='BOAMP-N-AO',
    nicaa='BOAMP-N-IC-AA',
    mapao='MAPA-AO',
    maaicaa='MAPA-IC-AA',
)

# Union of every key above; this is the table loadnsave() looks keys up in.
utilidic = {**recent, **old}

In [7]:
def loadnsave(key, year, folder):
    """Download one historical ``xml.zip`` archive and store it under ``folder``.

    The archive is fetched from
    ``<base_url>FluxHistorique/<api_dic[year]>/<year>/<utilidic[key]>/xml.zip``
    and written to ``<folder>/<utilidic[key]>/xml.zip``; the target directory
    is created when missing.

    Parameters
    ----------
    key : str
        Key of ``utilidic`` selecting the remote sub-folder.
    year : int
        Key of ``api_dic``; also a component of the URL.
    folder : str
        Local directory that receives the ``<utilidic[key]>/xml.zip`` file.

    Raises
    ------
    KeyError
        If ``key`` is not in ``utilidic`` or ``year`` is not in ``api_dic``.
    urllib.error.URLError
        If the download fails.
    """
    # Resolve both lookups first so a bad key/year leaves no directory behind.
    remote_name = utilidic[key]
    url = base_url + 'FluxHistorique/{}/{}/{}/xml.zip'.format(api_dic[year],
                                                              year,
                                                              remote_name)

    target_dir = os.path.join(folder, remote_name)
    # exist_ok: re-running the download for the same year must not fail.
    os.makedirs(target_dir, exist_ok=True)

    # Send the prepared request (not the bare URL) so the User-Agent header
    # is actually transmitted, and close both handles deterministically.
    req = Request(url, headers=casual_header)
    with urlopen(req) as response, \
            open(os.path.join(target_dir, 'xml.zip'), 'wb') as out:
        out.write(response.read())

In [8]:
def download_data(year, folder):
    """Download every archive published for ``year`` into ``folder`` and unpack it.

    * 2005 to 2016: one ``xml.zip`` per key is fetched through ``loadnsave``,
      using the keys of ``very_old`` (2005-2006), ``old`` (2007-2014) or
      ``recent`` (2015-2016).
    * 2017 and 2018: the listing page ``<base_url><year>/`` is downloaded,
      every ``<a>`` tag mentioning ``.taz`` is reduced to a file name with
      ``clear_link`` and each distinct file is downloaded into ``folder``.

    In both cases ``extract_archives(folder, year)`` is called afterwards.

    Parameters
    ----------
    year : int
        Year to download, between 2005 and 2018 inclusive.
    folder : str
        Destination directory; created when missing.

    Raises
    ------
    ValueError
        If ``year`` is outside 2005-2018. Raised before anything is created
        on disk or requested from the network.
    urllib.error.URLError
        If a download fails.
    """
    historical_years = range(2005, 2017)
    listed_years = (2017, 2018)

    # Explicit membership test: the expression `year == 2018 or 2017` is
    # always truthy and would send every other year into the scraping branch.
    if year not in historical_years and year not in listed_years:
        raise ValueError(
            'no BOAMP download rule for year {!r} (supported: 2005-2018)'.format(year))

    os.makedirs(folder, exist_ok=True)

    if year in historical_years:
        if year >= 2015:
            keys = recent
        elif year >= 2007:
            keys = old
        else:
            keys = very_old

        for key in keys:
            loadnsave(key, year, folder)

    else:
        year_url = '{}{}/'.format(base_url, year)

        with urlopen(Request(year_url, headers=casual_header)) as response:
            listing = response.read()

        # Hand the raw bytes to the parser (it detects the encoding itself)
        # and name the parser so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup.BeautifulSoup(listing, 'html.parser')

        links = set()
        for anchor in soup.find_all('a'):
            if '.taz' in str(anchor):
                cleared = clear_link(anchor)
                if len(cleared) > 0:
                    links.add(cleared)

        for link in sorted(links):
            req = Request(year_url + link, headers=casual_header)
            with urlopen(req) as response, \
                    open(os.path.join(folder, link), 'wb') as out:
                out.write(response.read())

    extract_archives(folder, year)

        
    


In [53]:
def download_all():
    """Download and unpack every year from 2005 to 2018.

    Each year is stored in a directory named after it (``'2005'``, ``'2006'``,
    …) relative to the current working directory, by calling
    ``download_data(year, str(year))``. A console progress bar advances after
    each completed year.
    """
    years = list(range(2005, 2019))

    bar = progressbar.ProgressBar(
        maxval=len(years),
        widgets=[progressbar.Bar('=', '[', ']'), ' ',
                 progressbar.Percentage(), ' (', progressbar.ETA(), ') '])
    bar.start()

    for completed, year in enumerate(years, start=1):
        download_data(year, str(year))
        # Report progress once the year is done so the bar reaches 100 %.
        bar.update(completed)

    bar.finish()


In [54]:
# Run the full pipeline: download_all() downloads and unpacks every year it
# lists, one directory per year, so this cell performs many network requests.
download_all()



In [257]:
# Scratch cell: the twelve consecutive years starting at 2005.
years = list(range(2005, 2017))

In [260]:
# Scratch check: display the first two entries of `years`.
years[:2]

[2005, 2006]