## Grab files

In [15]:
import os
from pathlib import Path
from ftplib import FTP, error_perm

# Year and month directory names used by the download and unpack loops.
# NOTE(review): the parsing cells further down read from ./2011/..., which is
# not covered by this range — confirm how that data was obtained.
years = [str(year_number) for year_number in range(2012, 2020)]
months = [f'{month_number:02d}' for month_number in range(1, 13)]

In [18]:
# Connection settings read by connect(): server name and login credentials.
HOST = 'ted.europa.eu'
USER = 'guest'
PASSWORD = 'guest'

def connect():
    """Open an FTP session on HOST and change into ``daily-packages``.

    Logs in with the module-level USER / PASSWORD.

    Returns:
        ftplib.FTP: a logged-in connection whose working directory is
        ``daily-packages``. The caller is responsible for closing it.

    Raises:
        Whatever ``ftplib`` raises on connection, login or ``cwd`` failure.
        In the latter two cases the socket is closed before re-raising.
    """
    ftp = FTP(HOST)
    try:
        ftp.login(user=USER, passwd=PASSWORD)
        ftp.cwd('daily-packages')
    except Exception:
        # Do not leak the open socket when login or cwd is rejected.
        ftp.close()
        raise
    return ftp

def grab_files(ftp, year, month):
    """Download every entry of the remote ``{year}/{month}`` directory.

    Files are stored under ``<current working directory>/<year>/<month>/``
    using their remote names. The local directory is created if needed.

    Args:
        ftp: Connected FTP object; ``{year}/{month}`` is resolved relative to
            its current remote directory.
        year: Name of the year directory, e.g. ``'2012'``.
        month: Name of the month directory, e.g. ``'01'``.

    A transfer rejected with ``error_perm`` is printed and skipped, and the
    empty/partial local file it produced is removed so later processing does
    not trip over it. Any other exception propagates.

    The connection is always terminated with ``ftp.quit()`` before this
    function exits (also on error), so ``ftp`` cannot be reused afterwards.
    """
    target_dir = Path(os.getcwd(), year, month)
    try:
        ftp.cwd(f'{year}/{month}')
        filenames = ftp.nlst()
        # Loop-invariant: create the destination once, not per file.
        target_dir.mkdir(parents=True, exist_ok=True)
        for filename in filenames:
            host_file = target_dir / filename
            try:
                with open(host_file, 'wb') as local_file:
                    ftp.retrbinary('RETR ' + filename, local_file.write)
            except error_perm as err:
                print(err)
                # open(..., 'wb') already created the file; drop the stub.
                if host_file.exists():
                    host_file.unlink()
    finally:
        ftp.quit()
    
# Download every year/month combination.
# A fresh connection is opened per month because grab_files() ends by calling
# ftp.quit(), and its cwd() call uses a path relative to the directory that
# connect() leaves the session in.
for year in years:
    for month in months:
        ftp = connect()
        grab_files(ftp, year, month)

## Unpack archives

In [12]:
import tarfile

def unpack_tar(year, month, gzipped=True):
    """Extract every archive found in ``<cwd>/<year>/<month>/`` in place.

    Args:
        year: Name of the year directory, e.g. ``'2012'``.
        month: Name of the month directory, e.g. ``'07'``.
        gzipped: Open archives as gzip-compressed tar (``'r:gz'``) when true,
            as uncompressed tar (``'r:'``) otherwise.

    Only regular files are treated as archives; sub-directories (for example
    the ones produced by a previous extraction) are skipped so the function
    can be run again on the same folder.

    Raises:
        FileNotFoundError: if the month directory does not exist.
        tarfile.TarError: if a file cannot be read as a tar archive.
    """
    cwd = os.path.join(os.getcwd(), year, month)
    mode = 'r:gz' if gzipped else 'r:'
    for file_name in os.listdir(cwd):
        archive_path = os.path.join(cwd, file_name)
        if not os.path.isfile(archive_path):
            continue
        # Context manager closes the archive even if extraction fails.
        with tarfile.open(archive_path, mode) as tar:
            # The archives come from a remote server: use the safe 'data'
            # extraction filter where this Python version provides it.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(cwd, filter='data')
            else:
                tar.extractall(cwd)

In [19]:
# Unpack the downloaded archives.
# months[6:] selects '07' through '12', and the `break` leaves the outer loop
# after its first iteration, so only the second half of years[0] is unpacked.
# NOTE(review): this looks like a partial/trial run — confirm whether the
# slice and the break should stay.
for year in years:
    for month in months[6:]:
        unpack_tar(year, month)
    break

## Parse XML files

In [62]:
from lxml import etree

def clean_xmlns(root):
    """Strip XML namespaces from all element tags below ``root``, in place.

    Every element's tag is replaced by its local name, so ``{uri}NAME``
    becomes ``NAME`` and plain tag names can be used in path expressions.

    Args:
        root: lxml element whose subtree (including itself) is rewritten.

    Returns:
        The same ``root`` object, for call chaining.
    """
    # iter() is the supported replacement for the deprecated getiterator().
    for elem in root.iter():
        # Comments and processing instructions do not have a string tag and
        # would make etree.QName() raise; leave them untouched.
        if isinstance(elem.tag, str):
            elem.tag = etree.QName(elem).localname
    return root

# Load one sample notice and strip its namespaces so the following cells can
# address elements by plain tag name.
root = etree.parse('./2011/01/20110104_001/000012_2011.xml').getroot()
root = clean_xmlns(root)

In [63]:
# Print the tag of each direct child of the document root.
for el in root:
    print(el.tag)

TECHNICAL_SECTION
LINKS_SECTION
CODED_DATA_SECTION
TRANSLATION_SECTION
FORM_SECTION


In [140]:
from collections import namedtuple

# Record for one parsed document: its DOC_ID attribute, the CATEGORY attribute
# of the selected node, and the joined paragraph text.
DOC_INFO = namedtuple('DOC_INFO', ['doc_id', 'category', 'content'])


def parse(root, node_type):
    """Extract the ``LG="DE"`` variant of ``node_type`` from a document.

    Args:
        root: Document root element (namespace-free tags expected).
        node_type: Tag name to look for, e.g. ``'CONTRACT'`` or ``'OTH_NOT'``.

    Returns:
        DOC_INFO built from the first matching node, where ``content`` is the
        text of all descendant ``P`` elements that have text, joined with
        newlines; ``None`` if the document has no such node.
    """
    german_node = root.find(f'.//{node_type}[@LG="DE"]')
    if german_node is None:
        return None

    paragraphs = (p.text for p in german_node.findall('.//P') if p.text is not None)
    return DOC_INFO(
        doc_id=root.get('DOC_ID'),
        category=german_node.get('CATEGORY'),
        content='\n'.join(paragraphs),
    )

# Maps DOC_ID -> {'category', 'content'} for every file in the sample folder
# from which text could be extracted.
result_dict = {}

for file_name in os.listdir('./2011/01/20110104_001'):
    root = etree.parse(os.path.join('./2011/01/20110104_001', file_name)).getroot()
    root = clean_xmlns(root)

    # Prefer the CONTRACT node; fall back to OTH_NOT when there is none.
    contract_info = parse(root, 'CONTRACT')
    result = contract_info if contract_info else parse(root, 'OTH_NOT')

    # Skip documents without a usable node, without text, or without an id.
    if result is None or not result.content or not result.doc_id:
        print(f'{file_name} does not contain german text')
        continue

    result_dict[result.doc_id] = {
        'category': result.category,
        'content': result.content,
    }

000181_2011.xml does not contain german text
000182_2011.xml does not contain german text
000183_2011.xml does not contain german text
000184_2011.xml does not contain german text
000185_2011.xml does not contain german text
000186_2011.xml does not contain german text
000187_2011.xml does not contain german text
000188_2011.xml does not contain german text
000189_2011.xml does not contain german text
000190_2011.xml does not contain german text
000191_2011.xml does not contain german text
000192_2011.xml does not contain german text
000193_2011.xml does not contain german text
000194_2011.xml does not contain german text
000195_2011.xml does not contain german text
000196_2011.xml does not contain german text
000197_2011.xml does not contain german text
000198_2011.xml does not contain german text
000199_2011.xml does not contain german text
000200_2011.xml does not contain german text
000201_2011.xml does not contain german text
000202_2011.xml does not contain german text
000203_201

000422_2011.xml does not contain german text
000423_2011.xml does not contain german text
000424_2011.xml does not contain german text
000425_2011.xml does not contain german text
000426_2011.xml does not contain german text
000427_2011.xml does not contain german text
000428_2011.xml does not contain german text
000429_2011.xml does not contain german text
000430_2011.xml does not contain german text
000431_2011.xml does not contain german text
000432_2011.xml does not contain german text
000433_2011.xml does not contain german text
000434_2011.xml does not contain german text
000435_2011.xml does not contain german text
000436_2011.xml does not contain german text
000437_2011.xml does not contain german text
000438_2011.xml does not contain german text
000439_2011.xml does not contain german text
000440_2011.xml does not contain german text
000441_2011.xml does not contain german text
000442_2011.xml does not contain german text
000443_2011.xml does not contain german text
000444_201

000682_2011.xml does not contain german text
000683_2011.xml does not contain german text
000684_2011.xml does not contain german text
000685_2011.xml does not contain german text
000686_2011.xml does not contain german text
000687_2011.xml does not contain german text
000688_2011.xml does not contain german text
000689_2011.xml does not contain german text
000690_2011.xml does not contain german text
000691_2011.xml does not contain german text
000692_2011.xml does not contain german text
000693_2011.xml does not contain german text
000694_2011.xml does not contain german text
000695_2011.xml does not contain german text
000696_2011.xml does not contain german text
000697_2011.xml does not contain german text
000698_2011.xml does not contain german text
000699_2011.xml does not contain german text
000700_2011.xml does not contain german text
000701_2011.xml does not contain german text
000702_2011.xml does not contain german text
000703_2011.xml does not contain german text
000704_201

000956_2011.xml does not contain german text
000957_2011.xml does not contain german text
000958_2011.xml does not contain german text
000959_2011.xml does not contain german text
000960_2011.xml does not contain german text
000961_2011.xml does not contain german text
000962_2011.xml does not contain german text
000963_2011.xml does not contain german text
000964_2011.xml does not contain german text
000965_2011.xml does not contain german text
000966_2011.xml does not contain german text
000967_2011.xml does not contain german text
000968_2011.xml does not contain german text
000969_2011.xml does not contain german text
000970_2011.xml does not contain german text
000971_2011.xml does not contain german text
000972_2011.xml does not contain german text
000973_2011.xml does not contain german text
000974_2011.xml does not contain german text
000975_2011.xml does not contain german text
000976_2011.xml does not contain german text
000977_2011.xml does not contain german text
000978_201

001235_2011.xml does not contain german text
001236_2011.xml does not contain german text
001237_2011.xml does not contain german text
001238_2011.xml does not contain german text
001239_2011.xml does not contain german text
001240_2011.xml does not contain german text
001241_2011.xml does not contain german text
001242_2011.xml does not contain german text
001243_2011.xml does not contain german text
001244_2011.xml does not contain german text
001245_2011.xml does not contain german text
001246_2011.xml does not contain german text
001247_2011.xml does not contain german text
001248_2011.xml does not contain german text
001249_2011.xml does not contain german text
001250_2011.xml does not contain german text
001251_2011.xml does not contain german text
001252_2011.xml does not contain german text
001253_2011.xml does not contain german text
001254_2011.xml does not contain german text
001256_2011.xml does not contain german text
001257_2011.xml does not contain german text
001258_201

001436_2011.xml does not contain german text
001437_2011.xml does not contain german text
001438_2011.xml does not contain german text
001439_2011.xml does not contain german text
001441_2011.xml does not contain german text
001442_2011.xml does not contain german text
001443_2011.xml does not contain german text
001444_2011.xml does not contain german text
001446_2011.xml does not contain german text
001447_2011.xml does not contain german text
001448_2011.xml does not contain german text
001449_2011.xml does not contain german text
001450_2011.xml does not contain german text
001451_2011.xml does not contain german text
001452_2011.xml does not contain german text
001453_2011.xml does not contain german text
001454_2011.xml does not contain german text
001455_2011.xml does not contain german text
001456_2011.xml does not contain german text
001457_2011.xml does not contain german text
001458_2011.xml does not contain german text
001459_2011.xml does not contain german text
001460_201

001678_2011.xml does not contain german text
001680_2011.xml does not contain german text
001681_2011.xml does not contain german text
001682_2011.xml does not contain german text
001683_2011.xml does not contain german text
001684_2011.xml does not contain german text
001685_2011.xml does not contain german text
001690_2011.xml does not contain german text
001691_2011.xml does not contain german text
001692_2011.xml does not contain german text
001693_2011.xml does not contain german text
001694_2011.xml does not contain german text
001695_2011.xml does not contain german text
001696_2011.xml does not contain german text
001697_2011.xml does not contain german text
001698_2011.xml does not contain german text
001699_2011.xml does not contain german text
001700_2011.xml does not contain german text
001701_2011.xml does not contain german text
001702_2011.xml does not contain german text
001703_2011.xml does not contain german text
001704_2011.xml does not contain german text
001705_201

In [141]:
# Number of documents stored in result_dict (one entry per DOC_ID).
len(result_dict)

215

In [142]:
# Display the collected mapping (this renders every entry in full).
result_dict

{'000001-2011': {'category': 'TRANSLATION',
  'content': 'RS-Pristina: IPA — Unterstützung des Justizrats und des Rats der Staatsanwälte des Kosovo\nStandort: Europa (nicht EU) — Kosovo\nAuftragsvorankündigung\nDienstleistungsaufträge\nGemeinsames Vokabular für öffentliche Aufträge (CPV):\nHauptgegenstand:\n75242000 Dienstleistungen im Bereich öffentliches Recht und öffentliche Ordnung\nEuropeAid/130893/C/SER/XK.\nNichtoffen.\nIPA.\nIPA 2010.\nAuf Honorarbasis.\nDer Justizrat des Kosovo spielt eine entscheidende Rolle bei der Sicherung der Unabhängigkeit, Effizienz und Gerechtigkeit des Justizsystems des Kosovo. \nDas System der internen Kontrolle der Leistung und des ethischen Verhaltens von Richtern und Staatsanwälten untersteht dem Justizrat des Kosovo. Ein neuer Rat der Staatsanwälte wird das für Staatsanwälte zuständige Organ sein; es wird seine Struktur, Richtlinien und Verwaltung noch festlegen müssen.\nDie folgenden Hauptbereiche müssen verbessert werden: \n1) Sicherstellung, d

In [145]:
import pandas as pd
# Tabular view: one row per DOC_ID, one column per key of the inner dicts.
pd.DataFrame(list(result_dict.values()), index=list(result_dict.keys()))

Unnamed: 0,category,content
000001-2011,TRANSLATION,RS-Pristina: IPA — Unterstützung des Justizrat...
000002-2011,TRANSLATION,RS-Pristina: IPA — Stärkung der Institutionen ...
000003-2011,TRANSLATION,RS-Pristina: IPA — Unterstützung des Ministeri...
000004-2011,TRANSLATION,RS-Pristina: IPA — EU-Stipendienprogramm \nSta...
000005-2011,TRANSLATION,RS-Pristina: IPA — Unterstützung der Implement...
...,...,...
001679-2011,ORIGINAL,Dienstleistungsauftrag zur Durchführung des Pr...
001686-2011,ORIGINAL,Gebäudegruppe Erweiterungsbauten Campus Leupha...
001687-2011,ORIGINAL,"Berufsbildungszentrum I, Erweiterungsbau Druck..."
001688-2011,ORIGINAL,"Projekt: Neuordnung der östlichen Domumgebung,..."


In [100]:
# Scratch check: joined paragraph text of the first OTH_NOT node with LG="DE".
# NOTE(review): relies on whatever `root` currently holds in the kernel and
# raises IndexError if that document has no such node — confirm this cell is
# still needed.
lang = root.findall('.//OTH_NOT[@LG="DE"]')[0]
'\n'.join([el.text for el in lang.findall('.//P') if el.text is not None])

'EZB - T137 — Reprodienste für das neue EZB-Gebäude (D-Frankfurt am Main)\nBekanntmachung über vergebene Aufträge\nAusschreibung T137 \nAbschluss 22.12.2010\nHauptadresse des öffentlichen Auftraggebers: http://www.ecb.europa.eu\nAdresse des Ausschreibungsforums für die Ausschreibung betreffend den Neubau des EZB-Gebäudes:\nhttp://www.ausschreibung.ezb-neubau.de\nEuropäische Institution/Agentur oder internationale Organisation.\nWirtschaft und Finanzen.\nDer öffentliche Auftraggeber beschafft im Auftrag anderer öffentlicher Auftraggeber: Nein. \nT137 — Reprodienste für das neue EZB-Gebäude.\nDienstleistung.\nDienstleistungskategorie Nr. 15.\nHauptort der Ausführung, der Lieferung bzw. Dienstleistungserbringung:\nFrankfurt am Main, DEUTSCHLAND.\nNUTS-Code: DE712.\nAbschluss einer Rahmenvereinbarung.\nDie Europäische Zentralbank (EZB) errichtet ihre neue Hauptverwaltung auf dem Gelände der ehemaligen Großmarkthalle in Frankfurt am Main, Deutschland. Der Auftragnehmer erbringt während der 

In [103]:
# Scratch version of the extraction steps that parse() performs, hard-wired to
# OTH_NOT and without the IndexError handling.
# NOTE(review): depends on the `root` left in the kernel by an earlier cell —
# presumably superseded by parse(); confirm before keeping.
doc_id = root.get('DOC_ID')
lang = root.findall('.//OTH_NOT[@LG="DE"]')[0]
category = lang.get('CATEGORY')
content = '\n'.join([el.text for el in lang.findall('.//P') if el.text is not None])