In [1]:
import os
import re
import json
import time
import pickle
import requests
from requests.adapters import HTTPAdapter, Retry
from pprint import pp
from collections import Counter

In [2]:
# Project directory layout, resolved relative to the notebook's working
# directory. Every entry ends with a path separator because later cells build
# file paths by plain string concatenation (e.g. dirs['data']+'facets.pkl').
# os.path.join(..., '') appends the platform's own separator instead of a
# hard-coded backslash, so the paths are valid outside Windows as well.
dirs = {'project': os.path.join(os.path.abspath('..'), '')}
dirs.update({
    name: os.path.join(dirs['project'], name, '')
    for name in ('code', 'data', 'models', 'results')
})

In [3]:
class ftpreq:
    """Small wrapper around a ``requests.Session`` bound to one endpoint URL.

    The session retries on HTTP 502/503/504 with exponential backoff, and POST
    is added to the set of retryable methods. The last query parameters,
    multipart body and response are kept on the instance as ``p``, ``f`` and
    ``resp`` for inspection.
    """

    def __init__(self, uri, retries=5):
        """Create the session and install the retry policy.

        Args:
            uri: endpoint URL every ``get``/``post`` call is sent to.
            retries: value passed to ``Retry(total=...)``.
        """
        self.u = uri
        self.s = requests.Session()
        self.retr = Retry(
            total=retries,
            backoff_factor=1,
            status_forcelist=[502, 503, 504],
            allowed_methods=frozenset([*Retry.DEFAULT_ALLOWED_METHODS, 'POST']),
        )
        adapter = HTTPAdapter(max_retries=self.retr)
        # Mount for both schemes: with only 'http://' mounted, https:// URLs
        # keep the session's default adapter and are never retried.
        for scheme in ('http://', 'https://'):
            self.s.mount(scheme, adapter)

    @staticmethod
    def _build_params(params, text, apiKey, page, pageSize):
        """Return explicit ``params`` if given, else the default query string dict."""
        if params:
            return params
        return {'text': text, 'apiKey': apiKey, 'pageSize': str(pageSize), 'pageNumber': str(page)}

    def get(self, params=None, text='***', apiKey='SEDIA', page=1, pageSize=100):
        """Send a GET request to the endpoint.

        Args:
            params: complete query-parameter dict; when truthy it is used as-is
                and the remaining arguments are ignored.
            text, apiKey, page, pageSize: used to build the query parameters
                when ``params`` is not given.

        Returns:
            The ``requests.Response`` (also stored as ``self.resp``).
        """
        self.p = self._build_params(params, text, apiKey, page, pageSize)
        self.resp = self.s.get(url=self.u, params=self.p)
        return self.resp

    def post(self, query=None, params=None, text='***', apiKey='SEDIA', page=1, pageSize=100, languages=["en"], sort={"field": "identifier", "order": "ASC"}):
        """Send a POST request to the endpoint, optionally with a JSON query.

        Args:
            query: JSON-serialisable query; when truthy it is sent, together
                with ``languages`` and ``sort``, as a multipart form whose
                parts are JSON documents. When falsy no body is sent.
            params: complete query-parameter dict; when truthy it is used as-is
                and ``text``/``apiKey``/``page``/``pageSize`` are ignored.
            text, apiKey, page, pageSize: used to build the query parameters
                when ``params`` is not given.
            languages, sort: extra multipart parts; the defaults are only read,
                never mutated.

        Returns:
            The ``requests.Response`` (also stored as ``self.resp``).
        """
        self.p = self._build_params(params, text, apiKey, page, pageSize)

        if query:
            # NOTE(review): the 'languages' and 'sort' parts follow an external
            # example; their exact effect on the API has not been verified here.
            self.f = {
                'query': (None, json.dumps(query), 'application/json'),
                'languages': (None, json.dumps(languages), "application/json"),
                'sort': (None, json.dumps(sort), "application/json"),
            }
        else:
            self.f = None

        self.resp = self.s.post(url=self.u, params=self.p, files=self.f)
        return self.resp

In [4]:
# Fetch the facet listing once and cache the Response on disk; set
# load_from_disk=True to reuse the cached copy instead of calling the API.
# NOTE(review): the cache is a pickled Response object. Only unpickle files
# written by this notebook, since pickle.load can execute arbitrary code.
load_from_disk=False
facets_cache = dirs['data']+'facets.pkl'
if load_from_disk and os.path.isfile(facets_cache):
    with open(facets_cache, 'rb') as f:
        facets=pickle.load(f)
else:
    facets=ftpreq('https://api.tech.ec.europa.eu/search-api/prod/rest/facet').post()
    # Fail loudly instead of caching an error response for later runs.
    facets.raise_for_status()
    # The data directory may not exist yet on a fresh checkout.
    os.makedirs(dirs['data'], exist_ok=True)
    with open(facets_cache, 'wb') as f:
        pickle.dump(facets, f)
facets

<Response [200]>

In [5]:
# Top level of the decoded facet payload; depth=1 elides nested containers.
pp(facets.json(), depth=1)

{'apiVersion': '2.120', 'terms': '*', 'facets': [...]}


In [6]:
# Every entry of "facets" is a dict carrying a "name" and a list of "values";
# show how many distinct values each named facet offers.
pp(dict(
    (entry['name'], len(entry['values']))
    for entry in facets.json().get('facets')
))

{'focusArea': 4,
 'programmeDivision': 686,
 'contractType': 5,
 'status': 4,
 'type': 6,
 'crossCuttingPriorities': 42,
 'mainCpv': 2520,
 'placesOfDeliveryOrPerformance': 424,
 'callIdentifier': 15892,
 'frameworkProgramme': 83,
 'programmeDivisionProspect': 31,
 'geographicalZones': 217,
 'procedureType': 20,
 'programmePeriod': 2,
 'mission': 13,
 'destination': 101,
 'typeOfMGAs': 72,
 'missionGroup': 8,
 'destinationGroup': 55,
 'cftSubmissionMethodCode': 4,
 'cftProcurementType': 3,
 'cftPartyLegalEntityId': 111,
 'cftEstimatedOverallContractAmount': 373,
 'cftEstimatedOverallContractCurrency': 19,
 'keywords': 28765}


In [7]:
# Build a flat lookup per facet: display value -> {'raw': rawValue, 'count': count}.
# The payload is decoded once and walked in a single pass, instead of
# re-parsing the JSON and re-scanning the whole facet list for every name.
# NOTE: the lookup is stored as an ad-hoc attribute on the Response object;
# later cells read it as facets.simple[<facet name>].
facets.simple = {}
for facet_entry in facets.json().get('facets'):
    # If a facet name were to appear twice, keep the first entry.
    if facet_entry['name'] in facets.simple:
        continue
    facets.simple[facet_entry['name']] = {
        value['value']: {'raw': value['rawValue'], 'count': value['count']}
        for value in facet_entry['values']
    }

In [8]:
# Inspect one facet: maps each programme label to its raw value and hit count.
facets.simple['frameworkProgramme']

{'Horizon 2020 Framework Programme (H2020 - 2014-2020)': {'raw': '31045243',
  'count': 98878},
 'Horizon Europe (HORIZON)': {'raw': '43108390', 'count': 58822},
 'EU External Action (RELEX)': {'raw': '111111', 'count': 6109},
 'Connecting Europe Facility (CEF)': {'raw': '43251567', 'count': 5222},
 'Erasmus+ (ERASMUS+)': {'raw': '43353764', 'count': 4293},
 'European Defence Fund (EDF)': {'raw': '44181033', 'count': 3937},
 'Digital Europe Programme (DIGITAL)': {'raw': '43152860', 'count': 3896},
 'HORIZON': {'raw': 'HORIZON', 'count': 3752},
 'Programme for the Environment and Climate Action (LIFE)': {'raw': '43252405',
  'count': 3392},
 'Creative Europe Programme (CREA)': {'raw': '43251814', 'count': 2562},
 'EU4Health Programme (EU4H)': {'raw': '43332642', 'count': 1591},
 'Programme for the Competitiveness of Enterprises and small and medium-sized enterprises (COSME - 2014-2020)': {'raw': '31059643',
  'count': 1547},
 'Rights, Equality and Citizenship Programme (REC - 2014-2020)

In [9]:
# Collect the search hits of every framework programme into
# results[<programme label>] (a list of hit dicts), either from the combined
# cache file or by paging through the search API. Each downloaded page is
# also written to <data>/<raw programme value>/search/search_page<N>.json.
load_from_disk=False
allresults_path = dirs['data']+'search_allresults.json'
if load_from_disk and os.path.isfile(allresults_path):
    print("Loading from disk")
    with open(allresults_path, 'rb') as f:
        results=json.load(f)
else:
    # One client (and thus one HTTP session) serves all programmes.
    search=ftpreq('https://api.tech.ec.europa.eu/search-api/prod/rest/search')

    results = {}
    # For a quick trial run, iterate over a short list of programme labels instead.
    for programme in facets.simple['frameworkProgramme'].keys():
        print(programme)
        raw = facets.simple['frameworkProgramme'][programme]['raw']

        # makedirs also creates the intermediate <raw> folder.
        search_dir = os.path.join(dirs['data'], str(raw), 'search', '')
        os.makedirs(search_dir, exist_ok=True)

        q={"bool": {"must": [
            {"terms": {"type": ["0", "1", "2", "8"]}},
            {"terms": {"frameworkProgramme": [raw]}},
        ]}}

        result = []
        p = 0
        remaining = None  # unknown until the first page reports totalResults
        while remaining is None or remaining > 0:
            p+=1
            resp = search.post(q, page=p)
            # Surface HTTP errors here rather than as a KeyError further down.
            resp.raise_for_status()
            data = resp.json()
            result.extend(data['results'])
            with open(search_dir+'search_page'+str(p)+'.json', 'w') as f:
                json.dump(data, f)
            if remaining is None:
                remaining = data['totalResults']
            remaining = remaining-data['pageSize']
            # An empty page means nothing further will come; stop rather than
            # keep requesting pages until the counter runs out.
            if not data['results']:
                break

        results[programme]=result

    with open(allresults_path, 'w') as f:
        json.dump(results, f)

Horizon 2020 Framework Programme (H2020 - 2014-2020)
Horizon Europe (HORIZON)
EU External Action (RELEX)
Connecting Europe Facility (CEF)
Erasmus+ (ERASMUS+)
European Defence Fund (EDF)
Digital Europe Programme (DIGITAL)
HORIZON
Programme for the Environment and Climate Action (LIFE)
Creative Europe Programme (CREA)
EU4Health Programme (EU4H)
Programme for the Competitiveness of Enterprises and small and medium-sized enterprises (COSME - 2014-2020)
Rights, Equality and Citizenship Programme (REC - 2014-2020)
Promotion of Agricultural Products (AGRIP)
Citizens, Equality, Rights and Values Programme (CERV)
Promotion of Agricultural Products (AGRIP - 2014-2020)
Single Market Programme (SMP)
European Defence Industrial Development Programme (EDIDP - 2014-2020)
Internal Security Fund Police (ISFP - 2014-2020)
3rd Health Programme (3HP - 2014-2020)
Justice Programme (JUST - 2014-2020)
Hercule III (HERC - 2014-2020)
Europe Direct (ED)
Euratom Research and Training Programme (EURATOM)
Union An

In [10]:
# Restrict to Horizon Europe hits whose identifier starts with HORIZON-CL4-
# and leave out cascade entries (type '8') for now; then tally the remaining types.
test = [
    hit
    for hit in results['Horizon Europe (HORIZON)']
    if hit['metadata']['identifier'][0].startswith('HORIZON-CL4-')
    and hit['metadata']['type'][0] != '8'
]
Counter(hit['metadata']['type'][0] for hit in test)

Counter({'1': 315})

In [11]:
# Identifiers occurring more than once among the selected hits (empty list = no duplicates).
[
    ident
    for ident, occurrences in Counter(hit['metadata']['identifier'][0] for hit in test).items()
    if occurrences != 1
]

[]

Note: we changed the sort order from relevance to identifier, because sorting by relevance apparently did not produce a stable order across pages and therefore led to duplicates and omissions. Also note that the callIdentifiers returned by the facet search contain many non-call identifiers, because the database contains deprecated entries (a cursory glance identifies these as type=11, the projects, which shouldn't be in here).

Now, does our data cover all callIdentifiers?

In [12]:
# Cross-check the CL4 call identifiers listed by the facet endpoint against
# those referenced by the selected search hits.
CL4_calls_facets = list({
    name for name in facets.simple['callIdentifier'] if name.startswith('HORIZON-CL4-')
})
CL4_calls_search = list({hit['metadata']['callIdentifier'][0] for hit in test})

# 1) Is every call referenced by a search hit also known to the facets?
overlap = [call in CL4_calls_facets for call in CL4_calls_search]
print(len(overlap))
print(all(overlap))

# 2) Is every facet identifier ending in "<letter>-<two digits>" covered by the search?
CL4_calls_facets_pruned = list({
    call for call in CL4_calls_facets if re.search(r'[A-Z]-[0-9]{2}$', call)
})
overlap = [call in CL4_calls_search for call in CL4_calls_facets_pruned]
print(len(overlap))
print(all(overlap))

39
True
31
True


All good. In theory, we are now ready to use the returned list of searched topics to start parsing their details json.

Discussion point: do we want to source call identifiers from the facetsearch api? I'm leaning towards no, because of the pollution by type=11 entries listing topics as parent call identifier. Still, at least facets is stable, while theoretically search may encounter changes as we are running the search. For me, that's a risk I'm willing to take.

In [13]:
def properly_get_topicDetails_json(url):
    """Fetch a topic-details JSON, repairing the URL's casing first.

    Everything after the first '/topicDetails/' marker is lower-cased before
    the request is made. A URL without the marker is requested unchanged
    (previously this raised ValueError, as did a URL containing the marker
    more than once).

    Args:
        url: topic-details URL as returned in a search hit.

    Returns:
        The response of ``ftpreq(url).get()``.
    """
    marker = '/topicDetails/'
    front, found, back = url.partition(marker)
    if found:
        url = front + marker + back.lower()
    return ftpreq(url).get()

Seems to work. Now to save all these jsons to disk and process them.

In [14]:
# Download (or reload) the details JSON of every non-cascade search hit.
#   details[programme][identifier] -> decoded details JSON
#   failed[programme]              -> identifiers whose download or save failed
#   nojson[programme]              -> identifiers whose URL does not end in '.json'
load_from_disk=False
details = {}
failed = {}
nojson = {}
for programme in facets.simple['frameworkProgramme'].keys():
    print(programme+'                                   ')
    details[programme]={}
    failed[programme]=[]
    nojson[programme]=[]
    raw = facets.simple['frameworkProgramme'][programme]['raw']
    programme_dir = os.path.join(dirs['data'], str(raw), '')
    # Do not rely on an earlier cell having created the folder.
    os.makedirs(programme_dir, exist_ok=True)
    total = len(results[programme])
    for i, r in enumerate(results[programme], start=1):
        # ignore cascade (type '8') for now, and hits without exactly one identifier
        if r['metadata']['type']==['8'] or len(r['metadata']['identifier'])!=1:
            continue
        # Resolve the identifier before branching, so the no-JSON bucket
        # records this hit rather than a leftover value from an earlier one.
        identifier=r['metadata']['identifier'][0]
        if not r['url'].endswith('.json'):
            nojson[programme].append(identifier)
            continue
        print(str(i)+' / '+str(total)+'  '+identifier, end='                                   \r') #ugly print to monitor progress
        json_path = os.path.join(programme_dir, identifier+'.json')
        if load_from_disk and os.path.isfile(json_path):
            with open(json_path, 'r') as f:
                details[programme][identifier]=json.load(f)
            continue
        try:
            resp=properly_get_topicDetails_json(r['url'])
            if resp.status_code==200:
                data=resp.json()
                with open(json_path, 'w') as f:
                    json.dump(data, f)
                details[programme][identifier]=data
            else:
                failed[programme].append(identifier)
            time.sleep(0.1) #brief nap?
        except Exception:
            # Best effort: note the failure and carry on. KeyboardInterrupt is
            # not an Exception, so the run can still be interrupted.
            failed[programme].append(identifier)

Horizon 2020 Framework Programme (H2020 - 2014-2020)                                   
Horizon Europe (HORIZON)                                                              
EU External Action (RELEX)                                                                        
Connecting Europe Facility (CEF)                                   
Erasmus+ (ERASMUS+)                                                                   
European Defence Fund (EDF)                                                                     
Digital Europe Programme (DIGITAL)                                                
HORIZON                                                                                         
Programme for the Environment and Climate Action (LIFE)                                   
Creative Europe Programme (CREA)                                                  
EU4Health Programme (EU4H)                                                          
Programme for the Competitiveness o

In [15]:
# Per-programme tallies, leaving out programmes with nothing to report:
# first the failed downloads, then the hits whose URL is not a .json file.
pp({name: len(ids) for name, ids in failed.items() if ids})
pp({name: len(ids) for name, ids in nojson.items() if ids})

{'Horizon Europe (HORIZON)': 5,
 'Programme for the Environment and Climate Action (LIFE)': 1,
 'Europe Direct (ED)': 1}
{'EU External Action (RELEX)': 2600}


In [16]:
# Number of Horizon Europe identifiers for which a details JSON was collected.
len(details['Horizon Europe (HORIZON)'])

2254

In [17]:
# Number of Horizon Europe search hits that are not cascade entries (type '8').
sum(1 for hit in results['Horizon Europe (HORIZON)'] if hit['metadata']['type'] != ['8'])

2259