In [2]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import numpy as np

---
# V1

In [5]:
# Tags of the direct children that are copied into the output record.
_WANTED_TAGS = frozenset(
    ["id", "title", "teaser", "body", "keywords", "contentCreationDate", "language"]
)


def _collect_text(el, include_tail=False):
    """Return the whitespace-stripped text found inside ``el``, joined by spaces.

    Args:
        el: An ``xml.etree.ElementTree.Element``.
        include_tail: When True, ``el.tail`` (the text that follows ``el``'s
            closing tag inside its parent) is appended as well. This is what we
            want for nested elements, but not for the element we start from,
            whose tail belongs to its parent.

    Returns:
        A single string; empty when the element holds no non-blank text.
    """
    parts = []
    if el.text and el.text.strip():
        parts.append(el.text.strip())
    for sub in el:
        fragment = _collect_text(sub, include_tail=True)
        # Skip empty fragments (e.g. an empty <br/>) so they do not produce
        # doubled spaces in the joined result.
        if fragment:
            parts.append(fragment)
    if include_tail and el.tail and el.tail.strip():
        parts.append(el.tail.strip())
    return ' '.join(parts)


def extract_elements_to_dict(element):
    """Flatten selected direct children of ``element`` into a ``{tag: text}`` dict.

    Only direct children whose tag is one of id, title, teaser, body, keywords,
    contentCreationDate or language are kept; every other child (including
    ``relations``) is ignored. For a child with nested markup, the text of all
    its descendants is concatenated with single spaces.

    Args:
        element: The parent ``xml.etree.ElementTree.Element`` (typically the
            document root).

    Returns:
        dict mapping each retained tag to its stripped text ('' when the child
        has no text). If a tag occurs more than once, the last occurrence wins.
    """
    data = {}
    for child in element:
        if child.tag in _WANTED_TAGS:
            # include_tail is left False here: child.tail is text of the parent
            # that merely follows the child, not part of the field itself.
            data[child.tag] = _collect_text(child)
    return data

In [6]:
# Root folder containing one sub-folder per domain, each holding the XML files.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook elsewhere.
root_path = 'C:/Users/barsro/Downloads/Data_projets_EU/'
dirs = [name for name in os.listdir(root_path) if os.path.isdir(os.path.join(root_path, name))]

# One dict per successfully parsed file; turned into a DataFrame in a later cell.
rows = []
for domain in dirs:  # `domain` rather than `dir`, which would shadow the builtin
    print(domain)
    folder_path = os.path.join(root_path, domain)
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    for file in tqdm(files):
        file_path = os.path.join(folder_path, file)
        try:
            root = ET.parse(file_path).getroot()
        except (ET.ParseError, OSError):
            # Best effort: report files that cannot be read or are not
            # well-formed XML (e.g. a stray archive) and carry on. Only these
            # expected failures are caught, so Ctrl-C and genuine bugs in the
            # extraction code are no longer silently swallowed.
            print(file_path)
            continue
        result = extract_elements_to_dict(root)
        # The folder name is used as the record's domain label.
        result["Domain"] = domain
        rows.append(result)

Digital_Economy


100%|██████████| 4659/4659 [00:13<00:00, 347.22it/s] 


Energy


100%|██████████| 3323/3323 [00:08<00:00, 394.04it/s] 


Environment


100%|██████████| 5320/5320 [00:15<00:00, 345.77it/s]


Food_Agriculture


100%|██████████| 2816/2816 [00:07<00:00, 357.60it/s]


Fundammental


100%|██████████| 3389/3389 [00:07<00:00, 431.59it/s] 


Health


100%|██████████| 6852/6852 [00:13<00:00, 513.52it/s] 


Industrie


100%|██████████| 3693/3693 [00:10<00:00, 358.11it/s]


Society


100%|██████████| 5080/5080 [00:12<00:00, 397.80it/s] 


Space


100%|██████████| 960/960 [00:02<00:00, 341.68it/s]


Transport


100%|██████████| 1900/1900 [00:05<00:00, 371.35it/s]

C:/Users/barsro/Downloads/Data_projets_EU/Transport\xml.zip





In [7]:
# Assemble the parsed records into a table and derive a `year` column from the
# creation timestamp in a single chained expression.
df = pd.DataFrame(rows).assign(
    year=lambda frame: pd.to_datetime(frame['contentCreationDate']).dt.year
)
df.head()

Unnamed: 0,language,id,title,teaser,body,keywords,contentCreationDate,Domain,year
0,en,182009-smart-grid-redirects-computing-power-to...,Smart grid redirects computing power to heat h...,EU-funded researchers have developed a decentr...,The EU-funded project can benefit end users by...,"EeHPC, high performance computing, HPC, Qarnot...",2016-05-05 15:53:11,Digital_Economy,2016
1,en,182988-spanish-sme-aims-for-top-spot-in-virtua...,Spanish SME aims for top spot in virtual fitti...,Online clothing retailers will soon be able to...,With eCommerce sales set to increase by 18.4 %...,"Smart textiles, safety shoes, biometric sensors",2016-07-12 17:56:55,Digital_Economy,2016
2,en,188522-robotic-solutions-to-give-dementia-pati...,Robotic solutions to give dementia patients be...,The EU-funded MARIO project is developing a co...,"In the absence of a cure, solutions such as MA...","MARIO, Kompaï-2 robot, dementia, DOMEO",2016-09-09 15:16:41,Digital_Economy,2016
3,en,188757-wisers-free-tools-will-help-large-and-s...,WISER’s free tools will help large and small e...,Companies and governments are bombarded by bil...,The vast majority of SMEs have to sacrifice li...,"WISER, cyber security, threat, trojan horse, S...",2016-10-28 13:13:54,Digital_Economy,2016
4,en,190722-pcp-and-ppi-a-public-boost-to-societal-...,PCP and PPI: a public boost to societal challe...,Rising public needs and interests require not ...,"In the EU, two mechanisms are increasingly use...","PCP, PPI, R&D, innovation, intelligent transpo...",2016-12-09 16:28:14,Digital_Economy,2016


In [8]:
# Persist the table to `cordis_v1.csv` in the current working directory.
# `index` is left at its pandas default, so the row index is written as an
# extra, unnamed first column.
df.to_csv('./cordis_v1.csv')

---
# V2 Semantics

In [4]:
# Sentence-embedding model shared by the rest of this section.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  

# Reference phrases describing the climate topic; a text's climate score is its
# highest cosine similarity to any of these phrases (see compute_climate_score).
climate_keywords = [
    "climate change", "global warming", "greenhouse gas", "carbon emissions", "decarbonization",
    "carbon footprint", "co2", "methane", "sea level rise", "glacier melt", "drought", "flood",
    "storm", "heatwave", "resilience", "adaptation", "mitigation", "climate policy", "renewable energy",
    "energy efficiency", "sustainable agriculture", "climate finance", "carbon neutral", "net zero",
    "emissions reduction", "deforestation", "reforestation", "afforestation", "biodiversity loss",
    "ecosystem collapse", "climate migration", "climate risk", "climate action", "nature-based solutions",
    "climate scenario", "climate model", "extreme weather", "carbon pricing", "carbon tax", "GHG inventory",
    "permafrost", "ocean acidification", "desertification", "weather variability", "climate adaptation"
]

# Encode the keywords once, up front, so they are not re-embedded for every
# text that gets scored.
keyword_embeddings = model.encode(climate_keywords, convert_to_tensor=True)

def compute_climate_score(text):
    """Score how closely ``text`` relates to the climate keyword list.

    The text is embedded with the module-level ``model`` and compared against
    the precomputed ``keyword_embeddings``; the score is the highest cosine
    similarity to any single keyword.

    Args:
        text: The text to score. Anything that is not a string, or a string
            that is empty/blank, is scored 0.0 without calling the model.

    Returns:
        float: The maximum cosine similarity between ``text`` and the keywords.
    """
    if not isinstance(text, str) or len(text.strip()) == 0:
        return 0.0
    text_embedding = model.encode(text, convert_to_tensor=True)
    cosine_scores = util.cos_sim(text_embedding, keyword_embeddings)
    # Reduce on the tensor itself: `.numpy()` raises for tensors that live on
    # a GPU, whereas `.max().item()` works on any device and gives the same
    # value on CPU.
    return float(cosine_scores.max().item())

In [5]:
# Hand-picked project descriptions used as a quick sanity check of
# compute_climate_score. The variable is named `sample_texts` rather than
# `list` so the builtin `list` is not shadowed for the rest of the session.
# NOTE(review): where a literal spanned several lines, the break is written as
# an explicit "\n" escape so the cell parses; confirm this matches the
# character used in the original notebook source.
sample_texts = [
    "DEVELOPMENT OF APPROPRIATE DIGITAL IMAGE PROCESSING SOFTARE FOR ANALYSIS OF IMAGES. DEVELOPMENT OF SPECIALIZED SOFTWARE FOR THERMAL IMAGES. DURING SMALL SCALE TESTS IN WHICH TWO-PHASE JETS OR CLOUDS ARE RELEASED FLOW VISUALISATIONS WILL BE PERFORMED. THE VKI WILL GIVE ADVICES TO WHICH TYPE OF VISUALISATIONS SHOULD BE PERFORMED. EXAMPLES OF METHODS ARE : BULK FLOW ILLUMINATION, LONGITUDINAL AND TRANSVERSAL LASER SHEETS, HIGH SPEED VIDEO RECORDING, PHOTOGRAPHY, MOVIE. THE MOST SUITABLE METHODS WILL BE SELECTED AND COMPUTER PROGRAMMES WILL BE PREPARED AT VKI IN ORDER TO EXTRACT INFORMATION FROM THESE VISUAL RECORDS. THE DATA TO BE OBTAINED ARE OF DIFFERENT NATURE : GEOMETRICAL CHARACTERISTICS SUCH AS CROSS AREA OR LONGITUDINAL AREA OF THE CLOUD WILL BE QUANTIFIED. THE CONCENTRATION OF DROPLET WITHIN THE JET OR CLOUD WILL BE EVALUATED. THESE TASKS REQUIRE SPECIAL SOFTWARE THAT WILL ALLOW A CORRECT GEOMETRICAL QUANTIFICATION. THIS WILL INCLUDE METHODS FOR CORRECTING FOR PERSPECTIVE AND FOR UNEVEN ILLUMINATION. FINALLY, ATTEMPTS WILL BE MADE AT MEASURING THE SIZE OF DROPLETS. THE SUCCESS OF THIS METHOD WILL DEPEND UPON THE NATURE OF THE VISUAL RECORDS WHICH WILL BE AVAILABLE. BECAUSE OF THE LIMITATIONS IN THE RESOLUTION OF THE DIGITAL IMAGE PROCESSING SYSTEM USED, AN OPTICAL MAGNIFICATION OF THE VISUAL RECORDS WILL BE MADE AND THIS MAGNIFICATION WILL BE LIMITED BY THE NATURE OF THE ORIGINAL IMAGE. IT IS ALSO ANTICIPATED THAT VISUAL RECORDS ARE MADE OF THE LARGE SCALE TESTS. THE VAN KARMAN INSTITUTE WILL INVESTIGATE THE POSSIBILITIES TO EXTRACT INFORMATION FROM THESE TRIALS. PRIOR TO THE TESTS, METHODS USED FOR THESE VISUALISATIONS WILL BE DISCUSSED. FINALLY IN THE TESTS P6, P10 AND P11, AN INFRARED CAMERA SHOULD BE USED TO OBTAIN THERMAL IMAGES OF THE TWO PHASE CLOUD OR JET. AN ANALYSIS PERFORMED WITH THE DIGITAL IMAGE PROCESSING SHOULD ALLOW FOR CORRECTION OF EMISSIVITY AND EMISSION ANGLE OF THE RADIATION. \n"
    "IT IS ANTICIPATED THAT INFORMATION ON THE HEAT TRANSFER WITHIN THE TWO-PHASE FLOW SHOULD BE DETERMINED.",
    "TO DEVELOP AN ECONOMIC MODEL OF A GEOTHERMAL HOT DRY ROCK DEVELOPMENT. A cost model of hot dry rock (HDR) geothermal systems is being developed under contract to the United Kingdom Department of Energy and the Commission of the European Communities. During the first phase of this study, the basic structure of the model has been defined and modelling approaches which are appropriate to the level of knowledge of different areas of this system have been identified. The development of the cost model is now in progress and the main features of the modelling approach have been ascertained. An interim model has also been brought together in order to obtain some indications of sensitivities and so assist with the model process.Although the results are not definite at this stage, they do indicate important features of the technoeconomic aspects of the system. Thus the size and the spatial arrangement of the reservoir emerge as important issues, together with thermal gradient and depth.The ultimate objective of this study is to develop a full engineering cost model of electricity producing HDR systems which includes all surface and subsurface systems and components.Once developed the aims of the model are to:estimate capital and operating costs of HDR systems and their development over time;investigate the sensitivity of unit electricity costs to changes in parameters defining the HDR reservoir and also those defining the size and design of the power station;assist in the assessment of European HDR resources, as a function of cost and location.The cost of creating a reservoir, by drilling and stimulation, is likely to form a significant fraction of the total cost of constructing any proposed hot dry rock (HDR) geothermal energy system. \n"
    "The cost of these operations can be estimated using a cost model that has been developed to the assist the management of HDR geothermal energy research and development programmes.The model, which is formulated as a spreadsheet programme written on SMART software for an Olivetti M24 personal computer, can either be run as an integrated package or as individual components. Although preliminary versions of the cost model have been completed, development will continue.The final version of the drilling and stimulation cost model will be able to account for the affect of varying depth, geothermal gradient, geology (in terms of subsequent rates of penetration and bit life), borehole breakout, thickness of sedimentary cover, well design (including configuration angle of deviation and casing programme) and reservoir characteristics (such as volume, shape and number of stimulated zones).Other factors that will be taken into consideration include the type of logging and coring programmes selected and the general drilling market conditions which affect the cost of drilling supplies and services.DESCRIPTION: THIS STUDY BUILDS ON PREVIOUS WORK BY THE CONTRACTOR IN MODELLING THE VARIOUS ASPECTS OF AN HDR OPERATION, AND WILL BE DESIGNED TO PROVIDE RESULTS APPLICABLE IN ALL THE EUROPEAN MEMBER STATES. IT WILL ALLOW ESTIMATION OF THE COSTS OF ENERGY DERIVED FROM AN HDR RESERVOIR TOGETHER WITH AN ANALYSIS OF THE SENSITIVITY OF THOSE COSTS TO VARIOUS ASPECTS OF RESERVOIR CREATION, DEVELOPMENT AND OPERATION. THE MODEL WILL RUN ON A PC-COMPATIBLE MICROCOMPUTER.",
    "The newly-developed code, CYCLOP, allows efficient calculation for optimisation studies. The CYCLOP project has brought together many areas of theoretical, experimental and numerical study. The main achievements highlighted by project partners are: - New modules for turbulence, agglomeration of particles and twophase flow treatment - New algorithms adapted to collocated approach and curvilinear grids - Experimental data on cyclone flow-field (by LDV) and performances in cold and hot conditions - and also some results on fluidized bed operating characteristics - A software package allowing calculation of cyclones and similar separators in 2D and 3D, with various graphic facilities will validated in 2D on different experimental data - Better overall knowledge of physics in cyclones.",
    "AN IMPORTANT TASK IN THE DECOMMISSIONNING OF NUCLEAR INSTALLATIONS    IS THE PROOF OF THE VERY LOW RADIOACTIVITY LEVELS, ALLOWING FOR FREE RELEASE    OF THE GENERATED WASTE. THIS PROOF INVOLVESLONG MEASURING TIMES ON A GREAT      NUMBER  OF REPRESENTATIVE SAMPLES OUT OF IMPORTANT MASSES OF METAL STRUCTURES   AND CONCRETE, AND CONSIDERABLE RADIATION EXPOSURE OF THE MEASURING STAFF.       THE MAIN OBJECTIVE OF THE PRESENT RESEARCH IS THE DEVELOPMENT, CONSTRUCTION AND LARGE-SCALE TESTING OF A PROTOTYPE FOR AN AUTOMATIC MEASURING SYSTEM, APPROPRIATE TO TREAT IMPORTANT MASSES OF WASTE, WITH LOW-LEVEL ACTIVITIES AND DIFFERENT  NUCLIDE COMPOSITIONS AND SHAPES. IT IS EXPECTED TO MINIMISE HUMAN ERRORS BY AUTOMATIC OPERATION. THE MEASURING SYSTEM WILL BE DESIGNED AS A MOBILE UNIT, WITH A MODULAR STRUCTURE ALLOWING FOR A GENERAL PURPOSE APPLICATION TO LWR TYPICAL WASTE ARISING, AT DIFFERENT DECOMMISSIONING SITES. THE PRACTICAL TESTING WILL BE DONE A TOTAL MASS OF 1000 MG IN THE FRAMEWORK OF THE KKN DECOMMISSIONING. THE STUDY WILL BE COMPLETED BY A CONCLUSIVE ASSESSMENT OF THE MERITS OF THE DEVELOPED MEASURING SYSTEM FOR LARGE-SCALE OPERATION. The dismantling of nuclear facilities requires proof that the radioactivity levels of materials to be released from restricted areas remain below low limiting values. Up till now, decisive measurements have been almost impossible on parts and material with complex geometries. In order to keep measurement costs low, a device has been developed which uses a fast automatic procedure to examine large amounts of dismantled and potentially contaminating components. The device measures the gross gamma-radiation which has a higher penetrating capacity into the material than beta radiation. The measuring tunnel is 1.2 m broad and 1.2 m high. Parts to be measured can be up to 4 m long and weigh 1 tonne. \n"
    "Analysis of measurements has shown that the specified minimal detectable activity level of 1000 Bq cobalt-60 can be achieved, even with steel shielding of 2 cm thickness.B.1.CONCEPTUAL STUDIES FOR THE DEFINITION OF THE REQUIREMENTS FOR A MEASURING SYSTEM,INCLUDING ASSESSMENT OF EXISTING LOW-LEVEL ACTIVITY MEASURING TECHNIQUES, DEFINITION OF THE TYPES OF WASTE TO BE TREATED, AND HEALTH PHYSICS PROTECTION CONSIDERATIONS. B.2. PREPARATION OF A DESIGN OF THE COMPLETE MEASURING SYSTEM,INCLUDING DETECTORS,CONTROL AND TRANSPORT SYSTEM, GENERAL PURPOSE SOFTWARE FOR MEASURING DATA PROCESSING, FOLLOWED BY A CALL FOR TENDERS AND THE CHOICE OF MANUFACTURES. B.3. PREPARATION OF A LICENSING DOSSIER FOR EXPERIMENTAL OPERATION OF THE MEASURING SYSTEM PROGRAMME. IN THE FRAMEWORK OF THE DECOMMISSIONING OF KKN. B.4. EXECUTION OF A LARGE-SCALE TEST PROGRAMME. B.5.CONCLUSIVE ASSESSMENT OF THE APPROPRIATENESS OF THE DEVELOPED MEASURING SYSTEM, CONSIDERING TECHNICAL AND ECONOMIC ASPECTS.",
    "Work has been directed towards the provision of key components of real time computerised support systems, embodied in software packages to be made generally available for use in European Community countries. These software packages include numerical models suitable for the simulation of atmospheric transport, dispersal and deposition of a release of radioactive material over local (out to a few tens of kilometres at most), mesoscale (out to 100 to 200 kilometres) and long range distances (over the whole of Europe). To aid in accidents where there are large uncertainties about the source term, packages have been developed addressing the deduction of estimates of the quantities of radionuclides released by combining measurements and model simulations and optimising the agreement between them. Finally, as a tool to aid in the assessment of doses and the efficacy of possible countermeasures, a special package has been produced for dose assessment taking into account different exposure pathways.",
]

# Print one climate score per sample text, in list order.
for elem in sample_texts:
    print(compute_climate_score(elem))

0.30510270595550537
0.4423593282699585
0.4547749161720276
0.45463743805885315
0.3841760754585266
