# Visualization of Datasets

This notebook provides visualizations for the Horizon Europe projects dataset (2021-2027), containing information about EU-funded research projects, their deliverables, publications, and summaries.

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

## 1. Data Loading

In [5]:
# Load datasets
try:
    # Project-level tables
    euroSciVoc_df = pd.read_excel('dataset/projects/euroSciVoc.xlsx')
    legalBasis_df = pd.read_excel('dataset/projects/legalBasis.xlsx')
    organization_df = pd.read_excel('dataset/projects/organization.xlsx')
    project_df = pd.read_excel('dataset/projects/project.xlsx')
    topics_df = pd.read_excel('dataset/projects/topics.xlsx')
    webLink_df = pd.read_excel('dataset/projects/webLink.xlsx')

    # Tables stored directly under dataset/
    deliverables_df = pd.read_excel('dataset/deliverables.xlsx')
    publications_df = pd.read_excel('dataset/publications.xlsx')
    summaries_df = pd.read_excel('dataset/summaries.xlsx')

    # Report the (rows, columns) shape of each table. This runs only after every
    # read above has succeeded, so a missing file yields no shape lines at all.
    loaded_tables = (
        ("EuroSciVoc", euroSciVoc_df),
        ("Legal Basis", legalBasis_df),
        ("Organizations", organization_df),
        ("Project", project_df),
        ("Topics", topics_df),
        ("Web Link", webLink_df),
        ("Deliverables", deliverables_df),
        ("Publications", publications_df),
        ("Summaries", summaries_df),
    )
    print("\n".join(f"{label} dataset shape: {table.shape}" for label, table in loaded_tables))
except FileNotFoundError as missing_file:
    # Print a hint instead of a traceback; frames that were not loaded stay undefined.
    print(f"File not found: {missing_file}")
    print("Please adjust the file paths to match your dataset locations.")

EuroSciVoc dataset shape: (38789, 5)
Legal Basis dataset shape: (20512, 4)
Organizations dataset shape: (100249, 25)
Project dataset shape: (15341, 20)
Topics dataset shape: (15341, 3)
Web Link dataset shape: (21440, 9)
Deliverables dataset shape: (20815, 10)
Publications dataset shape: (21310, 16)
Summaries dataset shape: (3521, 7)


## 2. Preprocessing 

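Some columns were deleted manually in the Excel files. The commented-out cells below record the checks used to confirm that those columns contained no data before they were removed.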

In [None]:
# This cell is intentionally commented out; it records the check that preceded
# a manual column deletion.
# NOTE(review): presumably `nature` has already been removed from project.xlsx,
# so re-enabling these lines would raise a KeyError — confirm before uncommenting.

# Check whether `project_df`'s column `nature` has any non-NaN values
# project_df['nature'].notnull().sum() # 0 indicates that the column is empty

# Delete the column `nature` from `project_df`
# project_df.drop('nature', axis = 1, inplace = True)

In [None]:
# This cell is intentionally commented out; it records the emptiness checks for
# four columns. The saved output below (four zeros, one per check) shows that
# none of these columns contained a non-NaN value when the cell was last run.

# Check whether `webLink_df`'s columns `status` and `archivedDate` have any non-NaN values
# print(webLink_df['status'].notnull().sum())
# print(webLink_df['archivedDate'].notnull().sum())

# Check whether `organization_df`'s column `active` has any non-NaN values
# print(organization_df['active'].notnull().sum())

# Check whether `euroSciVoc_df`'s column `euroSciVocDescription` has any non-NaN values
# print(euroSciVoc_df['euroSciVocDescription'].notnull().sum())

0
0
0
0


In [None]:
# This cell is intentionally commented out; it shows the in-notebook equivalent of
# dropping the columns checked above.
# NOTE(review): presumably these columns were instead removed by hand in the
# Excel files, so the drops would fail if re-enabled — confirm before uncommenting.

# Delete the empty columns from `webLink_df`, `organization_df`, and `euroSciVoc_df`
# webLink_df.drop(['status', 'archivedDate'], axis = 1, inplace = True)
# organization_df.drop('active', axis = 1, inplace = True)
# euroSciVoc_df.drop('euroSciVocDescription', axis = 1, inplace = True)


In [13]:
# Flag the publications whose `publishedPages` field is filled in
has_published_pages = publications_df['publishedPages'].notna()

# Number of flagged rows (7 non-NaN values in the saved run)
print(has_published_pages.sum())

# Keep only the flagged rows for inspection.
# NOTE(review): in the saved output some of these values are not page ranges
# (e.g. 'Zenodo') — worth checking before relying on this column.
publications_df_pubPages = publications_df.loc[has_published_pages]
publications_df_pubPages

7


Unnamed: 0,id,title,isPublishedAs,authors,journalTitle,journalNumber,publishedYear,publishedPages,issn,isbn,doi,projectID,projectAcronym,collection,contentUpdateDate,rcn
7304,101058527_6903_PUBLIHORIZON,Skills4EOSC Draft Open Science Career Profiles...,Other,"Whyte, Angus; Green, Dominique; Avanço, Karla;...",,,2023,SKILLS4EOSC,,,10.5281/zenodo.7686263,101058527,Skills4EOSC,Project publication,2024-02-27 16:42:00,1030499
7684,101046203_2970_PUBLIHORIZON,Building a FAIR image data ecosystem for micro...,Other,"Kemmer, Isabel; Keppler, Antje; Serrano-Solano...",Histochemestry and Cell Biology,160,2023,199-209,,,10.5281/zenodo.7788899,101046203,BY-COVID,Project publication,2023-08-22 11:50:08,970580
13770,101056884_13976_PUBLIHORIZON,Exploring the Temperature Dependent Magnetic P...,Conference proceedings,"P. Corte-León, I. Skorvanek, F. Andrejka, V. Z...",IARIA Congress 2023 : The 2023 IARIA Annual Co...,9 (2),2024,26-30,,978-1-68558-089-6,,101056884,INFINITE,Project publication,2024-05-07 11:16:19,1035181
16767,101039206_5424_PUBLIHORIZON,Cortex,Peer reviewed articles,"Matilde Conti, Alice Teghil, Antonella Di Vita...",Cortex,163,2023,80-91,0010-9452,,10.1016/j.cortex.2023.03.004,101039206,ATENA,Project publication,2023-09-20 10:06:06,972559
21043,101056939_21387_PUBLIHORIZON,Aligning climate scenarios to emissions invent...,Peer reviewed articles,"Matthew J. Gidden, Thomas Gasser, Giacomo Gras...",Nature,624,2023,102-108,1476-4687,,10.1038/s41586-023-06724-y,101056939,RESCUE,Project publication,2024-05-07 11:40:00,1077788
21049,101056939_21383_PUBLIHORIZON,Brief communication: Surface energy balance di...,Peer reviewed articles,U. Krebs-Kanzow; C. B. Rodehacke; C. B. Rodeha...,"The Cryosphere, Vol 17, Pp 5131-5136 (2023)",17 (12),2023,5131-5136,1994-0424,,10.5194/tc-17-5131-2023,101056939,RESCUE,Project publication,2024-05-07 11:34:01,1069484
21141,101046133_5713_PUBLIHORIZON,"""Preprint: """"Be Sustainable Recommendations"""" ...",Peer reviewed articles,"David, R; Rybina, A; Burel, J; Heriche, J; Aud...",Zenodo,,2023,Zenodo,,,10.5281/zenodo.8247376,101046133,ISIDORe,Project publication,2024-01-24 17:22:57,1030002


In [35]:
# Every loaded dataset, keyed by the name of the variable it is bound to.
# Carrying the name next to the frame replaces a reverse lookup through
# globals(), which depends on the insertion order of the notebook namespace and
# reports an alias (for example a leftover loop variable) whenever another
# global bound earlier refers to the same DataFrame object.
named_datasets = {
    "euroSciVoc_df": euroSciVoc_df,
    "legalBasis_df": legalBasis_df,
    "organization_df": organization_df,
    "project_df": project_df,
    "topics_df": topics_df,
    "webLink_df": webLink_df,
    "deliverables_df": deliverables_df,
    "publications_df": publications_df,
    "summaries_df": summaries_df,
}

# Kept as a plain list (same order as before) because a later cell iterates over `all_df`.
all_df = list(named_datasets.values())

# Report every column that holds exactly one distinct non-NaN value, i.e. a
# column that is constant wherever it is filled in.
print("Columns with unique values: \n")
for dataset_name, frame in named_datasets.items():
    for col in frame.columns:
        column_values = frame[col]
        # nunique() ignores NaN by default, so an entirely empty column
        # (0 distinct values) is not reported here.
        if column_values.nunique() != 1:
            continue
        print(f"Dataset: {dataset_name}")
        print(f"Column: {col}")
        # unique() keeps NaN, so a partially filled column shows e.g. [True nan]
        print(f"Unique values: {column_values.unique()}")
        print(f"Number of NaN values: {column_values.isnull().sum()}")
        print(f"Number of non-NaN values: {column_values.notnull().sum()}")
        print(f"Number of observations: {frame.shape[0]}")
        print('\n')

Columns with unique values: 

Dataset: legalBasis_df
Column: uniqueProgrammePart
Unique values: [True nan]
Number of NaN values: 5171
Number of non-NaN values: 15341
Number of observations: 20512


Dataset: project_df
Column: frameworkProgramme
Unique values: ['HORIZON']
Number of NaN values: 0
Number of non-NaN values: 15341
Number of observations: 15341


Dataset: deliverables_df
Column: collection
Unique values: ['Project deliverable']
Number of NaN values: 0
Number of non-NaN values: 20815
Number of observations: 20815


Dataset: publications_df
Column: collection
Unique values: ['Project publication']
Number of NaN values: 0
Number of non-NaN values: 21310
Number of observations: 21310




In [34]:
# Report every column that holds exactly two distinct non-NaN values (binary columns)
print("Columns with binary outcomes: \n")

# Every loaded dataset, keyed by the name of the variable it is bound to.
# The mapping is built here so the cell does not depend on a list defined in
# another cell, and so the printed name does not come from a reverse lookup
# through globals(), which reports an alias whenever another global bound
# earlier refers to the same DataFrame object.
named_datasets = {
    "euroSciVoc_df": euroSciVoc_df,
    "legalBasis_df": legalBasis_df,
    "organization_df": organization_df,
    "project_df": project_df,
    "topics_df": topics_df,
    "webLink_df": webLink_df,
    "deliverables_df": deliverables_df,
    "publications_df": publications_df,
    "summaries_df": summaries_df,
}

for dataset_name, frame in named_datasets.items():
    for col in frame.columns:
        column_values = frame[col]
        # nunique() ignores NaN by default, so missing values do not count as a
        # third outcome.
        if column_values.nunique() != 2:
            continue
        print(f"Dataset: {dataset_name}")
        print(f"Column: {col}")
        # unique() keeps NaN, so a partially filled column shows e.g. [False True nan]
        print(f"Unique values: {column_values.unique()}")
        print(f"Number of NaN values: {column_values.isnull().sum()}")
        print(f"Number of non-NaN values: {column_values.notnull().sum()}")
        print(f"Number of observations: {frame.shape[0]}")
        print('\n')



Columns with binary outcomes: 

Dataset: organization_df
Column: SME
Unique values: [False True nan]
Number of NaN values: 263
Number of non-NaN values: 99986
Number of observations: 100249


Dataset: organization_df
Column: endOfParticipation
Unique values: [False  True]
Number of NaN values: 0
Number of non-NaN values: 100249
Number of observations: 100249


