# Descriptive Statistics

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations
from datetime import datetime

In [2]:
org_df = pd.read_excel('dataset/projects/organization.xlsx') 
print(f'Loaded {len(org_df)} organization observations.')

proj_df = pd.read_excel('dataset/projects/project.xlsx')
print(f'Loaded {len(proj_df)} projects.')

topic_df = pd.read_excel('dataset/projects/topics.xlsx')    
print(f'Loaded {len(topic_df)} topics.')


Loaded 100249 organization observations.
Loaded 15341 projects.
Loaded 15341 topics.


## 0. Preprocessing

In [None]:
# Group organizations by `projectID`
project_org = org_df.groupby('projectID').apply(lambda x: x.to_dict('records')).to_dict()
# lambda x: x.to_dict('records') converts each df to a list of dictionaries. keys = col, values = row  
print(f'{len(project_org)} unique projects with organizations.')

# Identify collaborations between organizations 
collaborative_proj = [project_id for project_id, orgs in project_org.items() if len(orgs) > 1] 
print(f'{len(collaborative_proj)} projects with multiple organizations.')

# Calculate the distribution of organizations per project
orgs_per_proj_counts = org_df['projectID'].value_counts().value_counts().sort_index()
print('\n Distribution of organizations per project:\n')
for count, projects in orgs_per_proj_counts.items():
    print(f'{count} organizations: {projects} projects')

## 1. Network Pairs

In [8]:
collaborations = []
total_collaborations_pairs = 0 

for project_id in collaborative_proj:
    orgs = project_org[project_id]

    # Generate all possible pairs of organizations
    for org1, org2 in combinations(orgs, 2):
    # combinations() returns all possible pairs between orgs
        collaborations.append({
            'projectID': project_id,
            'org1': org1.get('organizationID'),
            'org1_name': org1.get('name'),
            'org1_country': org1.get('country'),
            'org2': org2.get('organizationID'),
            'org2_name': org2.get('name'),
            'org2_country': org2.get('country')
        })
        total_collaborations_pairs += 1 

print(f'Generated {total_collaborations_pairs} collaboration pairs.')
collaborations[:2]

Generated 914397 collaboration pairs.


[{'projectID': 101039060,
  'org1': None,
  'org1_name': 'ARANZADI ZIENTZI ELKARTEA',
  'org1_country': 'ES',
  'org2': None,
  'org2_name': 'UNIVERSITAT DE BARCELONA',
  'org2_country': 'ES'},
 {'projectID': 101039060,
  'org1': None,
  'org1_name': 'ARANZADI ZIENTZI ELKARTEA',
  'org1_country': 'ES',
  'org2': None,
  'org2_name': 'UNIVERSIDAD DEL PAIS VASCO/ EUSKAL HERRIKO UNIBERTSITATEA',
  'org2_country': 'ES'}]

## 2. Calculate degrees( # ! parteners)

In [None]:
org_partners = defaultdict(set) # defaultdict is a dictionary that initializes missing keys with a default value

# Create a dictionary of organizations and their partners
for collab in collaborations:
    org_partners[collab['org1']].add(collab['org2'])
    org_partners[collab['org2']].add(collab['org1'])

# Convert sets to counts 
org_degrees = {org: len(partners) for org, partners in org_partners.items()}

# Find organizations with the highest degree of collaborations
print('\n Top10 organizations with the highest number of collaborations:')
top10_orgs = sorted(org_degrees.items(), key = lambda x: x[1], reverse = True)[:10]
