In [60]:
import os
import gc
import rasterio
import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm.auto import trange
from osgeo import gdal

In [9]:
# get the name, lon and lat of unique institutions, and summarise the ratio of valid data for each column (attribtue)
read_folder = r'C:\1-Data\higher education'
len_all = 0
dict_attribute = {}
df_institute_comb = None
for file in tqdm(os.listdir(read_folder)):
    df = pd.read_excel(read_folder + '\\' + file)
    len_all += len(df)
    df_institute_temp = df[df.columns[:5]]
    df_institute_comb = df_institute_temp if df_institute_comb is None else pd.concat([df_institute_comb, df_institute_temp])
    
    for column in df.columns[5:]:
        len_valid = df[column].apply(lambda x: str(x).replace('.', '', 1).isdigit()).sum()
        if column in dict_attribute.keys():
            dict_attribute[column] += len_valid
        else:
            dict_attribute[column] = len_valid
        

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [03:28<00:00, 18.95s/it]


In [114]:
# divide the number of valid data by the total number to get the ratio
df_attribute = pd.DataFrame([dict_attribute]).T
df_attribute.columns = ['ratio of valid data']
df_attribute['ratio of valid data'] = df_attribute['ratio of valid data']/len_all
df_attribute.to_excel(r'C:\1-Data\higher education indicators.xlsx')

In [29]:
# # save the df of unique institutes 
# df_institute_unique = df_institute_comb.drop_duplicates(subset='ETER ID')
# df_institute_unique.to_excel(r'C:\1-Data\higher education\unique_institutes0.xlsx')

In [19]:
# import list of unique institutes with missing lon and lat of institutes manually added
df_institute_unique = pd.read_excel(r'C:\1-Data\higher education\unique_institutes.xlsx', index_col=0)
df_institute_unique.rename(columns={'Geographic coordinates - latitude':'lat', 
                                    'Geographic coordinates - longitude':'lon'}, inplace=True)
# import the list of kept indicators
df_kept_indicators = pd.read_excel(r'C:\1-Data\higher education' + '\\' + 'higher education indicators.xlsx', sheet_name='keep', index_col=0)

In [79]:
read_folder = r'C:\1-Data\higher education'
temp_folder = r'C:\2-Case studies\Higher Education Observatory\temp'
nuts_folder = r'C:\1-Data\NUTS'
years = [2003, 2006, 2010, 2013, 2016, 2021, 2024]
for year in tqdm(years):
    nuts = gpd.read_file(nuts_folder + '\\' + 'NUTS_RG_01M_' + str(year) + '_3035.shp')
    
    for ObsTime in tqdm(range(2011, 2022)):
        read_path = read_folder + '\\' + str(ObsTime) + '.xlsx'
        df_raw_institute = pd.read_excel(read_path, index_col=0)
        # append lon and lat data from df_institute_uniqe (because some lon and lat data are missing in raw data),
        # and convert raw data of each year to shp
        df_raw_institute = pd.merge(df_raw_institute, df_institute_unique[['ETER ID', 'lat', 'lon']], how='left', on='ETER ID')
        gdf_raw_institute = gpd.GeoDataFrame(df_raw_institute, geometry=gpd.points_from_xy(df_raw_institute.lon, df_raw_institute.lat),
                                    crs="EPSG:4326").to_crs('EPSG:3035')
        # spatial join nuts region and raw data of institutes
        gdf_joined = gpd.sjoin(gdf_raw_institute, nuts[['NUTS_ID', 'geometry']], how="left", predicate='within')
    
        # for each indicator, get the sum and export the df to a temp folder
        for indicator in df_kept_indicators.index:
            gdf_joined[indicator] = pd.to_numeric(gdf_joined[indicator], errors='coerce')
            grouped_sum = pd.DataFrame(gdf_joined.groupby('NUTS_ID')[indicator].sum()).reset_index()
            grouped_sum.columns = ['geo', 'ObsValue']
            grouped_sum['geo_source'] = 'NUTS' + str(year)
            grouped_sum['Indicator'] = indicator
            grouped_sum['freq'] = 'year'
            grouped_sum['ObsTime'] = ObsTime
            grouped_sum['Unit'] = 'See indicator'
            grouped_sum.to_csv(temp_folder + '\\' + indicator + '_' + str(ObsTime) + '_NUTS' + str(year) + '.csv')

  0%|                                                                                            | 0/7 [00:00<?, ?it/s]
  0%|                                                                                           | 0/11 [00:00<?, ?it/s][A
  9%|███████▌                                                                           | 1/11 [00:25<04:11, 25.10s/it][A
 18%|███████████████                                                                    | 2/11 [00:50<03:47, 25.25s/it][A
 27%|██████████████████████▋                                                            | 3/11 [01:18<03:31, 26.47s/it][A
 36%|██████████████████████████████▏                                                    | 4/11 [01:45<03:08, 26.90s/it][A
 45%|█████████████████████████████████████▋                                             | 5/11 [02:14<02:44, 27.37s/it][A
 55%|█████████████████████████████████████████████▎                                     | 6/11 [02:39<02:12, 26.56s/it][A
 64%|██████████████

In [134]:
for file in [i for i in os.listdir(temp_folder) if i.startswith('Students enrolled ISCED 8')]:
    os.rename(temp_folder + '\\' + file, temp_folder + '\\' + file.replace('Students enrolled ISCED 8', 'Students enrolled at ISCED 8'))

for file in [i for i in os.listdir(temp_folder) if i.startswith('Students ISCED 7 long degree')]:
    os.rename(temp_folder + '\\' + file, temp_folder + '\\' + file.replace('Students ISCED 7 long degree', 'Students enrolled ISCED 7 long degree'))

In [144]:
list_main_indicator = []
for file in os.listdir(temp_folder):
    main_indicator = file.split(' - ')[0].split('_')[0].strip()
    if main_indicator not in list_main_indicator:
        list_main_indicator.append(main_indicator)

In [146]:
dict_indicator = {}
for file in os.listdir(temp_folder):
    if len(file.split(' - '))<2:
        pass
    else:
        main_indicator = file.split(' - ')[0].split('_')[0].strip()
        breakdown = file.split(' - ')[1].split('_')[0].strip()
        if main_indicator in dict_indicator.keys():
            if breakdown in dict_indicator[main_indicator]:
                pass
            else:
                dict_indicator[main_indicator].append(breakdown)
        else:
            dict_indicator[main_indicator] = [breakdown]

In [148]:
num_indicator = {}
for indicator in dict_indicator.keys():
    num_indicator[indicator] = len(dict_indicator[indicator])
num_indicator

{'Academic personnel': 6,
 'Classification': 1,
 'Erasmus incoming students': 2,
 'Erasmus outgoing students': 2,
 'Graduates at ISCED 5': 17,
 'Graduates at ISCED 5-7': 18,
 'Graduates at ISCED 6': 17,
 'Graduates at ISCED 7': 17,
 'Graduates at ISCED 7 long degree': 15,
 'Graduates at ISCED 8': 17,
 'Number of EU-FP projects': 13,
 'Number of senior academic personnel': 3,
 'Students enrolled at ISCED 5': 16,
 'Students enrolled at ISCED 5-7': 18,
 'Students enrolled at ISCED 6': 16,
 'Students enrolled at ISCED 7': 16,
 'Students enrolled at ISCED 8': 16,
 'Students enrolled ISCED 7 long degree': 16}

In [100]:
def reorganise_by_breakdown(indicator, dict_breakdown, save_path):
    for breakdown in tqdm(dict_breakdown.keys()):
        df_breakdown_comb = None
        for criteria in tqdm(dict_breakdown[breakdown]):
            for year in [2003, 2006, 2010, 2013, 2016, 2021, 2024]:
                for ObsTime in range(2011, 2022):
                    breakdown_file = indicator + ' - ' + criteria + '_' + str(ObsTime) + '_' + 'NUTS' + str(year) + '.csv'
                    df_breakdown_temp = pd.read_csv(temp_folder + '\\' + breakdown_file, index_col=0)
                    df_breakdown_temp.drop(columns=['Indicator', 'freq', 'Unit'], inplace=True)
                    df_breakdown_temp['indicator'] = indicator
                    df_breakdown_temp[breakdown] = criteria
                    df_breakdown_comb = df_breakdown_temp if df_breakdown_comb is None else pd.concat([df_breakdown_comb, df_breakdown_temp])
        df_breakdown_comb.to_csv(save_path + '\\' + indicator + '-' + breakdown + '.csv')

In [199]:
dict_breakdown_18 = {'gender' : ['gender unclassified', 'men', 'women'], 
                     'citizenship' : ['citizenship unclassified', 'foreigner', 'national'],
                     'field':['Agriculture, forestry, fisheries and veterinary',
                              'Arts and Humanities',
                              'Business, administration and law',
                              'Education',
                              'Engineering, manufacturing and construction',
                              'Generic programmes and qualifications',
                              'Health and welfare',
                              'Information and Communication Technologies',
                              'Natural sciences, mathematics and statistics',
                              'Services',
                              'Social sciences, journalism and information',
                              'ISCED-FoE unclassified']}

dict_breakdown_17 = {'gender' : ['gender unclassified', 'men', 'women'], 
                     'citizenship' : ['citizenship unclassified', 'foreigner', 'national'],
                     'field':['Agriculture, forestry, fisheries and veterinary',
                              'Arts and Humanities',
                              'Business, administration and law',
                              'Education',
                              'Engineering, manufacturing and construction',
                              'Generic programmes and qualifications',
                              'Health and welfare',
                              'Information and Communication Technologies',
                              'Natural sciences, mathematics and statistics',
                              'Services',
                              'Social sciences, journalism and information']}

dict_breakdown_16 = {'gender' : ['gender unclassified', 'men', 'women'], 
                     'citizenship' : ['foreigner', 'national'],
                     'field':['Agriculture, forestry, fisheries and veterinary',
                              'Arts and Humanities',
                              'Business, administration and law',
                              'Education',
                              'Engineering, manufacturing and construction',
                              'Generic programmes and qualifications',
                              'Health and welfare',
                              'Information and Communication Technologies',
                              'Natural sciences, mathematics and statistics',
                              'Services',
                              'Social sciences, journalism and information']}

dict_breakdown_15 = {'gender' : ['men', 'women'], 
                     'citizenship' : ['foreigner', 'national'],
                     'field':['Agriculture, forestry, fisheries and veterinary',
                              'Arts and Humanities',
                              'Business, administration and law',
                              'Education',
                              'Engineering, manufacturing and construction',
                              'Generic programmes and qualifications',
                              'Health and welfare',
                              'Information and Communication Technologies',
                              'Natural sciences, mathematics and statistics',
                              'Services',
                              'Social sciences, journalism and information']}

dict_breakdown_13 = {'field':['Agriculture, Forestry, Fisheries and Veterinary',
                              'Arts and Humanities',
                              'Business, Administration and Law',
                              'Coordination and Support Action',
                              'Education',
                              'Engineering, Manufactoring and Construction',
                              'European Research Council Grants',
                              'Health and Welfare',
                              'Information and Communication Technologies',
                              'Innovation Action',
                              'Natural Sciences, Mathematics and Statistics',
                              'Research and Innovation Action',
                              'Social Sciences, Journalism and Information']}

dict_breakdown_6 = {'gender' : ['gender unclassified (HC)', 'men (HC)', 'women (HC)'], 
                  'citizenship' : ['citizenship unclassified', 'foreigner (HC)', 'national (HC)']}

dict_breakdown_3 = {'gender' : ['gender unclassified (HC)', 'men (HC)', 'women (HC)']}

dict_breakdown_2 = {'education level':['ISCED 6', 'ISCED 7']}

In [201]:
for indicator in ['Graduates at ISCED 5-7', 'Students enrolled at ISCED 5-7']:
    reorganise_by_breakdown(indicator, dict_breakdown_18, save_path)

for indicator in ['Graduates at ISCED 5', 'Graduates at ISCED 6', 'Graduates at ISCED 7', 'Graduates at ISCED 8']:
    reorganise_by_breakdown(indicator, dict_breakdown_17, save_path)

for indicator in ['Students enrolled at ISCED 5', 'Students enrolled at ISCED 6', 'Students enrolled at ISCED 7', 'Students enrolled at ISCED 8']:
    reorganise_by_breakdown(indicator, dict_breakdown_16, save_path)

for indicator in ['Graduates at ISCED 7 long degree']:
    reorganise_by_breakdown(indicator, dict_breakdown_15, save_path)

for indicator in ['Number of EU-FP projects']:
    reorganise_by_breakdown(indicator, dict_breakdown_13, save_path)

for indicator in ['Academic personnel']:
    reorganise_by_breakdown(indicator, dict_breakdown_6, save_path)

for indicator in ['Number of senior academic personnel']:
    reorganise_by_breakdown(indicator, dict_breakdown_3, save_path)

for indicator in ['Erasmus incoming students', 'Erasmus outgoing students']:
    reorganise_by_breakdown(indicator, dict_breakdown_2, save_path)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:01<00:03,  1.73s/it][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:03<00:02,  2.04s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.22s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.73s/it]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A
 50%|████████████████████