In [12]:
import os
import gc
import rasterio
import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm
from osgeo import gdal

# 1. Aggregate institutes to NUTS regions by their lon and lat

In [9]:
# Collect the leading five columns of every yearly table (per the original note: name,
# lon and lat of the institutes), and count, for each remaining column (attribute),
# how many cells hold a plain non-negative number
read_folder = r'C:\1-Data\higher education'
len_all = 0
dict_attribute = {}
institute_frames = []
# Only the yearly tables (<year>.xlsx) are raw data. Other workbooks stored in this folder
# (e.g. unique_institutes.xlsx) and Excel lock files must not be counted, otherwise
# re-running the cell gives different totals.
yearly_files = sorted(f for f in os.listdir(read_folder)
                      if f.lower().endswith('.xlsx') and f[:-5].isdigit())
for file in tqdm(yearly_files):
    df = pd.read_excel(read_folder + '\\' + file)
    len_all += len(df)
    institute_frames.append(df[df.columns[:5]])

    for column in df.columns[5:]:
        # a cell is "valid" when it consists of digits with at most one decimal point
        len_valid = df[column].apply(lambda x: str(x).replace('.', '', 1).isdigit()).sum()
        dict_attribute[column] = dict_attribute.get(column, 0) + len_valid

# concatenate once at the end instead of growing the frame inside the loop
df_institute_comb = pd.concat(institute_frames) if institute_frames else None
        

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [03:28<00:00, 18.95s/it]


In [114]:
# Turn the per-attribute counts of valid cells into a one-column table and divide by
# the total number of rows to obtain the ratio of valid data
df_attribute = pd.DataFrame([dict_attribute]).T.rename(columns={0: 'ratio of valid data'}) / len_all
df_attribute.to_excel(r'C:\1-Data\higher education indicators.xlsx')

In [29]:
# # save the df of unique institutes 
# df_institute_unique = df_institute_comb.drop_duplicates(subset='ETER ID')
# df_institute_unique.to_excel(r'C:\1-Data\higher education\unique_institutes0.xlsx')

In [19]:
# Load the list of unique institutes (missing lon and lat were added manually)
# and give the coordinate columns short names
df_institute_unique = pd.read_excel(
    r'C:\1-Data\higher education\unique_institutes.xlsx', index_col=0
).rename(columns={'Geographic coordinates - latitude': 'lat',
                  'Geographic coordinates - longitude': 'lon'})
# Load the list of kept indicators (sheet "keep"); its index is iterated later
df_kept_indicators = pd.read_excel(
    r'C:\1-Data\higher education' + '\\' + 'higher education indicators.xlsx',
    sheet_name='keep',
    index_col=0,
)

In [79]:
read_folder = r'C:\1-Data\higher education'
temp_folder = r'C:\2-Case studies\Higher Education Observatory\temp'
nuts_folder = r'C:\1-Data\NUTS'
years = [2003, 2006, 2010, 2013, 2016, 2021, 2024]
obs_times = range(2011, 2022)

# The yearly institute tables do not depend on the NUTS version, so read and geocode each
# of them once here instead of once per NUTS version (reading the Excel files is the slow part)
institutes_by_obs_time = {}
for obsTime in tqdm(obs_times):
    read_path = read_folder + '\\' + str(obsTime) + '.xlsx'
    df_raw_institute = pd.read_excel(read_path, index_col=0)
    # append lon and lat data from df_institute_unique (because some lon and lat data are missing in raw data),
    # and convert raw data of each year to a point layer in EPSG:3035
    df_raw_institute = pd.merge(df_raw_institute, df_institute_unique[['ETER ID', 'lat', 'lon']], how='left', on='ETER ID')
    institutes_by_obs_time[obsTime] = gpd.GeoDataFrame(
        df_raw_institute,
        geometry=gpd.points_from_xy(df_raw_institute.lon, df_raw_institute.lat),
        crs="EPSG:4326").to_crs('EPSG:3035')

for year in tqdm(years):
    nuts = gpd.read_file(nuts_folder + '\\' + 'NUTS_RG_01M_' + str(year) + '_3035.shp')

    for obsTime in tqdm(obs_times):
        # spatial join nuts region and raw data of institutes; sjoin returns a new frame,
        # so the cached institute layer is not modified by the numeric conversion below
        gdf_joined = gpd.sjoin(institutes_by_obs_time[obsTime], nuts[['NUTS_ID', 'geometry']], how="left", predicate='within')

        # for each indicator, get the sum per NUTS region and export the df to a temp folder
        for indicator in df_kept_indicators.index:
            # non-numeric cells become NaN and are therefore ignored by the sum
            gdf_joined[indicator] = pd.to_numeric(gdf_joined[indicator], errors='coerce')
            grouped_sum = pd.DataFrame(gdf_joined.groupby('NUTS_ID')[indicator].sum()).reset_index()
            grouped_sum.columns = ['geo', 'obsValue']
            grouped_sum['geo_source'] = 'NUTS' + str(year)
            grouped_sum['indicator'] = indicator
            grouped_sum['freq'] = 'year'
            grouped_sum['obsTime'] = obsTime
            grouped_sum['Unit'] = 'See indicator'
            grouped_sum.to_csv(temp_folder + '\\' + indicator + '_' + str(obsTime) + '_NUTS' + str(year) + '.csv')

  0%|                                                                                            | 0/7 [00:00<?, ?it/s]
  0%|                                                                                           | 0/11 [00:00<?, ?it/s][A
  9%|███████▌                                                                           | 1/11 [00:25<04:11, 25.10s/it][A
 18%|███████████████                                                                    | 2/11 [00:50<03:47, 25.25s/it][A
 27%|██████████████████████▋                                                            | 3/11 [01:18<03:31, 26.47s/it][A
 36%|██████████████████████████████▏                                                    | 4/11 [01:45<03:08, 26.90s/it][A
 45%|█████████████████████████████████████▋                                             | 5/11 [02:14<02:44, 27.37s/it][A
 55%|█████████████████████████████████████████████▎                                     | 6/11 [02:39<02:12, 26.56s/it][A
 64%|██████████████

# 2. Merge csv by topics

In [134]:
# some files need to be renamed to make sure the formats align
for file in [i for i in os.listdir(temp_folder) if i.startswith('Students enrolled ISCED 8')]:
    os.rename(temp_folder + '\\' + file, temp_folder + '\\' + file.replace('Students enrolled ISCED 8', 'Students enrolled at ISCED 8'))

for file in [i for i in os.listdir(temp_folder) if i.startswith('Students ISCED 7 long degree')]:
    os.rename(temp_folder + '\\' + file, temp_folder + '\\' + file.replace('Students ISCED 7 long degree', 'Students enrolled ISCED 7 long degree'))

# for some files, there is no space before and after '-'. Rename them to unify the formats
# NOTE(review): the original list held '-citizenship unclassified' twice; the second entry
# was presumably meant to be another keyword (e.g. '-gender unclassified') - confirm.
keywords_to_add_space = ['-citizenship unclassified']
files_to_add_space = [file for file in os.listdir(temp_folder) if any(kw in file for kw in keywords_to_add_space)]
for file in files_to_add_space:
    for kw in keywords_to_add_space:
        if kw in file:
            # Insert the spaces around the keyword's own hyphen only. Splitting the whole name
            # on '-' would truncate indicators that contain a hyphen themselves (e.g. 'ISCED 5-7').
            new_name = file.replace(kw, ' - ' + kw[1:], 1)
            os.rename(temp_folder + '\\' + file, temp_folder + '\\' + new_name)
            break

for file in os.listdir(temp_folder):
    if '-citiz. unclassified' in file:
        # keep everything before the abbreviated keyword (hyphens in the indicator included),
        # spell the keyword out, and re-attach the trailing '<obsTime>_NUTS<year>.csv' parts;
        # any text between the keyword and the first '_' is dropped, as before
        prefix = file.split('-citiz. unclassified')[0]
        _, obs_time_part, nuts_part = file.rsplit('_', 2)
        new_name = prefix + ' - citizenship unclassified_' + obs_time_part + '_' + nuts_part
        os.rename(temp_folder + '\\' + file, temp_folder + '\\' + new_name)

In [75]:
# Build a dictionary that maps every main indicator found in the temp folder to the
# list of its breakdowns, both parsed from the file names
dict_indicator = {}
for file in os.listdir(temp_folder):
    if ' - ' in file:
        # '<main indicator> - <breakdown>_<obsTime>_NUTS<year>.csv'
        head, tail = file.split(' - ')[:2]
        main_indicator = head.split('_')[0].strip()
        breakdown = tail.split('_')[0].strip()
    elif '(' in file:
        # '<main indicator> (<breakdown>)_...': the breakdown is the text in parentheses
        head, tail = file.split('(')[:2]
        main_indicator = head.strip()
        breakdown = tail.split(')')[0]
    else:
        # no breakdown encoded in the file name
        main_indicator, breakdown = file.split('_')[0], ''

    known_breakdowns = dict_indicator.setdefault(main_indicator, [])
    if breakdown not in known_breakdowns:
        known_breakdowns.append(breakdown)

In [77]:
# number of distinct breakdowns recorded for every main indicator
num_indicator = {name: len(breakdowns) for name, breakdowns in dict_indicator.items()}
num_indicator

{'Academic personnel': 6,
 'Capital expenditure': 2,
 'Classification': 1,
 'Erasmus incoming staff': 1,
 'Erasmus incoming students': 2,
 'Erasmus outgoing staff': 1,
 'Erasmus outgoing students': 2,
 'Expenditure unclassified': 2,
 'Graduates at ISCED 5': 17,
 'Graduates at ISCED 5-7': 18,
 'Graduates at ISCED 6': 17,
 'Graduates at ISCED 7': 17,
 'Graduates at ISCED 7 long degree': 15,
 'Graduates at ISCED 8': 17,
 'Graduates ISCED 7 long degree': 1,
 'Graduates ISCED 7 long degree-gender unclassified': 1,
 'Non-personnel expenditure': 2,
 'Number of EU-FP projects': 13,
 'Number of senior academic personnel': 3,
 'Number of support and administrative personnel': 2,
 'Personnel expenditure': 2,
 'Revenue unclassified': 2,
 'Student fees funding': 2,
 'Students at ISCED 5': 1,
 'Students at ISCED 6': 1,
 'Students at ISCED 7': 1,
 'Students enrolled at ISCED 5': 16,
 'Students enrolled at ISCED 5-7': 18,
 'Students enrolled at ISCED 6': 16,
 'Students enrolled at ISCED 7': 16,
 'Stud

In [189]:
def reorganise_by_breakdown(indicator, dict_breakdown, save_folder,
                            years=(2003, 2006, 2010, 2013, 2016, 2021, 2024),
                            obs_times=range(2011, 2022)):
    """Merge the per-year csv files of one indicator into one csv per breakdown.

    For every breakdown (key of ``dict_breakdown``) the files
    ``<indicator> - <criteria>_<obsTime>_NUTS<year>.csv`` are read from the global
    ``temp_folder`` for every criteria, NUTS version and observation year, stacked, and
    written to ``<save_folder>\\<indicator>-<breakdown>.csv``.

    Parameters
    ----------
    indicator : str
        Main indicator name as used in the file names.
    dict_breakdown : dict
        Maps a breakdown name (e.g. 'gender') to the list of its criteria (e.g. ['men', 'women']).
    save_folder : str
        Folder the merged csv files are written to.
    years : iterable of int, optional
        NUTS versions to read; defaults to the versions used throughout the notebook.
    obs_times : iterable of int, optional
        Observation years to read; defaults to 2011-2021.

    Raises
    ------
    FileNotFoundError
        If one of the expected input files does not exist.
    ValueError
        If a breakdown has no criteria (nothing to concatenate).
    """
    for breakdown in tqdm(dict_breakdown.keys()):
        frames = []
        for criteria in tqdm(dict_breakdown[breakdown]):
            for year in years:
                for obsTime in obs_times:
                    breakdown_file = indicator + ' - ' + criteria + '_' + str(obsTime) + '_' + 'NUTS' + str(year) + '.csv'
                    df_breakdown_temp = pd.read_csv(temp_folder + '\\' + breakdown_file, index_col=0)
                    # unify the column names first, since some files use capitalised ones
                    df_breakdown_temp = df_breakdown_temp.rename(
                        columns={'Indicator': 'indicator', 'ObsValue': 'obsValue', 'Unit': 'unit', 'ObsTime': 'obsTime'}
                    ).drop(columns=['freq', 'unit'])
                    df_breakdown_temp['indicator'] = indicator
                    # the breakdown becomes a column holding the criteria of this file
                    df_breakdown_temp[breakdown] = criteria
                    frames.append(df_breakdown_temp)
        # concatenate once per breakdown instead of growing the frame file by file
        pd.concat(frames).to_csv(save_folder + '\\' + indicator + '-' + breakdown + '.csv', index=False)

In [165]:
def reorganise_by_unit_and_breakdown(indi, breakdown, indicator_list, save_folder,
                                     units=('EURO', 'PPP'),
                                     years=(2003, 2006, 2010, 2013, 2016, 2021, 2024),
                                     obs_times=range(2011, 2022)):
    """Merge the per-year csv files of several related indicators into one csv.

    The files ``<indicator> (<unit>)_<obsTime>_NUTS<year>.csv`` are read from the global
    ``temp_folder`` for every indicator, unit, NUTS version and observation year, stacked,
    and written to ``<save_folder>\\<indi>-<breakdown>.csv``.

    Parameters
    ----------
    indi : str
        Name of the combined indicator; stored in the 'indicator' column and used in the
        output file name.
    breakdown : str
        Only used in the output file name.
    indicator_list : list of str
        Original indicator names; each is stored in the 'class' column of its rows.
    save_folder : str
        Folder the merged csv file is written to.
    units : iterable of str, optional
        Units encoded in the file names; defaults to ('EURO', 'PPP').
    years : iterable of int, optional
        NUTS versions to read; defaults to the versions used throughout the notebook.
    obs_times : iterable of int, optional
        Observation years to read; defaults to 2011-2021.

    Raises
    ------
    FileNotFoundError
        If one of the expected input files does not exist.
    ValueError
        If there is nothing to concatenate (e.g. ``indicator_list`` is empty).
    """
    frames = []
    for indicator in tqdm(indicator_list):
        for unit in units:
            for year in years:
                for obsTime in obs_times:
                    breakdown_file = indicator + ' (' + unit + ')' + '_' + str(obsTime) + '_' + 'NUTS' + str(year) + '.csv'
                    df_breakdown_unit = pd.read_csv(temp_folder + '\\' + breakdown_file, index_col=0)
                    # unify the column names first, since some files use capitalised ones
                    df_breakdown_unit = df_breakdown_unit.rename(
                        columns={'Indicator': 'indicator', 'ObsValue': 'obsValue', 'Unit': 'unit', 'ObsTime': 'obsTime'})
                    df_breakdown_unit['class'] = indicator
                    df_breakdown_unit['unit'] = unit
                    df_breakdown_unit['indicator'] = indi
                    df_breakdown_unit['obsTime'] = obsTime
                    frames.append(df_breakdown_unit)
    # concatenate once at the end instead of growing the frame file by file
    pd.concat(frames).to_csv(save_folder + '\\' + indi + '-' + breakdown + '.csv', index=False)

In [169]:
# Breakdown definitions passed to reorganise_by_breakdown: each dictionary maps a breakdown
# name to the criteria that appear in the file names. The numeric suffix equals the total
# number of criteria in the dictionary. The shared lists below are copied into every
# dictionary so that no two dictionaries share a list object.
_ISCED_FIELDS = [
    'Agriculture, forestry, fisheries and veterinary',
    'Arts and Humanities',
    'Business, administration and law',
    'Education',
    'Engineering, manufacturing and construction',
    'Generic programmes and qualifications',
    'Health and welfare',
    'Information and Communication Technologies',
    'Natural sciences, mathematics and statistics',
    'Services',
    'Social sciences, journalism and information',
]
_GENDER_WITH_UNCLASSIFIED = ['gender unclassified', 'men', 'women']
_GENDER_HC = ['gender unclassified (HC)', 'men (HC)', 'women (HC)']
_CITIZENSHIP_WITH_UNCLASSIFIED = ['citizenship unclassified', 'foreigner', 'national']
_CITIZENSHIP = ['foreigner', 'national']

dict_breakdown_18 = {
    'gender': list(_GENDER_WITH_UNCLASSIFIED),
    'citizenship': list(_CITIZENSHIP_WITH_UNCLASSIFIED),
    'field': _ISCED_FIELDS + ['ISCED-FoE unclassified'],
}

dict_breakdown_17 = {
    'gender': list(_GENDER_WITH_UNCLASSIFIED),
    'citizenship': list(_CITIZENSHIP_WITH_UNCLASSIFIED),
    'field': list(_ISCED_FIELDS),
}

dict_breakdown_16 = {
    'gender': list(_GENDER_WITH_UNCLASSIFIED),
    'citizenship': list(_CITIZENSHIP),
    'field': list(_ISCED_FIELDS),
}

dict_breakdown_15 = {
    'gender': ['men', 'women'],
    'citizenship': list(_CITIZENSHIP),
    'field': list(_ISCED_FIELDS),
}

# the EU-FP project files use their own (capitalised) category names
dict_breakdown_13 = {
    'field': [
        'Agriculture, Forestry, Fisheries and Veterinary',
        'Arts and Humanities',
        'Business, Administration and Law',
        'Coordination and Support Action',
        'Education',
        'Engineering, Manufactoring and Construction',
        'European Research Council Grants',
        'Health and Welfare',
        'Information and Communication Technologies',
        'Innovation Action',
        'Natural Sciences, Mathematics and Statistics',
        'Research and Innovation Action',
        'Social Sciences, Journalism and Information',
    ],
}

dict_breakdown_6 = {
    'gender': list(_GENDER_HC),
    'citizenship': ['citizenship unclassified', 'foreigner (HC)', 'national (HC)'],
}

dict_breakdown_3 = {'gender': list(_GENDER_HC)}

dict_breakdown_2_Erasmus_st = {'education level': ['ISCED 6', 'ISCED 7']}


In [191]:
save_folder = r'C:\2-Case studies\Higher Education Observatory'

# Each entry pairs a breakdown definition with the indicators that share it;
# the entries are processed in the listed order
breakdown_jobs = [
    (dict_breakdown_18, ['Graduates at ISCED 5-7', 'Students enrolled at ISCED 5-7']),
    (dict_breakdown_17, ['Graduates at ISCED 5', 'Graduates at ISCED 6', 'Graduates at ISCED 7', 'Graduates at ISCED 8']),
    (dict_breakdown_16, ['Students enrolled at ISCED 5', 'Students enrolled at ISCED 6', 'Students enrolled at ISCED 7', 'Students enrolled at ISCED 8']),
    (dict_breakdown_15, ['Graduates at ISCED 7 long degree']),
    (dict_breakdown_13, ['Number of EU-FP projects']),
    (dict_breakdown_6, ['Academic personnel']),
    (dict_breakdown_3, ['Number of senior academic personnel']),
    (dict_breakdown_2_Erasmus_st, ['Erasmus incoming students', 'Erasmus outgoing students']),
]

for dict_breakdown, indicators in breakdown_jobs:
    for indicator in indicators:
        reorganise_by_breakdown(indicator, dict_breakdown, save_folder)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.02it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:02<00:01,  1.12s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.42s/it][A
 33%|████████████████████████████                                                        | 1/3 [00:05<00:10,  5.28s/it]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:01<00:03,  1.77s/it][A
 67%|█████████████████

In [167]:
# Combined indicator name -> original indicators merged into it (stored as its 'class')
unit_jobs = [
    ('Expenditure', ['Capital expenditure', 'Personnel expenditure', 'Non-personnel expenditure', 'Expenditure unclassified', 'Total Current expenditure']),
    ('Revenue', ['Total Current revenues', 'Revenue unclassified']),
    ('Budget', ['Total core budget']),
    ('Funding', ['Student fees funding', 'Total third party funding']),
]

for indi, indicator_list in unit_jobs:
    reorganise_by_unit_and_breakdown(indi, 'class', indicator_list, save_folder)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:31<00:00,  6.30s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.23s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.51s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.26s/it]


In [5]:
# Combine some of the files on related indicators so that they appear under one theme
# on the dashboard
read_folder = r'C:\2-Case studies\Higher Education Observatory'
save_folder = r'C:\2-Case studies\Higher Education Observatory\old'

# Erasmus students: stack incoming and outgoing, told apart by a 'direction' column
erasmus_out = pd.read_csv(read_folder + '\\' + 'Erasmus outgoing students-education level.csv').assign(
    direction='outgoing', indicator='Erasmus students')
erasmus_in = pd.read_csv(read_folder + '\\' + 'Erasmus incoming students-education level.csv').assign(
    direction='incoming', indicator='Erasmus students')
df_erasmus = pd.concat([erasmus_in, erasmus_out])
df_erasmus.to_csv(save_folder + '\\' + 'Erasmus students-education level.csv', index=False)

# Academic personnel: stack the citizenship and gender tables; the breakdown a table
# does not carry is filled with 'all'
academic_citizenship = pd.read_csv(read_folder + '\\' + 'Academic personnel-citizenship.csv').assign(gender='all')
academic_gender = pd.read_csv(read_folder + '\\' + 'Academic personnel-gender.csv').assign(citizenship='all')
df_academic = pd.concat([academic_citizenship, academic_gender])
df_academic.to_csv(save_folder + '\\' + 'academic personnel.csv', index=False)

In [61]:
# students enrolled: stack all 'Students enrolled ...' tables into one file
all_breakdown = ['citizenship', 'gender', 'field']
# text in front of the education level in the file names
level_prefix = 'Students enrolled at '

frames_st_enrolled = []
for file in tqdm(os.listdir(read_folder)):
    if file.startswith('Students enrolled'):
        df_temp = pd.read_csv(read_folder + '\\' + file)
        if len(file.split('-'))>2:
            # more than one '-' means the level itself contains one, i.e. 'ISCED 5-7'
            level = 'ISCED 5-7'
        else:
            # strip the whole prefix; a hard-coded offset of 20 left a leading space
            # (' ISCED 5'), inconsistent with 'ISCED 5-7'
            level = file.split('-')[0][len(level_prefix):]
        df_temp['indicator'] = 'students enrolled'
        df_temp['educational_level'] = level
        # breakdowns a table does not carry are filled with 'all'
        for i in [i for i in all_breakdown if i not in df_temp.columns]:
            df_temp[i] = 'all'
        frames_st_enrolled.append(df_temp)

# concatenate once at the end instead of growing the frame file by file
df_st_enrolled = pd.concat(frames_st_enrolled) if frames_st_enrolled else None

df_st_enrolled.to_csv(save_folder + '\\' + 'Students enrolled.csv')

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:16<00:00,  2.16it/s]


In [81]:
# graduates: stack all 'Graduates ...' tables into one file
all_breakdown = ['citizenship', 'gender', 'field']
# number of characters in front of the education level in the file names (13)
level_offset = len('Graduates at ')

graduate_frames = []
for file in tqdm(os.listdir(read_folder)):
    if not file.startswith('Graduates'):
        continue
    df_temp = pd.read_csv(read_folder + '\\' + file)
    name_parts = file.split('-')
    # more than one '-' means the level itself contains one, i.e. 'ISCED 5-7'
    level = 'ISCED 5-7' if len(name_parts) > 2 else name_parts[0][level_offset:]
    df_temp['indicator'] = 'graduates'
    df_temp['educational_level'] = level
    # breakdowns a table does not carry are filled with 'all'
    missing_breakdowns = [name for name in all_breakdown if name not in df_temp.columns]
    for name in missing_breakdowns:
        df_temp[name] = 'all'
    graduate_frames.append(df_temp)

df_graduates = pd.concat(graduate_frames) if graduate_frames else None

df_graduates.to_csv(save_folder + '\\' + 'Graduates.csv')

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:18<00:00,  1.90it/s]


In [None]:
# total academic personnel: stack every 'Total academic personnel...' table
# NOTE(review): the cell reads from read_folder and expects capitalised 'Indicator' /
# 'Unit' columns - confirm this matches where and how these files were written
total_ap_files = [i for i in os.listdir(read_folder) if i.startswith('Total academic personnel')]
total_ap_frames = []
for file in tqdm(total_ap_files):
    df_temp = pd.read_csv(read_folder+'//'+file, index_col=0)
    # unit = text after the first '(' of the indicator name, minus its last character
    # (presumably the closing parenthesis)
    df_temp['Unit'] = df_temp['Indicator'].apply(lambda x: x.split('(')[1][:-1])
    total_ap_frames.append(df_temp.drop(columns=['Indicator', 'freq']))
df_total_ap = pd.concat(total_ap_frames) if total_ap_frames else None

# sequential id as the first column, geo_source as the last one
df_total_ap['id'] = df_total_ap.reset_index().index
middle_columns = [name for name in df_total_ap.columns if name not in ('geo_source', 'id')]
df_total_ap = df_total_ap[['id'] + middle_columns + ['geo_source']]
df_total_ap.to_csv(save_folder + '\\' + 'Total academic personnel.csv', index=False)

# 3. Add gender ratio

In [4]:
def reformatting(df):
    """Return a copy of ``df`` laid out for export.

    The result has a sequential integer ``id`` as first column and ``geo_source`` as last
    column, no 'Unnamed: 0', 'indicator' or 'freq' column, and column names whose first
    letter is lower case. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with string column names that include 'geo_source'.

    Returns
    -------
    pandas.DataFrame
        The reformatted copy.

    Raises
    ------
    KeyError
        If ``df`` has no 'geo_source' column.
    """
    # work on a copy so that adding 'id' does not modify the caller's frame
    df = df.copy()

    # add the id column (sequential integers). Move id to the first and geo_source to the last column
    df['id'] = pd.RangeIndex(len(df))
    middle_columns = [name for name in df.columns if name not in ('geo_source', 'id')]
    df = df[['id'] + middle_columns + ['geo_source']]

    # drop the unwanted columns that are present
    drop_columns = [name for name in ['Unnamed: 0', 'indicator', 'freq'] if name in df.columns]
    df = df.drop(columns=drop_columns)

    # transform the first letter of column names to lowercase (slicing keeps an empty name valid)
    df.columns = [name[:1].lower() + name[1:] for name in df.columns]

    return df

In [6]:
def add_gender_ratio(df, unique_class=''):
    """Append women-to-men ratio rows to a table that is broken down by gender.

    Rows with gender 'women' and 'men' are matched on geo, geo_source, obsTime and,
    if given, the ``unique_class`` column. For every match a row with gender
    'gender ratio' is appended whose obsValue is women / men, or the string 'NA' when the
    product of the two values is zero (a missing value on either side yields NaN instead).
    The combined table is passed through ``reformatting`` before it is returned. The input
    frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns 'gender', 'geo', 'geo_source', 'obsTime' and 'obsValue'.
    unique_class : str, optional
        Name of an additional string column that distinguishes rows of the same region
        and year. An empty string (default) means no additional column.

    Returns
    -------
    pandas.DataFrame
        The original rows plus the gender-ratio rows, reformatted.
    """
    # work on a copy so the caller's frame keeps its 'gender' values and does not gain
    # the temporary 'unique' column
    df = df.copy()
    # remove (HC) from women (HC) or men (HC)
    df['gender'] = df['gender'].apply(lambda x: x.split('(')[0].strip())
    # add a unique identifier for merge
    df['unique'] = df['geo'] + '_' + df['geo_source'] + '_' + df['obsTime'].astype(str)
    if unique_class:
        df['unique'] = df['unique'] + '_' + df[unique_class]
    female = df[df['gender']=='women']
    male = df[df['gender']=='men']
    # the women rows supply all descriptive columns; the men rows only add their value
    df_gender = pd.merge(female, male[['obsValue', 'unique']], how='outer', on='unique')

    # calculate gender ratio for those units with valid data of male and female worker. Otherwise assign NA
    df_gender['obsValue'] = df_gender.apply(lambda x: x['obsValue_x']/x['obsValue_y'] if x['obsValue_y']*x['obsValue_x'] != 0 else 'NA', axis=1)

    # formatting the df_gender columns
    df_gender = df_gender.drop(columns=['obsValue_x', 'obsValue_y'])
    df_gender.loc[:,'gender'] = 'gender ratio'
    df = pd.concat([df, df_gender])
    df = df.drop('unique', axis = 1)
    df = reformatting(df)

    return df

In [8]:
# Folders used by the cells below.
# NOTE(review): read_raw_folder is not referenced anywhere else in the visible code -
# confirm whether it is still needed.
read_raw_folder = r'C:\2-Case studies\Higher Education Observatory\temp'
# the combined csv files are read from here ...
read_folder = r'C:\2-Case studies\Higher Education Observatory\old'
# ... and the reformatted csv files are written here
save_folder = r'C:\2-Case studies\Higher Education Observatory\new'

In [None]:
# File -> additional column that distinguishes rows of the same region and year when women
# and men are matched ('' = none). The combined students/graduates tables store the
# education level in 'educational_level'; there is no 'level' column.
gender_dict = {'Students enrolled.csv':'educational_level', 'Graduates.csv':'educational_level', 'Number of senior academic personnel-gender.csv':''}
for file in tqdm(gender_dict.keys()):
    df = pd.read_csv(read_folder + '\\' + file)
    unique_class = gender_dict[file]
    df = add_gender_ratio(df, unique_class)
    df.to_csv(save_folder + '\\' + file, index=False)

In [None]:
# Files that only need reformatting (no gender ratio)
files_to_reformat = [
    'Erasmus students-education level.csv',
    'Expenditure-class.csv',
    'Funding-class.csv',
    'Number of EU-FP projects-field.csv',
    'Revenue-class.csv',
]
for file in tqdm(files_to_reformat):
    df = reformatting(pd.read_csv(read_folder + '\\' + file))
    df.to_csv(save_folder + '\\' + file, index=False)

In [20]:
# The budget table is reformatted, loses its 'class' column and is saved under a new name
file = 'Budget-class.csv'
df = reformatting(pd.read_csv(read_folder + '\\' + file)).drop(columns='class')
df.to_csv(save_folder + '\\' + 'Total core budget.csv', index=False)

In [22]:
# concatenate the total academic personnel and the academic personnel csv
# totals: not broken down, so citizenship and gender are both 'all'
file = 'Total academic personnel.csv'
df_0 = reformatting(
    pd.read_csv(read_folder + '\\' + file).assign(citizenship='all', gender='all')
)

# breakdown table: add the gender-ratio rows and label the unit
file = 'academic personnel.csv'
df_1 = add_gender_ratio(pd.read_csv(read_folder + '\\' + file))
df_1['unit'] = 'HC'

# stack both tables and renumber the ids over the combined table
df = pd.concat([df_0, df_1])
df['id'] = df.reset_index().index

df.to_csv(save_folder + '\\' + 'academic personnel.csv', index=False)