### Academic Freedom index data preprocessing

This script serves to modify the **academic freedom index** dataset (available here: https://www.v-dem.net/data/the-v-dem-dataset/country-year-v-dem-fullothers-v13/) and merge it with the diploma thesis main data frame. The output of the script is a single `.csv` file that contains the data with the academic freedom index for each observation of the master data frame.

#### How to run:
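* Place both source files in the working directory: the index data as `freedom_index_data.xlsx` and the master data as `twin_data.xlsx` (or adjust the paths in the last cell)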
* Run the script

In [1]:
import os

import pandas as pd

In [2]:
def checkCountryNames(index_data, master_data, country_dict):
    '''Verify that every country of the master frame can be matched in the index data.

    The index country names are first translated with ``country_dict`` (the same
    mapping ``preprocessData`` applies before merging). Every country of the master
    frame must then appear among the translated names. Checking the translated names,
    rather than the raw names and the dictionary values separately, guarantees that
    a country which passes this check really has a counterpart to merge with:

    * a dictionary value whose key is absent from the index data is reported, and
    * a master name that the dictionary renames away in the index data is reported.

    Parameters:
        index_data: data frame with a 'country_name' column.
        master_data: data frame with a 'country' column.
        country_dict: translation dictionary; the keys are the names used in the
            index data, the values are the names used in the master data frame.

    Raises:
        ValueError: if an expected column is missing, or if at least one country of
            the master frame has no counterpart in the (translated) index data.
            Each unmatched country is printed before the error is raised.
    '''
    # Input validation
    if not ('country' in master_data.columns and 'country_name' in index_data.columns):
        raise ValueError("Check that the data column names are correct.")

    # Names the index data will carry once the translation has been applied;
    # a set gives constant-time membership tests in the loop below
    matchable_countries = {
        country_dict.get(name, name) for name in index_data['country_name'].unique()
    }

    missing_countries = []
    for country in master_data['country'].unique():
        # A missing (NaN) country name can never be matched, so report it as well
        if pd.isna(country) or country not in matchable_countries:
            print(f"Unhandled country: {country}")
            missing_countries.append(country)

    if missing_countries:
        raise ValueError("Unhandled country names.")
    print("All countries are well specified or are a part of the country dictionary.")

def preprocessData(index_data:pd.DataFrame, master_data:pd.DataFrame, country_dict:dict, to_excel:bool = True, output_path:str = "output.csv"):
    '''Attach the academic freedom index to every observation of the master data frame.

    The index data is matched to the master data on country and year with a left
    merge, so every master row is kept; rows without a counterpart in the index
    data get a missing value in the new 'freedom_index' column. Neither input
    frame is modified.

    Parameters:
        index_data: data frame with the columns 'country_name', 'year' and
            'v2xca_academ'.
        master_data: data frame with the columns 'country' and 'data_avgyear'.
        country_dict: translation dictionary; keys are country names as used in
            the index data, values are the names used in the master data.
        to_excel: if True, save the merged frame to disk. Despite the name, the
            file is written in .csv format.
        output_path: where to write the .csv file when ``to_excel`` is True.

    Returns:
        The master data frame with the additional 'freedom_index' column.
    '''
    # Keep only the columns needed for the merge and give them the master frame's names
    index_subset = index_data[['country_name', 'year', 'v2xca_academ']].rename(
        columns={'country_name': 'country', 'year': 'data_avgyear'}
    )

    # Translate the index country names into the names used by the master frame
    index_subset['country'] = index_subset['country'].replace(country_dict)

    # Left merge: every master observation is preserved, matched or not
    merged = master_data.merge(index_subset, on=['country', 'data_avgyear'], how='left')
    merged = merged.rename(columns={'v2xca_academ': 'freedom_index'})

    # Master data to a .csv file
    if to_excel:
        merged.to_csv(output_path, index = False)

    return merged

In [5]:
# Source files, expected in the working directory
index_data_path = 'freedom_index_data.xlsx'
# master_data_path = 'master_data.xlsx'
master_data_path = 'twin_data.xlsx'

# Fail early and name every source file that cannot be found
missing_paths = [path for path in (index_data_path, master_data_path) if not os.path.exists(path)]
if missing_paths:
    raise ValueError(f"Missing source file: {', '.join(missing_paths)}")

index_data = pd.read_excel(index_data_path)
master_data = pd.read_excel(master_data_path)

# Define a translation dictionary that will allow the function to link different country names to one country
# (keys: names used in the index data, values: names used in the master data)
country_dict = {
    "United States of America": "United States",
    "Palestine/West Bank": "Palestine",
    "Czechia": "Czech Republic",
    # "United Kingdom": "Great Britain", # In case there is Great Britain instead of United Kingdom
}

# Check whether all countries in the dataset can be found inside the index data frame or the country dictionary
checkCountryNames(index_data, master_data, country_dict)

# Create the output (also written to "output.csv" in the working directory)
output_data = preprocessData(index_data, master_data, country_dict, to_excel = True)

All countries are well specified or are a part of the country dictionary.
