In [38]:
# Initialise findspark with the local Spark installation before pyspark is imported.
# NOTE(review): the path below is absolute and machine-specific — consider making it configurable.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
# Obtain (or create) the Spark session used by the rest of the notebook, named "preparation".
spark = SparkSession.builder.appName("preparation").getOrCreate()

import csv
import pycountry
import pycountry_convert as pc
from pyspark.sql.functions import when

In [39]:
def get_continent(country_name):
    """Return the region name for a country, or None when none can be found.

    The name is resolved to its alpha-2 country code through
    ``pycountry.countries.lookup``, turned into a continent code by
    ``pc.country_alpha2_to_continent_code`` and finally translated by
    ``continent_code_to_name``. When any of these steps raises
    ``LookupError`` (which includes ``KeyError``), a hand-maintained table
    of country names is consulted instead.

    Args:
        country_name: Country name as it appears in the input data.

    Returns:
        The region name, or None if neither the libraries nor the fallback
        table know the country.
    """
    try:
        alpha2 = pycountry.countries.lookup(country_name).alpha_2
        return continent_code_to_name(pc.country_alpha2_to_continent_code(alpha2))
    except LookupError:
        # Library resolution failed; fall through to the manual table below.
        pass

    # Names that the library-based resolution above could not handle.
    # NOTE(review): 'Guinea0Bissau' looks like a typo for 'Guinea-Bissau' —
    # presumably it mirrors the spelling in the input data; confirm before changing.
    fallback_regions = {
        'The Bahamas': 'Americas',
        'The Gambia': 'Africa',
        'Brunei': 'Asia',
        'Cape Verde': 'Africa',
        'Ivory Coast': 'Africa',
        'Democratic Republic of the Congo': 'Africa',
        'Guinea0Bissau': 'Africa',
        'Vatican City': 'Europe',
        'Republic of Ireland': 'Europe',
        'Russia': 'Europe',
        'Palestinian National Authority': 'Asia',
        'East Timor': 'Asia',
        'Turkey': 'Asia',
    }
    return fallback_regions.get(country_name)

def continent_code_to_name(continent_code):
    """Translate a two-letter continent code into the region name used in this notebook.

    'NA' and 'SA' are both reported as 'Americas'.

    Args:
        continent_code: Continent code such as 'AF', 'AS', 'EU', 'NA', 'OC' or 'SA'.

    Returns:
        The region name, or None for any code not listed in the table.
    """
    region_by_code = {
        'AF': 'Africa',
        'AS': 'Asia',
        'EU': 'Europe',
        'NA': 'Americas',
        'OC': 'Oceania',
        'SA': 'Americas',
    }
    try:
        return region_by_code[continent_code]
    except KeyError:
        # Unknown codes map to None rather than raising.
        return None

input_file = "global_education_data.csv"
output_file = "region.csv"
# Columns of the generated file: the country name and its derived region.
fieldnames = ['Countries and areas', 'Region']

# Read the source CSV and derive the region for every country.
# newline='' is what the csv module expects for files handed to a reader, so
# that quoted fields containing line breaks are parsed correctly.
with open(input_file, mode='r', encoding='utf-8', newline='') as infile:
    reader = csv.DictReader(infile)
    # Build all output records in one pass; countries that get_continent
    # cannot resolve end up with Region set to None (written as an empty cell).
    rows = [
        {
            'Countries and areas': row['Countries and areas'],
            'Region': get_continent(row['Countries and areas']),
        }
        for row in reader
    ]

# Write the country -> region table to a new CSV file.
with open(output_file, mode='w', encoding='utf-8', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)


In [40]:
# Load the generated country -> region table into a Spark DataFrame,
# treating the first line as the header and letting Spark infer column types.
region = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('region.csv')
)

In [41]:
# Print a preview of the DataFrame to sanity-check the derived regions.
region.show()

+-------------------+--------+
|Countries and areas|  Region|
+-------------------+--------+
|        Afghanistan|    Asia|
|            Albania|  Europe|
|            Algeria|  Africa|
|            Andorra|  Europe|
|             Angola|  Africa|
|           Anguilla|Americas|
|Antigua and Barbuda|Americas|
|          Argentina|Americas|
|            Armenia|    Asia|
|          Australia| Oceania|
|            Austria|  Europe|
|         Azerbaijan|    Asia|
|        The Bahamas|Americas|
|            Bahrain|    Asia|
|         Bangladesh|    Asia|
|           Barbados|Americas|
|            Belarus|  Europe|
|            Belgium|  Europe|
|             Belize|Americas|
|              Benin|  Africa|
+-------------------+--------+
only showing top 20 rows

