# **Exploitation Zone (KG-based)**

In [1]:
!git clone https://github.com/OscarMoliina/betterlifebetterhealth

Cloning into 'betterlifebetterhealth'...
remote: Enumerating objects: 342, done.[K
remote: Counting objects: 100% (187/187), done.[K
remote: Compressing objects: 100% (152/152), done.[K
remote: Total 342 (delta 58), reused 124 (delta 35), pack-reused 155[K
Receiving objects: 100% (342/342), 100.31 MiB | 16.11 MiB/s, done.
Resolving deltas: 100% (108/108), done.


In [11]:
#!pip3 install pyspark
import pyspark

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, mean, when, lit, count
from pyspark.sql.window import Window
from pyspark.sql.functions import mean
from pyspark.sql.functions import mean
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import collect_list

import pandas as pd
import matplotlib.pyplot as plt
import duckdb

In [4]:
# Start (or reuse) the Spark session. The DuckDB JDBC jar from the cloned
# repository is registered through `spark.jars` so the JDBC read below can
# load the `org.duckdb.DuckDBDriver` class.
spark = (
    SparkSession.builder
    .appName("Preprocessing")
    .config("spark.jars", "/content/betterlifebetterhealth/src/utils/duckdb.jar")
    .getOrCreate()
)

# Load the `join_table` table of the exploitation-zone DuckDB file over JDBC.
join = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:duckdb:/content/betterlifebetterhealth/data/db/exploitation_zone.db",
        driver="org.duckdb.DuckDBDriver",
        dbtable="join_table",
    )
    .load()
)

### **Classificació dels països per regió:**
Afegim una relació per dividir Europa en 4 zones: Europa de l'Est, Europa del Sud, Europa Occidental i Europa del Nord.

1. Europa del Nord: Suècia, Finlàndia, Noruega, Dinamarca, Islàndia, Estònia, Letònia, Lituània
2. Europa de l'Est: Polònia, Eslovàquia, República Txeca, Hongria, Romania, Bulgària, Bielorússia, Ucraïna, Moldàvia, Rússia, Albània, Kosovo, Macedònia del Nord, Montenegro, Sèrbia, Bòsnia i Hercegovina
3. Europa del Sud: Itàlia, Espanya, Portugal, Grècia, Turquia, Xipre, Malta, Croàcia, Eslovènia
4. Europa Occidental: Alemanya, França, Bèlgica, Països Baixos, Àustria, Suïssa, Luxemburg, Regne Unit, Irlanda

In [5]:
# Countries grouped by region (four European sub-regions plus Asia).
north_europe = ['Sweden', 'Finland', 'Norway', 'Denmark', 'Iceland', 'Estonia', 'Latvia', 'Lithuania']
# 'Kosovo' added: the region list in the markdown above places it in Eastern
# Europe, but it was missing here and therefore fell through to 'Other'.
east_europe = ['Poland', 'Slovakia', 'Hungary', 'Romania', 'Bulgaria', 'Belarus', 'Ukraine', 'Albania', 'Moldova', 'Czech Republic', 'Russia', 'North Macedonia', 'Montenegro', 'Serbia', 'Bosnia and Herzegovina', 'Kosovo']
south_europe = ['Italy', 'Spain', 'Portugal', 'Greece', 'Cyprus', 'Malta', 'Croatia', 'Slovenia']
west_europe = ['Germany', 'France', 'Belgium', 'Netherlands', 'Austria', 'Switzerland', 'Luxembourg', 'United Kingdom', 'Ireland']
# NOTE(review): the markdown above lists Turkey under Southern Europe, while this
# list classifies it as Asia - confirm which one is intended.
asia = ['Turkey', 'Israel', 'Georgia', 'Azerbaijan', 'Armenia', 'Kazakhstan', 'Uzbekistan', 'Turkmenistan', 'Tajikistan', 'Kyrgyzstan']

# Country -> region label, built once so every lookup is a single dict access
# instead of a scan over up to five lists. `setdefault` keeps the first label
# seen, matching the priority order of the former if/elif chain.
_REGION_BY_COUNTRY = {}
for _label, _members in (
    ('Europe North', north_europe),
    ('Europe East', east_europe),
    ('Europe South', south_europe),
    ('Europe West', west_europe),
    ('Asia', asia),
):
    for _name in _members:
        _REGION_BY_COUNTRY.setdefault(_name, _label)


def assign_region(country):
    """Return the region label for a country name.

    Args:
        country: Country name, compared by exact match against the region
            lists above. May be None.

    Returns:
        One of 'Europe North', 'Europe East', 'Europe South', 'Europe West'
        or 'Asia'; 'Other' when the name (or None) is in none of the lists.
    """
    return _REGION_BY_COUNTRY.get(country, 'Other')

# Wrap the Python classifier as a Spark UDF that returns strings.
region_udf = udf(assign_region, returnType=StringType())

# Add a 'Region' column computed from each row's 'Country' value.
join = join.withColumn('Region', region_udf(col('Country')))

### **Pertanyença a UE:**
Afegim relació per distingir els que pertanyen a la Unió Europea i els que no.

In [6]:
# Member states of the European Union (27 entries).
eu_countries = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus',
    'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France',
    'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy',
    'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden',
]


def is_eu_member(country):
    """Return the EU membership label for a country name.

    Args:
        country: Country name, matched exactly against `eu_countries`.

    Returns:
        'EU Member' if the name is listed, otherwise 'Non-EU Member'.
    """
    if country not in eu_countries:
        return 'Non-EU Member'
    return 'EU Member'

# Expose the Python helper to Spark as a string-returning UDF.
eu_member_udf = udf(is_eu_member, returnType=StringType())

# Label every row with its EU membership status, derived from 'Country'.
join = join.withColumn('EU Membership', eu_member_udf(col('Country')))


### **Pertanyença a NATO:**



In [7]:
# NATO member countries.
# Finland (joined April 2023) and Sweden (joined March 2024) were missing from
# the list and were therefore labelled 'Non-NATO Member'.
# NOTE(review): if the column is meant to reflect membership at an earlier
# reference date, remove them again and state that date here.
nato_countries = [
    'Albania', 'Belgium', 'Bulgaria', 'Canada', 'Croatia', 'Czech Republic', 'Denmark',
    'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Italy', 'Latvia',
    'Lithuania', 'Luxembourg', 'Montenegro', 'Netherlands', 'North Macedonia', 'Norway',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Turkey',
    'United Kingdom', 'United States'
]


def is_nato_member(country):
    """Return the NATO membership label for a country name.

    Args:
        country: Country name, matched exactly against `nato_countries`.

    Returns:
        'NATO Member' if the name is listed, otherwise 'Non-NATO Member'.
    """
    return 'NATO Member' if country in nato_countries else 'Non-NATO Member'

# Expose the Python helper to Spark as a string-returning UDF.
nato_member_udf = udf(is_nato_member, returnType=StringType())

# Label every row with its NATO membership status, derived from 'Country'.
join = join.withColumn('NATO Membership', nato_member_udf(col('Country')))


### **Països que fan frontera:**

In [15]:
# Read the country-borders CSV from the cloned repository; the first line of
# the file is treated as the header row.
fronteres = spark.read.csv(
    '/content/betterlifebetterhealth/data/csv/GEODATASOURCE-COUNTRY-BORDERS.CSV',
    header=True,
)

In [17]:
# For each `country_name`, gather all of its `country_border_name` values
# into a single list column named `border_countries`.
borders_grouped = (
    fronteres
    .groupBy("country_name")
    .agg(collect_list("country_border_name").alias("border_countries"))
)

In [24]:
# Left join: every row of `join` is kept, and rows whose 'Country' has no
# matching `country_name` get a null `border_countries`.
join = join.join(
    borders_grouped,
    on=join['Country'] == borders_grouped['country_name'],
    how='left',
)

In [29]:
# Remove the `country_name` join key that came from the borders table;
# the 'Country' column already identifies each row.
join = join.drop("country_name")