# **Exploitation Zone (KG-based)**

In [1]:
# Fetch the project repository; later cells read the DuckDB JDBC jar, the
# exploitation-zone database and the borders CSV from inside this checkout.
!git clone https://github.com/OscarMoliina/betterlifebetterhealth

Cloning into 'betterlifebetterhealth'...
remote: Enumerating objects: 360, done.[K
remote: Counting objects: 100% (205/205), done.[K
remote: Compressing objects: 100% (170/170), done.[K
remote: Total 360 (delta 60), reused 129 (delta 35), pack-reused 155[K
Receiving objects: 100% (360/360), 100.73 MiB | 24.91 MiB/s, done.
Resolving deltas: 100% (110/110), done.


In [2]:
!pip3 install pyspark
import pyspark

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, mean, when, lit, count, to_json
from pyspark.sql.window import Window
from pyspark.sql.functions import mean
from pyspark.sql.functions import mean
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import collect_list

import pandas as pd
import matplotlib.pyplot as plt
import duckdb

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=d8c31cf0e7e309fde83d6cbe17f900f7eb769cc326ecafecefe4a30c3c92aeed
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
# Create (or reuse) the Spark session, registering the DuckDB JDBC jar via `spark.jars`.
spark = (
    SparkSession.builder
    .appName("Preprocessing")
    .config("spark.jars", "/content/betterlifebetterhealth/src/utils/duckdb.jar")
    .getOrCreate()
)

# Read the `join_table` table from the exploitation-zone DuckDB file over JDBC.
join = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:duckdb:/content/betterlifebetterhealth/data/db/exploitation_zone.db",
        driver="org.duckdb.DuckDBDriver",
        dbtable="join_table",
    )
    .load()
)

### **Classificació dels països per regió:**
Afegim una relació per dividir Europa en 4 zones: Europa de l'Est, Europa del Sud, Europa Occidental i Europa del Nord.

1. Europa del Nord: Suècia, Finlàndia, Noruega, Dinamarca, Islàndia, Estònia, Letònia, Lituània
2. Europa de l'Est: Polònia, Eslovàquia, República Txeca, Hongria, Romania, Bulgària, Bielorússia, Ucraïna, Moldàvia, Rússia, Albània, Kosovo, Macedònia del Nord, Montenegro, Sèrbia, Bòsnia i Hercegovina
3. Europa del Sud: Itàlia, Espanya, Portugal, Grècia, Turquia, Xipre, Malta, Croàcia, Eslovènia
4. Europa Occidental: Alemanya, França, Bèlgica, Països Baixos, Àustria, Suïssa, Luxemburg, Regne Unit, Irlanda

In [4]:
# Country names per region (Asia is listed alongside the four European zones)
north_europe = [
    'Sweden', 'Finland', 'Norway', 'Denmark', 'Iceland', 'Estonia', 'Latvia', 'Lithuania',
]
east_europe = [
    'Poland', 'Slovakia', 'Hungary', 'Romania', 'Bulgaria', 'Belarus', 'Ukraine', 'Albania',
    'Moldova', 'Czech Republic', 'Russia', 'North Macedonia', 'Montenegro', 'Serbia',
    'Bosnia and Herzegovina',
]
south_europe = [
    'Italy', 'Spain', 'Portugal', 'Greece', 'Cyprus', 'Malta', 'Croatia', 'Slovenia',
]
west_europe = [
    'Germany', 'France', 'Belgium', 'Netherlands', 'Austria', 'Switzerland', 'Luxembourg',
    'United Kingdom', 'Ireland',
]
asia = [
    'Turkey', 'Israel', 'Georgia', 'Azerbaijan', 'Armenia', 'Kazakhstan', 'Uzbekistan',
    'Turkmenistan', 'Tajikistan', 'Kyrgyzstan',
]


def assign_region(country):
    """Return the region label for a country name.

    The region lists are checked in the order north, east, south, west, Asia;
    a country found in none of them is labelled 'Other'.
    """
    labelled_groups = (
        ('Europe North', north_europe),
        ('Europe East', east_europe),
        ('Europe South', south_europe),
        ('Europe West', west_europe),
        ('Asia', asia),
    )
    for label, members in labelled_groups:
        if country in members:
            return label
    return 'Other'

# Expose `assign_region` to Spark as a string-returning UDF
region_udf = udf(assign_region, returnType=StringType())

# Add a 'Region' column derived from each row's 'Country' value
join = join.withColumn(
    colName='Region',
    col=region_udf(join['Country']),
)

### **Pertanyença a UE:**
Afegim relació per distingir els que pertanyen a la Unió Europea i els que no.

In [5]:
# Names of the countries treated as European Union members
eu_countries = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus',
    'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France',
    'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy',
    'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden',
]


def is_eu_member(country):
    """Return 'EU Member' when `country` is in `eu_countries`, otherwise 'Non-EU Member'."""
    if country in eu_countries:
        return 'EU Member'
    return 'Non-EU Member'

# Expose `is_eu_member` to Spark as a string-returning UDF
eu_member_udf = udf(is_eu_member, returnType=StringType())

# Add an 'EU Membership' column derived from each row's 'Country' value
join = join.withColumn(
    colName='EU Membership',
    col=eu_member_udf(join['Country']),
)

### **Pertanyença a NATO:**



In [6]:
# NATO member countries. Includes Finland (joined 2023) and Sweden (joined 2024),
# which would otherwise be labelled as non-members.
nato_countries = [
    'Albania', 'Belgium', 'Bulgaria', 'Canada', 'Croatia', 'Czech Republic', 'Denmark',
    'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Italy',
    'Latvia', 'Lithuania', 'Luxembourg', 'Montenegro', 'Netherlands', 'North Macedonia',
    'Norway', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden',
    'Turkey', 'United Kingdom', 'United States'
]


def is_nato_member(country):
    """Classify a country by NATO membership.

    Args:
        country: Country name, spelled as in `nato_countries`. Any value not
            in the list (including None) is treated as a non-member.

    Returns:
        'NATO Member' if `country` is in `nato_countries`, else 'Non-NATO Member'.
    """
    return 'NATO Member' if country in nato_countries else 'Non-NATO Member'

# Expose `is_nato_member` to Spark as a string-returning UDF
nato_member_udf = udf(is_nato_member, returnType=StringType())

# Add a 'NATO Membership' column derived from each row's 'Country' value
join = join.withColumn(
    colName='NATO Membership',
    col=nato_member_udf(join['Country']),
)


### **Països que fan frontera:**

In [7]:
# Load the country-borders CSV from the cloned repository, using its first row as column names
fronteres = spark.read.csv(
    '/content/betterlifebetterhealth/data/csv/GEODATASOURCE-COUNTRY-BORDERS.CSV',
    header="true",
)

In [8]:
# One row per country, with the names of all its bordering countries collected into a list
borders_grouped = (
    fronteres
    .groupBy("country_name")
    .agg(collect_list("country_border_name").alias("border_countries"))
)

In [9]:
# Left join so that every row of `join` is kept, matching 'Country' against the borders table's 'country_name'
join = join.join(
    other=borders_grouped,
    on=join['Country'] == borders_grouped['country_name'],
    how='left',
)

In [10]:
# 'country_name' was only the join key from the borders table; it was matched against 'Country', so drop it
join = join.drop("country_name")

In [16]:
# Replace the list column with its JSON string form
# (presumably because the JDBC write below cannot store an array column — TODO confirm)
join = join.withColumn(
    "border_countries",
    to_json(col("border_countries")),
)

In [20]:
# Persist the enriched table as `join_new_relations` in a new DuckDB file.
# The JDBC URL uses a relative path and no explicit save mode is set.
(
    join.write
    .format("jdbc")
    .options(
        url="jdbc:duckdb:exploitation_zone_2.db",
        dbtable="join_new_relations",
        driver="org.duckdb.DuckDBDriver",
    )
    .save()
)


### **Guardar db a github:**

In [21]:
# Clone the repository
%cd betterlifebetterhealth/data/db

# Copy new files from your local directory to this directory
!cp -R /content/*.db .

/content/betterlifebetterhealth/data/db


In [22]:
# CODI A LA DOCUMENTACIÓ; NO S'HA POSAT AQUÍ PER PRIVACITAT DE L'USUARI

[main 50cd8a3] Update db files new
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 data/db/exploitation_zone_2.db
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 2 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 197.17 KiB | 5.19 MiB/s, done.
Total 5 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/OscarMoliina/betterlifebetterhealth.git
   1fa8450..50cd8a3  main -> main
