# **Discretització i càlcul de variables**

## *Requisits d'execució*

In [1]:
!pip3 install pyspark rdflib duckdb

# Importació de mòduls de pyspark
import pyspark
from pyspark.sql import SparkSession

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType, StringType, FloatType
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql.types import BooleanType


# Importació de mòduls de rdflib
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS, XSD

# Importacions generals de Python
import pandas as pd
import matplotlib.pyplot as plt
import duckdb
from decimal import Decimal, InvalidOperation
import json
from urllib.parse import quote

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
Collecting isodate<0.7.0,>=0.6.0 (from rdflib)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=9314e02c49a45fca5fa621cce91854a5077a51717739185c3ba101b7b67b11e9
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dd

In [2]:
# Clone the project repository; the Spark cell below loads the DuckDB JDBC jar and the database file from this checkout.
!git clone https://github.com/OscarMoliina/betterlifebetterhealth.git

Cloning into 'betterlifebetterhealth'...
remote: Enumerating objects: 456, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 456 (delta 8), reused 4 (delta 4), pack-reused 439[K
Receiving objects: 100% (456/456), 99.68 MiB | 17.38 MiB/s, done.
Resolving deltas: 100% (141/141), done.


In [3]:
# Start (or reuse) a Spark session that has the DuckDB JDBC driver jar configured.
spark = (
    SparkSession.builder
    .appName("Afegir dades")
    .config("spark.jars", "/content/betterlifebetterhealth/src/utils/duckdb.jar")
    .getOrCreate()
)

# Read the `relations_added` table of the DuckDB file through JDBC.
result = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:duckdb:/content/betterlifebetterhealth/data/db/explotaiton_zone_e2_def.db")
    .option("driver", "org.duckdb.DuckDBDriver")
    .option("dbtable", "relations_added")
    .load()
)



In [4]:
# Alias the loaded frame as `df`; the following cells rebind `df` step by step.
df = result

## *Càlcul de l'índex de poder de la dona*

In [6]:
# Compute a "female power" index for every record.
# Indicators that add to the index.
positive = [
    'Female tertiary students, percent of total',
    'Female legislators, senior officials and managers, percent of total',
    'Female professionals, percent of total for both sexes',
    'Female members of parliament, percent of total',
    'Female plant and machine operators and assemblers, percent of total for both sexes',
    'Women in the Labour Force, Percent of corresponding total for both sexes',
]

# Indicators that subtract from the index.
negative = [
    'Female clerks, percent of total for both sexes',
    'Female craft and related workers, percent of total for both sexes',
]


def calculate_female_power(*cols):
    """Return the sum of the positive indicators minus the sum of the negative ones.

    Args:
        *cols: the indicator values of one record, ordered as
            ``positive + negative``: the first ``len(positive)`` values are
            added and the remaining ones are subtracted.

    Returns:
        The index as a Python ``float``, or ``None`` when any indicator is
        missing (a null input yields a null index instead of a ``TypeError``).
    """
    # A missing indicator makes the index undefined; summing None would raise.
    if any(value is None for value in cols):
        return None

    split_at = len(positive)
    power = sum(cols[:split_at]) - sum(cols[split_at:])
    # The UDF is registered with FloatType, so always hand back a real float
    # (integer or Decimal inputs would otherwise not match the declared type).
    return float(power)

# Wrap the Python function as a Spark UDF with a float result.
calculate_female_power_udf = F.udf(calculate_female_power, FloatType())

# Feed the indicators in `positive + negative` order, which is the order the function expects.
df = df.withColumn(
    'Female_Power',
    calculate_female_power_udf(*(F.col(name) for name in positive + negative)),
)

In [7]:
# The source indicators are no longer needed once Female_Power exists.
df = df.drop(*positive, *negative)

## *Càlcul del GDP_Main*

In [8]:
# Every column whose name contains 'GDP:'; their order fixes the argument order of the UDF below.
columnes_GDP = [name for name in df.columns if 'GDP:' in name]

def calculate_gdp_main(*cols):
    """Return the label of the GDP column holding the largest value of a record.

    Args:
        *cols: the values of one record, in the same order as the
            module-level ``columnes_GDP`` list.

    Returns:
        The text found between ``"GDP: in "`` and the first comma of the
        winning column name, stripped of surrounding whitespace, or ``None``
        when no value is greater than 0 (null values are ignored).
    """
    best_value = 0
    best_index = None
    for index, value in enumerate(cols):
        # Skip nulls and anything that does not strictly beat the current best.
        if value is None or not (value > best_value):
            continue
        best_value, best_index = value, index

    if best_index is None:
        return None
    # Keep what precedes the first comma, then what follows "GDP: in ".
    return columnes_GDP[best_index].split(",")[0].split("GDP: in ")[-1].strip()

# Wrap the Python function as a Spark UDF with a string result.
calculate_gdp_main_udf = F.udf(calculate_gdp_main, StringType())

# Add the derived column, then discard the GDP columns it was computed from.
df = (
    df
    .withColumn('GDP_Main', calculate_gdp_main_udf(*(F.col(name) for name in columnes_GDP)))
    .drop(*columnes_GDP)
)

## *Codificar Països*

Cal treure els espais en el nom del país per estalviar-nos errors de lectura.

In [9]:
# Replace every space in the country name with an underscore.
df = df.withColumn(
    'Country',
    F.regexp_replace(F.col('Country'), ' ', '_'),
)

## *Tractament de la variable Immigrants*

In [10]:
# Largest and smallest value found in the 'Immigrants' column.
max_min_immigrants = df.agg(
    F.max("Immigrants").alias("Max Immigrants"),
    F.min("Immigrants").alias("Min Immigrants"),
)

# Print the single-row result.
max_min_immigrants.show()


+--------------+--------------+
|Max Immigrants|Min Immigrants|
+--------------+--------------+
|          9978|             0|
+--------------+--------------+



In [11]:
# Turn the immigrant count into a boolean flag: True above 5000, False in every other case
# (a null count does not satisfy the condition, so it also ends up False).
# The numeric column is dropped once the flag exists.
df = df.withColumn(
    "Num_Immigrants",
    F.when(F.col("Immigrants") > 5000, F.lit(True)).otherwise(F.lit(False)),
).drop("Immigrants")


In [12]:
# Preview the frame after the derived columns were added.
df.show()

+-------+----+--------+-----+-----+-------+-----+------+-----+--------+----+------------------+--------------------+--------------------+---------------------+----------------------+------------------+-------------------------+-------------------+----------------------------------+--------------------------+----------------------------+-----------------------------------------+-----------------------------+-----------------+-----------------------+-----------------------------------------------+----------------------------------------------+------------------------------------------------------------------+------------------------------------------+---------------------------------+---------------------------------------------+---------------------------------------------+-------------------------------------------------------+--------------------+---------------------+-------------------+------------------------+-------------------------+----------------------+-------------+----------

## *Esborrar variables que no utilitzarem*

In [13]:
# Columns that take no part in the ontology built below.
columns_to_drop = [
    'Deaths',
    'E0',
    'MR0_4',
    'GSCA',
    'Schizophrenia (%)',
    'Bipolar disorder (%)',
    'Eating disorders (%)',
    'Anxiety disorders (%)',
    'Drug use disorders (%)',
    'Depression (%)',
    'Alcohol use disorders (%)',
    'Population density, pers per sq km',
    'Total population, male (%)',
    'Total population, female (%)',
    'Mean age of women at birth of first child',
    'Total employment, growth rate',
    'Youth unemployment rate',
    'GDP at current prices and PPPs, millions of US$',
    'GDP per capita at current prices and PPPs, US$',
    'Final consumption expenditure per capita, US Dollars, current PPPs',
    'Purchasing power parity (PPP), NCU per US$',
    'Consumer price index, growth rate',
    'Export of goods and services, per cent of GDP',
    'Import of goods and services, per cent of GDP',
    'External balance on goods and services, per cent of GDP',
    'Population aged 0-14',
    'Population aged 15-64',
    'Population aged 64+',
    'Life expectancy at birth',
    'Life expectancy at age 65',
    'quantitat_exportada',
]

# Remove them and preview what is left.
df = df.drop(*columns_to_drop)
df.show()

+-------+----+--------+-----+-----+------+--------+-------------------+-----------------+----------------------+-------------+------------+-------------+---------------+--------------------+-----------+------------+--------------------+--------------+
|Country|Year|Area_Km2|  CBR|  CDR|Medage|Pop_Dens|   Total population|Unemployment rate|Economic activity rate|pais_exportat|      Region|EU Membership|NATO Membership|    border_countries|Destination|Female_Power|            GDP_Main|Num_Immigrants|
+-------+----+--------+-----+-----+------+--------+-------------------+-----------------+----------------------+-------------+------------+-------------+---------------+--------------------+-----------+------------+--------------------+--------------+
| Sweden|2016|  410335|11.78| 9.13|  41.0|    24.3|2.197927993360785E7|              6.9|     70.05823988193126|       France|Europe North|    EU Member|Non-NATO Member|["Finland","Norway"]|     Turkey|   161.96797|               trade|        

# **Ontologia**

In [14]:
# Namespace that prefixes every term of this ontology.
onto_ns = Namespace("http://entrega_2_ojj.org/onto#")

# Graph that receives the schema triples here and the instance triples later on.
g = Graph()

# One class URI per concept.
year_class = URIRef(onto_ns.Year)

country_class = URIRef(onto_ns.Country)
medage_class = URIRef(onto_ns.Medage)
CBR_class = URIRef(onto_ns.CBR)
CDR_class = URIRef(onto_ns.CDR)
FP_class = URIRef(onto_ns.FemalePower)
EU_class = URIRef(onto_ns.EU_Membership)
NATO_class = URIRef(onto_ns.NATO_Membership)
region_class = URIRef(onto_ns.region)
area_class = URIRef(onto_ns.Area_Km2)
TotalPop_class = URIRef(onto_ns.Total_Pop)
PopDens_class = URIRef(onto_ns.Pop_Dens)
EcoRate_class = URIRef(onto_ns.Economic_Activity_Rate)
Unemployment_class = URIRef(onto_ns.Unemployment)
GDP_Main_class = URIRef(onto_ns.GDP_Main)
# Region_class is built from the same term as region_class, so both names hold the same URI.
Region_class = URIRef(onto_ns.region)

# Classes attached below Country, listed in the order their triples are inserted.
country_attribute_classes = (
    medage_class,
    CBR_class,
    CDR_class,
    EU_class,
    FP_class,
    NATO_class,
    region_class,
    area_class,
    TotalPop_class,
    PopDens_class,
    EcoRate_class,
    Unemployment_class,
    GDP_Main_class,
    Region_class,
)

# Declare every class as an rdfs:Class.
for declared_class in (year_class, country_class) + country_attribute_classes:
    g.add((declared_class, RDF.type, RDFS.Class))

# Subclass hierarchy: Country sits under Year, every other class sits under Country.
g.add((country_class, RDFS.subClassOf, year_class))
for attribute_class in country_attribute_classes:
    g.add((attribute_class, RDFS.subClassOf, country_class))

# Property table: name -> domain and range classes.
# 'in' is the only property whose domain is Year; all the others start from Country.
properties = {'in': {'domain': year_class, 'range': country_class}}
properties.update({
    name: {'domain': country_class, 'range': range_class}
    for name, range_class in (
        ('hasMedAge', medage_class),
        ('hasCBR', CBR_class),
        ('hasCDR', CDR_class),
        ('hasFP', FP_class),
        ('hasEUStatus', EU_class),
        ('inRegion', region_class),
        ('hasArea', area_class),
        ('hasTotalPopulation', TotalPop_class),
        ('hasPopulationDensity', PopDens_class),
        ('hasEconomicActivityRate', EcoRate_class),
        ('hasUnemploymentRate', Unemployment_class),
        ('hasGDPMain', GDP_Main_class),
        ('borders_with', country_class),
        ('belongs_to', NATO_class),
        ('exports_to', country_class),
        ('situated_in', Region_class),
        ('migrates_to', country_class),
    )
})

# Register each property in the graph together with its domain and range.
for prop, details in properties.items():
    prop_uri = URIRef(onto_ns[prop])
    g.add((prop_uri, RDF.type, RDF.Property))
    g.add((prop_uri, RDFS.domain, details['domain']))
    g.add((prop_uri, RDFS.range, details['range']))

def encode_country_name(country):
    """Percent-encode a country name; ``None`` is passed through unchanged.

    Args:
        country: the name to encode, or ``None``.

    Returns:
        The result of ``urllib.parse.quote(country)``, or ``None`` when
        ``country`` is ``None``.
    """
    if country is None:
        return None
    return quote(country)

# Spark UDF wrapper around encode_country_name, declared with a string result.
encode_udf = F.udf(encode_country_name, StringType())


In [15]:
def replace_spaces_in_column_names(df):
    """Return ``df`` with every space in its column names turned into an underscore.

    Args:
        df: a frame exposing ``columns`` and ``withColumnRenamed(old, new)``.

    Returns:
        The frame obtained after renaming each column in turn.
    """
    renamed = df
    # The names are read from the frame as received; renames are applied one by one.
    for original_name in df.columns:
        renamed = renamed.withColumnRenamed(original_name, original_name.replace(" ", "_"))
    return renamed
# Rebind `df` to the frame whose column names contain no spaces.
df = replace_spaces_in_column_names(df)

In [16]:
# Bring the columns used to build the graph to the driver as a list of rows.
processed_data = df.select(
    'Country',
    'Year',
    'Area_Km2',
    'CBR',
    'CDR',
    'Medage',
    'Pop_Dens',
    'Total_population',
    'Unemployment_rate',
    'Economic_activity_rate',
    'pais_exportat',
    'Region',
    'EU_Membership',
    'NATO_Membership',
    'border_countries',
    'Destination',
    'Female_Power',
    'GDP_Main',
    'Num_Immigrants',
).collect()

def process_row(processed_data):
    """Add the triples describing every given row to the module-level graph ``g``.

    Despite its name, the function walks over all the rows it receives. For
    each row it types the country and the year, links the year to the country
    through ``in``, stores every non-null indicator as a typed literal on the
    country and links the country to each of its bordering countries.

    The function reads the module-level ``onto_ns``, ``country_class`` and
    ``year_class`` and mutates the module-level graph ``g``.

    Args:
        processed_data: iterable of rows exposing the attributes ``Country``,
            ``Year``, ``Area_Km2``, ``CBR``, ``CDR``, ``Medage``, ``Pop_Dens``,
            ``Total_population``, ``Unemployment_rate``,
            ``Economic_activity_rate``, ``Region``, ``EU_Membership``,
            ``NATO_Membership``, ``GDP_Main``, ``Female_Power``,
            ``Destination``, ``Num_Immigrants``, ``pais_exportat`` and
            ``border_countries`` (a JSON-encoded list of names, or empty).

    Returns:
        None.
    """
    # (row attribute, ontology property, literal datatype), in insertion order.
    literal_properties = (
        # The area used to be stored under 'hasMedAge', mixing it with the median age.
        ('Area_Km2', 'hasArea', XSD.decimal),
        ('CBR', 'hasCBR', XSD.double),
        ('CDR', 'hasCDR', XSD.double),
        ('Medage', 'hasMedAge', XSD.double),
        ('Pop_Dens', 'hasPopulationDensity', XSD.double),
        ('Total_population', 'hasTotalPopulation', XSD.double),
        ('Unemployment_rate', 'hasUnemploymentRate', XSD.double),
        ('Economic_activity_rate', 'hasEconomicActivityRate', XSD.double),
        ('Region', 'inRegion', XSD.string),
        ('EU_Membership', 'hasEUStatus', XSD.string),
        ('NATO_Membership', 'belongs_to', XSD.string),
        ('GDP_Main', 'hasGDPMain', XSD.string),
        ('Female_Power', 'hasFP', XSD.float),
    )

    for row in processed_data:
        country_uri = URIRef(onto_ns[quote(row.Country)])
        year_uri = URIRef(onto_ns[str(row.Year)])

        g.add((country_uri, RDF.type, country_class))
        g.add((year_uri, RDF.type, year_class))
        g.add((year_uri, URIRef(onto_ns['in']), country_uri))

        # Store each indicator only when the row actually has a value for it.
        for attribute, prop, datatype in literal_properties:
            value = getattr(row, attribute)
            if value is not None:
                g.add((country_uri, URIRef(onto_ns[prop]), Literal(value, datatype=datatype)))

        # The destination is recorded only for rows flagged by Num_Immigrants.
        if row.Destination is not None and row.Num_Immigrants:
            g.add((country_uri, URIRef(onto_ns['migrates_to']), Literal(row.Destination, datatype=XSD.string)))

        if row.pais_exportat is not None:
            g.add((country_uri, URIRef(onto_ns['exports_to']), Literal(row.pais_exportat, datatype=XSD.string)))

        # Bordering countries arrive as a JSON list of names.
        if row.border_countries:
            for border in json.loads(row.border_countries):
                # Country names had their spaces replaced by underscores earlier in the
                # notebook; normalise border names the same way so that a border URI
                # is the same node as the corresponding country URI.
                border_uri = URIRef(onto_ns[quote(border.replace(' ', '_'))])
                g.add((country_uri, URIRef(onto_ns['borders_with']), border_uri))


In [17]:
# Print the column names and Spark types of the frame.
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: decimal(20,0) (nullable = true)
 |-- Area_Km2: decimal(20,0) (nullable = true)
 |-- CBR: double (nullable = true)
 |-- CDR: double (nullable = true)
 |-- Medage: double (nullable = true)
 |-- Pop_Dens: double (nullable = true)
 |-- Total_population: double (nullable = true)
 |-- Unemployment_rate: double (nullable = true)
 |-- Economic_activity_rate: double (nullable = true)
 |-- pais_exportat: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- EU_Membership: string (nullable = true)
 |-- NATO_Membership: string (nullable = true)
 |-- border_countries: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- Female_Power: float (nullable = true)
 |-- GDP_Main: string (nullable = true)
 |-- Num_Immigrants: boolean (nullable = false)



In [18]:
# Populate the graph `g` with the triples of every collected row.
process_row(processed_data)

In [21]:
# Display the frame again; this cell does not modify it.
df.show()

+-------+----+--------+-----+-----+------+--------+-------------------+-----------------+----------------------+-------------+------------+-------------+---------------+--------------------+-----------+------------+--------------------+--------------+
|Country|Year|Area_Km2|  CBR|  CDR|Medage|Pop_Dens|   Total_population|Unemployment_rate|Economic_activity_rate|pais_exportat|      Region|EU_Membership|NATO_Membership|    border_countries|Destination|Female_Power|            GDP_Main|Num_Immigrants|
+-------+----+--------+-----+-----+------+--------+-------------------+-----------------+----------------------+-------------+------------+-------------+---------------+--------------------+-----------+------------+--------------------+--------------+
| Sweden|2016|  410335|11.78| 9.13|  41.0|    24.3|2.197927993360785E7|              6.9|     70.05823988193126|       France|Europe North|    EU Member|Non-NATO Member|["Finland","Norway"]|     Turkey|   161.96797|               trade|        

In [23]:
# Serialise the graph with rdflib's 'xml' format (RDF/XML, not the OWL/XML syntax)
owl_data = g.serialize(format='xml').encode('utf-8')  # Encode the serialised text as UTF-8 bytes

# Save the bytes to a file with an .owl extension
with open('ontologia_bda_2.owl', 'wb') as owl_file:  # 'wb' = binary write mode
    owl_file.write(owl_data)

In [24]:
# Serialise the graph in N-Triples format
nt_data = g.serialize(format='nt').encode('utf-8')  # Encode the serialised text as UTF-8 bytes

# Save the bytes to a .nt file
with open('ontologia_bda_2.nt', 'wb') as nt_file:  # 'wb' = binary write mode
    nt_file.write(nt_data)