# **Discretització i càlcul de variables**

## *Requisits d'execució*

In [1]:
!pip3 install pyspark rdflib duckdb

# Importació de mòduls de pyspark
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType, StringType, FloatType
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline

# Importació de mòduls de rdflib
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS, XSD

# Importacions generals de Python
import pandas as pd
import matplotlib.pyplot as plt
import duckdb
from decimal import Decimal, InvalidOperation
import json
from urllib.parse import quote

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=cbdadd9131467df4f872539b52b0edb057e64354d2200859cf5ef6f107848605
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
# Clone the project repository; the Spark session created further down loads
# the DuckDB JDBC driver jar from inside this checkout.
!git clone https://github.com/OscarMoliina/betterlifebetterhealth.git

Cloning into 'betterlifebetterhealth'...
remote: Enumerating objects: 409, done.[K
remote: Counting objects: 100% (254/254), done.[K
remote: Compressing objects: 100% (213/213), done.[K
remote: Total 409 (delta 73), reused 144 (delta 40), pack-reused 155[K
Receiving objects: 100% (409/409), 104.07 MiB | 29.68 MiB/s, done.
Resolving deltas: 100% (123/123), done.


In [4]:
# Start (or reuse) a Spark session with the DuckDB JDBC driver jar registered.
spark = (
    SparkSession.builder
    .appName("Afegir dades")
    .config("spark.jars", "/content/betterlifebetterhealth/src/utils/duckdb.jar")
    .getOrCreate()
)

# Read the `relations_added` table from the DuckDB database file over JDBC.
# NOTE(review): the database file name is spelled "explotaiton" here; it is
# reproduced as-is -- confirm it matches the file actually present on disk.
jdbc_options = {
    "url": "jdbc:duckdb:/content/explotaiton_zone_e2.db",
    "driver": "org.duckdb.DuckDBDriver",
    "dbtable": "relations_added",
}
result = spark.read.format("jdbc").options(**jdbc_options).load()



## *Discretització de les variables*

In [5]:
# `df` is the working DataFrame that the following cells reassign step by step;
# `result` keeps pointing at the table exactly as it was loaded.
df = result

In [6]:
# Columns whose values are replaced by one of three equal-width categories.
columns_to_discretize = ['Area_Km2', 'CBR', 'CDR', 'Medage', 'Pop_Dens', 'Total population', 'Unemployment rate', 'Economic activity rate']

# Fetch the minimum and maximum of every column in a single Spark job instead
# of running two separate collect() calls per column. The Spark functions are
# reached through the `F` alias imported at the top of the notebook, so this
# cell does not depend on a star import from a previous kernel session.
bounds = df.agg(
    *[F.min(F.col(name)).alias(f"min_{i}") for i, name in enumerate(columns_to_discretize)],
    *[F.max(F.col(name)).alias(f"max_{i}") for i, name in enumerate(columns_to_discretize)],
).first()

# Overwrite each column with its category label.
# NOTE: this cell is not idempotent -- once a column holds labels, re-running
# the cell fails when converting the label strings to float.
for i, column in enumerate(columns_to_discretize):
    min_col = float(bounds[f"min_{i}"])
    max_col = float(bounds[f"max_{i}"])

    # Split [min, max] into three intervals of equal width.
    range_width = (max_col - min_col) / 3.0
    first_threshold = min_col + range_width
    second_threshold = min_col + 2 * range_width

    # NOTE(review): a null value satisfies neither comparison and therefore
    # falls through to 'valor_alt' -- confirm this is the intended handling.
    df = df.withColumn(column,
                       F.when(F.col(column) <= first_threshold, F.lit('valor_baix'))
                       .when(F.col(column) <= second_threshold, F.lit('valor_mig'))
                       .otherwise(F.lit('valor_alt')))


## *Càlcul de l'índex de poder de la dona*

In [7]:
# Indicator columns that are added to the female power index.
positive = ['Female tertiary students, percent of total', 'Female legislators, senior officials and managers, percent of total', 'Female professionals, percent of total for both sexes',
 'Female members of parliament, percent of total','Female plant and machine operators and assemblers, percent of total for both sexes', 'Women in the Labour Force, Percent of corresponding total for both sexes']

# Indicator columns that are subtracted from the female power index.
negative = ['Female clerks, percent of total for both sexes', 'Female craft and related workers, percent of total for both sexes']

def calculate_female_power(*cols):
    """Return the female power index for one record.

    Args:
        *cols: the record's values for the ``positive`` columns followed by
            its values for the ``negative`` columns, in list order.

    Returns:
        float: sum of the positive values minus the sum of the negative
        values, or ``None`` when any input value is ``None``.
    """
    # A missing indicator makes the index undefined; propagate the null
    # instead of letting sum() raise a TypeError on None.
    if any(value is None for value in cols):
        return None
    split = len(positive)
    # Always hand back a plain float: the function is registered as a UDF with
    # a FloatType return type, and int or Decimal results would not match it.
    return float(sum(cols[:split]) - sum(cols[split:]))

# Register the UDF and add the index as the 'Female_Power' column. `udf` and
# `col` are reached through the `F` alias imported at the top of the notebook;
# the bare names are never imported and raise NameError on a fresh kernel.
calculate_female_power_udf = F.udf(calculate_female_power, FloatType())
df = df.withColumn('Female_Power', calculate_female_power_udf(*[F.col(c) for c in positive + negative]))
df.show()

+-------+----+----------+----------+---------+-------+-----+---------+-----+----------+----+------------------+--------------------+--------------------+---------------------+----------------------+------------------+-------------------------+----------------+----------------------------------+--------------------------+----------------------------+-----------------------------------------+------------------------------------------------------------------------+------------------------------------------+-------------------------------------------------------------------+-----------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------+----------------------------------------------------------------------------------+----------------------------------------------+-----------------------------+-----------------+-----------------------+-----------------------------------------------+------

In [8]:
# The raw indicator columns are no longer needed once 'Female_Power' exists.
df = df.drop(*positive, *negative)

## *Càlcul del GDP_Main*

In [9]:
# Every column whose name contains the 'GDP:' marker.
columnes_GDP = [c for c in df.columns if 'GDP:' in c]

# Label of each GDP column: the text after "GDP: in " and before the first
# comma. Computed once here instead of being re-parsed for every row.
gdp_categories = [name.split(",")[0].split("GDP: in ")[-1].strip() for name in columnes_GDP]

def calculate_gdp_main(*cols):
    """Return the label of the GDP column holding the largest value.

    Args:
        *cols: one value per entry of ``columnes_GDP``, in the same order.

    Returns:
        str | None: the label of the column with the largest strictly
        positive value; ``None`` when every value is ``None`` or <= 0.
        On ties the first column wins.
    """
    max_value = 0
    category = None
    for label, col_value in zip(gdp_categories, cols):
        if col_value is not None and col_value > max_value:
            max_value = col_value
            category = label
    return category

# Register the UDF, derive 'GDP_Main' and drop the source columns. `udf` and
# `col` are reached through the `F` alias imported at the top of the notebook;
# the bare names are never imported and raise NameError on a fresh kernel.
calculate_gdp_main_udf = F.udf(calculate_gdp_main, StringType())
df = df.withColumn('GDP_Main', calculate_gdp_main_udf(*[F.col(c) for c in columnes_GDP]))
df = df.drop(*columnes_GDP)

## *Codificar Països*

Cal substituir els espais del nom del país per guions baixos per estalviar-nos errors de lectura.

In [10]:
# Replace every space in the country name with an underscore.
# `regexp_replace` is reached through the imported `F` alias; the bare name is
# never imported in this notebook and raises NameError on a fresh kernel.
df = df.withColumn('Country', F.regexp_replace('Country', ' ', '_'))

## *Esborrar variables que no utilitzarem*

In [11]:
# Columns that are not used from here on.
columns_to_drop = [
    'Deaths',
    'E0',
    'MR0_4',
    'GSCA',
    'Schizophrenia (%)',
    'Bipolar disorder (%)',
    'Eating disorders (%)',
    'Anxiety disorders (%)',
    'Drug use disorders (%)',
    'Depression (%)',
    'Alcohol use disorders (%)',
    'Population density, pers per sq km',
    'Total population, male (%)',
    'Total population, female (%)',
    'Mean age of women at birth of first child',
    'Total employment, growth rate',
    'Youth unemployment rate',
    'GDP at current prices and PPPs, millions of US$',
    'GDP per capita at current prices and PPPs, US$',
    'Final consumption expenditure per capita, US Dollars, current PPPs',
    'Purchasing power parity (PPP), NCU per US$',
    'Consumer price index, growth rate',
    'Export of goods and services, per cent of GDP',
    'Import of goods and services, per cent of GDP',
    'External balance on goods and services, per cent of GDP',
    'Population aged 0-14',
    'Population aged 15-64',
    'Population aged 64+',
    'Life expectancy at birth',
    'Life expectancy at age 65',
]

df = df.drop(*columns_to_drop)
df.show()

+-------+----+----------+----------+---------+---------+----------+----------------+-----------------+----------------------+-------------+--------------------+------------+-------------+---------------+--------------------+-----------+----------+------------+--------------------+
|Country|Year|  Area_Km2|       CBR|      CDR|   Medage|  Pop_Dens|Total population|Unemployment rate|Economic activity rate|pais_exportat| quantitat_exportada|      Region|EU Membership|NATO Membership|    border_countries|Destination|Immigrants|Female_Power|            GDP_Main|
+-------+----+----------+----------+---------+---------+----------+----------------+-----------------+----------------------+-------------+--------------------+------------+-------------+---------------+--------------------+-----------+----------+------------+--------------------+
| Sweden|2016|valor_baix|valor_baix|valor_mig|valor_alt|valor_baix|      valor_baix|       valor_baix|             valor_alt|       France|2.5424241566166

## *Tractament de la variable Immigrants*

In [12]:
# Compute the largest and smallest value of the 'Immigrants' column.
# F.max / F.min are the Spark aggregates; the bare names `max` / `min` resolve
# to the Python builtins on a fresh kernel, which operate on the characters of
# the string "Immigrants" and return a plain str without an `.alias` method.
max_min_immigrants = df.select(
    F.max("Immigrants").alias("Max Immigrants"),
    F.min("Immigrants").alias("Min Immigrants")
)

# Show the result
max_min_immigrants.show()


+--------------+--------------+
|Max Immigrants|Min Immigrants|
+--------------+--------------+
|          9978|             0|
+--------------+--------------+



In [13]:
# Cut-off above which a record is flagged as having many immigrants.
# NOTE(review): presumably picked as roughly the midpoint of the observed
# 0-9978 range shown above -- confirm.
IMMIGRANTS_THRESHOLD = 5000

# Replace the numeric 'Immigrants' column with a boolean flag. Despite its
# name, 'Num_Immigrants' holds True/False; a null count yields False.
# `when`, `col` and `lit` are reached through the imported `F` alias; the bare
# names are never imported and raise NameError on a fresh kernel.
df = df.withColumn("Num_Immigrants", F.when(F.col("Immigrants") > IMMIGRANTS_THRESHOLD, F.lit(True)).otherwise(F.lit(False)))
df = df.drop("Immigrants")


# **Ontologia**

In [16]:
# Namespace under which every class and property of the ontology lives.
onto_ns = Namespace("http://entrega_2_ojj.org/onto#")

# Graph that collects the ontology triples.
g = Graph()

# Class URIs (attribute access on a Namespace already yields a URIRef).
year_class = onto_ns.Year
country_class = onto_ns.Country
medage_class = onto_ns.Medage
CBR_class = onto_ns.CBR
CDR_class = onto_ns.CDR
FP_class = onto_ns.FemalePower
EU_class = onto_ns.EU_Membership
NATO_class = onto_ns.NATO_Membership
region_class = onto_ns.region
area_class = onto_ns.Area_Km2
TotalPop_class = onto_ns.Total_Pop
PopDens_class = onto_ns.Pop_Dens
EcoRate_class = onto_ns.Economic_Activity_Rate
Unemployment_class = onto_ns.Unemployment
GDP_Main_class = onto_ns.GDP_Main
# Same URI as region_class; both names are kept because both are referenced.
Region_class = region_class

# Classes declared as subclasses of Country, in declaration order.
# NOTE(review): rdfs:subClassOf states that every instance of the subclass is
# also an instance of the superclass (e.g. every Medage is a Country, every
# Country is a Year) -- confirm this hierarchy is what the model intends.
country_subclasses = [
    medage_class, CBR_class, CDR_class, EU_class, FP_class, NATO_class,
    region_class, area_class, TotalPop_class, PopDens_class, EcoRate_class,
    Unemployment_class, GDP_Main_class,
]

# Declare every class, then the subclass relations.
for onto_class in [year_class, country_class] + country_subclasses:
    g.add((onto_class, RDF.type, RDFS.Class))

g.add((country_class, RDFS.subClassOf, year_class))
for onto_class in country_subclasses:
    g.add((onto_class, RDFS.subClassOf, country_class))

# Properties as (name, domain, range); `properties` keeps the original
# name -> {'domain', 'range'} mapping.
property_signatures = [
    ('in', year_class, country_class),
    ('hasMedAge', country_class, medage_class),
    ('hasCBR', country_class, CBR_class),
    ('hasCDR', country_class, CDR_class),
    ('hasFP', country_class, FP_class),
    ('hasEUStatus', country_class, EU_class),
    ('inRegion', country_class, region_class),
    ('hasArea', country_class, area_class),
    ('hasTotalPopulation', country_class, TotalPop_class),
    ('hasPopulationDensity', country_class, PopDens_class),
    ('hasEconomicActivityRate', country_class, EcoRate_class),
    ('hasUnemploymentRate', country_class, Unemployment_class),
    ('hasGDPMain', country_class, GDP_Main_class),
    ('borders_with', country_class, country_class),
    ('belongs_to', country_class, NATO_class),
    ('exports_to', country_class, country_class),
    ('situated_in', country_class, Region_class),
    ('migrates_to', country_class, country_class),
]
properties = {
    name: {'domain': domain, 'range': range_}
    for name, domain, range_ in property_signatures
}

# Add each property to the graph together with its domain and range.
for prop, details in properties.items():
    prop_uri = onto_ns[prop]
    g.add((prop_uri, RDF.type, RDF.Property))
    g.add((prop_uri, RDFS.domain, details['domain']))
    g.add((prop_uri, RDFS.range, details['range']))

def encode_country_name(country):
    """Percent-encode a country name with ``urllib.parse.quote``.

    Args:
        country: the country name, or ``None``.

    Returns:
        The percent-encoded name, or ``None`` when ``country`` is ``None``.
    """
    if country is None:
        return None
    return quote(country)

# Spark UDF wrapper around encode_country_name. `udf` is reached through the
# imported `F` alias; the bare name is never imported in this notebook and
# raises NameError on a fresh kernel.
encode_udf = F.udf(encode_country_name, StringType())


In [20]:
# Serialize the graph with rdflib's 'xml' format (RDF/XML) and encode the
# resulting text as UTF-8 bytes.
owl_text = g.serialize(format='xml')
owl_data = owl_text.encode('utf-8')

# Write the bytes to the .owl file (binary mode, since the data is bytes).
with open('completed_ontology.owl', mode='wb') as owl_file:
    owl_file.write(owl_data)

In [23]:
# Serialize the graph as N-Triples and encode the resulting text as UTF-8 bytes.
nt_text = g.serialize(format='nt')
nt_data = nt_text.encode('utf-8')

# Write the bytes to the .nt file (binary mode, since the data is bytes).
with open('completed_ontology.nt', mode='wb') as nt_file:
    nt_file.write(nt_data)