# **EXPLOITATION ZONE**

In [1]:
# Fetch the project repository; later cells read the DuckDB JDBC jar and the
# trusted-zone database from paths inside this checkout.
!git clone https://github.com/OscarMoliina/betterlifebetterhealth.git

Cloning into 'betterlifebetterhealth'...
remote: Enumerating objects: 278, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (96/96), done.[K
remote: Total 278 (delta 38), reused 95 (delta 27), pack-reused 155[K
Receiving objects: 100% (278/278), 98.52 MiB | 20.45 MiB/s, done.
Resolving deltas: 100% (88/88), done.
Updating files: 100% (36/36), done.


In [2]:
!pip3 install pyspark
import pyspark

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, mean, when, lit, count
from pyspark.sql.window import Window
from pyspark.sql.functions import mean
from pyspark.sql.functions import mean
from pyspark.sql.types import IntegerType, DoubleType

import pandas as pd
import matplotlib.pyplot as plt
import duckdb

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=d8756cdfe3130239238fa8c0bacfd9b34c8e943fddbb125b1dcb28763ca199e5
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


**Connexió:**

In [5]:
# JDBC location of the trusted-zone DuckDB database and the driver class used to read it.
TRUSTED_ZONE_URL = "jdbc:duckdb:/content/betterlifebetterhealth/data/db/trusted_zone.db"
DUCKDB_DRIVER = "org.duckdb.DuckDBDriver"

# Spark session configured with the DuckDB JDBC jar shipped in the cloned repository.
spark = (
    SparkSession.builder
    .appName("Preprocessing")
    .config("spark.jars", "/content/betterlifebetterhealth/src/utils/duckdb.jar")
    .getOrCreate()
)


def read_trusted_table(table_name):
    """Load one table of the trusted-zone database as a Spark DataFrame.

    Args:
        table_name: Name of the table to read (passed as the JDBC ``dbtable`` option).

    Returns:
        The DataFrame produced by the Spark JDBC reader for that table.
    """
    return (
        spark.read
        .format("jdbc")
        .option("url", TRUSTED_ZONE_URL)
        .option("driver", DUCKDB_DRIVER)
        .option("dbtable", table_name)
        .load()
    )


# The three source tables joined later in the notebook.
dem = read_trusted_table("dem")
mh = read_trusted_table("mh")
soc = read_trusted_table("soc")

**RECONCILIACIÓ DE LES DADES**

El codi següent té una execució molt lenta, ja que ha d'anar fila per fila fent una petició a l'API. En el nostre cas, els països estan sempre en anglès en els 3 dataframes. Deixem el codi per si de cas, però no cal executar-lo tret que canviïn les fonts de dades.

In [None]:
"""
import requests

def get_wikidata_id(country_name):
    url = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbsearchentities',
        'language': 'en',
        'format': 'json',
        'search': country_name,
        'type': 'item'
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        results = response.json().get('search')
        if results:
            for result in results:
                if result.get('label').lower() == country_name.lower():
                    return result['id']
    return None

dem_pd = dem.toPandas()
soc_pd = soc.toPandas()
mh_pd = mh.toPandas()

dem_pd['Country'].apply(get_wikidata_id)
soc_pd['Country'].apply(get_wikidata_id)
mh_pd['Country'].apply(get_wikidata_id)

dem = dem_pd.to_sql('dem', conn, if_exists='fail')
soc = soc_pd.to_sql('soc', conn, if_exists='fail')
mh = mh_pd.to_sql('mh', conn, if_exists='fail')

"""

**JOIN**

In [None]:
# Inspect the column names of `mh` before joining; `Country` and `Year` are the join keys.
mh.columns

['Country',
 'Year',
 'Schizophrenia (%)',
 'Bipolar disorder (%)',
 'Eating disorders (%)',
 'Anxiety disorders (%)',
 'Drug use disorders (%)',
 'Depression (%)',
 'Alcohol use disorders (%)']

In [None]:
# Inspect the column names of `dem` before joining; `Country` and `Year` are the join keys.
dem.columns

['Country',
 'Year',
 'Area_Km2',
 'CBR',
 'CDR',
 'Deaths',
 'E0',
 'Medage',
 'MR0_4',
 'Pop_Dens',
 'GSCA']

In [None]:
# Inspect the column names of `soc` before joining; `Country` and `Year` are the join keys.
soc.columns

['Country',
 'Year',
 'Total population',
 'Total population, male (%)',
 'Total population, female (%)',
 'Mean age of women at birth of first child',
 'Women in the Labour Force, Percent of corresponding total for both sexes',
 'Female tertiary students, percent of total',
 'Female legislators, senior officials and managers, percent of total',
 'Female professionals, percent of total for both sexes',
 'Female clerks, percent of total for both sexes',
 'Female craft and related workers, percent of total for both sexes',
 'Female plant and machine operators and assemblers, percent of total for both sexes',
 'Female members of parliament, percent of total',
 'Total employment, growth rate',
 'Unemployment rate',
 'Youth unemployment rate',
 'GDP at current prices and PPPs, millions of US$',
 'GDP per capita at current prices and PPPs, US$',
 'Final consumption expenditure per capita, US Dollars, current PPPs',
 'Purchasing power parity (PPP), NCU per US$',
 'Consumer price index, growth

In [6]:
# Columns shared by all three tables; they are taken from the left side only so that
# they appear once in the joined output.
key_columns = ("Country", "Year")

# First join: match `dem` rows with `mh` rows on Country and Year, then keep every
# `dem` column followed by the non-key `mh` columns.
dem_mh_match = (dem["Country"] == mh["Country"]) & (dem["Year"] == mh["Year"])
joined_data = dem.join(mh, on=dem_mh_match, how="inner")
selected_columns = [dem[name] for name in dem.columns]
selected_columns.extend(mh[name] for name in mh.columns if name not in key_columns)
first_join = joined_data.select(*selected_columns)

# Second join: match the intermediate result with `soc` on the same keys, again
# dropping the duplicated key columns coming from the right-hand table.
first_soc_match = (first_join["Country"] == soc["Country"]) & (first_join["Year"] == soc["Year"])
final_join = first_join.join(soc, on=first_soc_match, how="inner")
selected_columns_final = [first_join[name] for name in first_join.columns]
selected_columns_final.extend(soc[name] for name in soc.columns if name not in key_columns)
result = final_join.select(*selected_columns_final)


In [7]:
# Persist the joined DataFrame as `join_table` in the exploitation-zone DuckDB file
# (the JDBC URL uses a relative file name).
# mode("overwrite") replaces an existing `join_table`; with the writer's default mode
# the save raises an error when the table already exists, so the cell could not be re-run.
result.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:duckdb:exploitation_zone.db") \
    .option("dbtable", "join_table") \
    .option("driver", "org.duckdb.DuckDBDriver") \
    .save()

**GUARDAR DB A GITHUB**

In [8]:
# Clone the repository
%cd betterlifebetterhealth/data/db

# Remove old files
!rm -rf exploitation_zone.db

# Copy new files from your local directory to this directory
!cp -R /content/*.db .

/content/betterlifebetterhealth/data/db


In [10]:
# The code for this step is in the project documentation; it is not included here for user privacy.