# **Data Collectors and Landing Zone**

### *Required packages:*

In [1]:
# One-time setup: uncomment these lines to install the third-party packages
# this notebook relies on (opendatasets, the kaggle CLI and openpyxl).
# NOTE(review): `DataFrame.to_parquet`, used in the last cell, also needs a
# parquet engine such as pyarrow or fastparquet — confirm one is installed.
# !pip install opendatasets
# !pip install kaggle
# !pip install openpyxl

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Collecting kaggle
  Downloading kaggle-1.6.6.tar.gz (84 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.6/84.6 KB[0m [31m831.5 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting tqdm
  Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 KB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting click
  Using cached click-8.1.7-py3-none-any.whl (97 kB)
Collecting bleach
  Downloading bleach-6.1.0-py3-none-any.whl (162 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.8/162.8 KB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting certifi
  Downloading certifi-2024.2.2-py3-none-any.whl (163 kB)
[2K 

In [10]:
import numpy as np
import pandas as pd
import requests
import os
import opendatasets as od
import csv
import openpyxl

### Kaggle Configuration - Drive

(Requires the Kaggle API token file `kaggle_fitxer.json` to be present in the same directory as this notebook.)

In [11]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/home/oscarmolina/.kaggle’: File exists


In [12]:
!cp ./kaggle_fitxer.json ~/.kaggle/kaggle.json

In [13]:
!cd ~/.kaggle && ls

kaggle.json


### CODE

#### Downloading datasets

##### UNECE

In [14]:
# URL del dataset UNECE
unece_db = pd.read_json('https://query.data.world/s/qd6kjqguuvb573mffbpfysgpudi2pv?dws=00000')
unece_db.to_csv('unece_db.csv', index=False)
!mv ./unece_db.csv ../data/unece_db.csv

In [21]:
# Display the column labels of the UNECE DataFrame.
unece_db.columns

Index(['Country', 'Year', 'Area (square kilometres)', 'Total population',
       'Population density, pers. per sq. km', 'Population aged 0-14, male',
       'Population aged 0-14, female', 'Population aged 15-64, male',
       'Population aged 15-64, female', 'Population aged 64+, male',
       'Population aged 64+, female', 'Total population, male (%)',
       'Total population, female (%)', 'Life expectancy at birth, women',
       'Life expectancy at birth, men', 'Life expectancy at age 65, women',
       'Life expectancy at age 65, men', 'Total fertility rate',
       'Adolescent fertility rate',
       'Mean age of women at birth of first child',
       'Computer use, 16-24, male', 'Computer use, 16-24, female',
       'Computer use, 25-54, male', 'Computer use, 25-54, female',
       'Computer use, 55-74, male', 'Computer use, 55-74, female',
       'Women in the Labour Force, Percent of corresponding total for both sexes',
       'Female part-time employment, percent of both se

##### Kaggle - Mental Health

In [17]:
# API Kaggle Mental Health
!kaggle datasets download  -d 'thedevastator/uncover-global-trends-in-mental-health-disorder'

Downloading uncover-global-trends-in-mental-health-disorder.zip to /home/oscarmolina/UNI/BDA/betterlifebetterhealth/src
 81%|██████████████████████████████▌       | 1.00M/1.24M [00:00<00:00, 1.50MB/s]
100%|██████████████████████████████████████| 1.24M/1.24M [00:00<00:00, 1.80MB/s]


In [19]:
#!sudo apt-get install unzip
!unzip uncover-global-trends-in-mental-health-disorder.zip -d ../data/
!rm uncover-global-trends-in-mental-health-disorder.zip

Archive:  uncover-global-trends-in-mental-health-disorder.zip
  inflating: ../data/Mental health Depression disorder Data.csv  


##### USA Census

In [20]:
# API United States Census
variables = "NAME","YR","AREA_KM2","CBR","CDR","DEATHS","E0","GRR","IMR","MEDAGE","MR0_4","POP_DENS","genc standard countries and areas"
url_base = "https://api.census.gov/data/timeseries/idb/5year?get=NAME,YR,AREA_KM2,CBR,CDR,DEATHS,E0,GRR,IMR,MEDAGE,MR0_4,POP_DENS&for=genc%20standard%20countries%20and%20areas:*".format(variables)

# Llista per guardar les dades
data = []

# Fem una sol·licitud a l'API per agafar dades des de l'any 1990 fins al 2019
for year in range(1990, 2019):
    url = "{}&time={}".format(url_base, year)
    response = requests.get(url)
    if response.status_code == 200:
        data.extend(response.json()[1:])

# Guardar los datos en un archivo CSV
csv_filename = "dades_demogr_census.csv"
with open(csv_filename, "w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["NAME","YR","AREA_KM2","CBR","CDR","DEATHS","E0","GRR","IMR","MEDAGE","MR0_4","POP_DENS","genc standard countries and areas", "code"])
    csv_writer.writerows(data)

print("Les dades s'han guardat a", csv_filename)

!mv ./dades_demogr_census.csv ../data/dades_demogr_census.csv


Les dades s'han guardat a dades_demogr_census.csv


#### Rename

In [22]:
!mv ../data/dades_demogr_census.csv ../data/demography.csv
!mv ../data/unece_db.csv ../data/society.csv
#!mv ../data/Mental health Depression disorder Data.csv ../data/mental_health.csv

#### Storing files in a suitable format (Parquet)


In [58]:
def _csv_to_parquet(stem, **read_options):
    """Read ``../data/<stem>.csv`` and write it back as ``../data/<stem>.parquet``.

    Extra keyword arguments are forwarded to ``pd.read_csv``.
    Returns the DataFrame that was read.
    """
    frame = pd.read_csv('../data/' + stem + '.csv', **read_options)
    frame.to_parquet('../data/' + stem + '.parquet')
    return frame


# UNECE dataset
unece_db = _csv_to_parquet('society')

# Kaggle mental-health dataset (read with ',' as the decimal separator)
kaggle_db = _csv_to_parquet('mental_health', decimal=',')

# United States Census dataset
us_census_db = _csv_to_parquet('demography')

  kaggle_db = pd.read_csv('/content/data/Mental health Depression disorder Data.csv', decimal=',')
