# Extract country profiles from the DNSS application

## Load necessary python libraries and define working directory

This notebook uses the `pandas` Python library to read the list of country profiles, `urllib3` and `BeautifulSoup` to download and parse each profile page, and `json` to save the extracted content.

In [1]:
import json

from bs4 import BeautifulSoup
import urllib3
http = urllib3.PoolManager()


import pandas as pd
import math
import os 
import hashlib

import time

import datetime

# Folder the notebook is executed from. Because '__file__' is quoted it is a
# plain file name: realpath() resolves it against the current working
# directory and dirname() strips the file name off again.
dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

# Relative locations of the input data and of the generated outputs.
data_dir, output_dir = r'../data/', r'../output/'
print('data inputs dir: ' + data_dir)
print('outputs dir: ' + output_dir)


# Show the result of every expression statement in a cell, not only the
# last one; see:
# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

C:\Users\L.GonzalezMorales\Documents\GitHub\FOC-FPOS
data inputs dir: ../data/
outputs dir: ../output/


## Utilities

#### Disable insecure request warnings when using `urllib3`.

In [2]:
# Suppress urllib3's InsecureRequestWarning so it does not clutter the
# output of the scraping cells.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#### Compute a hash of a dictionary

In [3]:
def dict_hash(d):
    """Return an MD5 hex digest that identifies the contents of dictionary *d*.

    Keys and values are converted with ``str()`` before hashing, so the
    digest reflects their string representations (non-string keys are
    accepted). Items are visited in sorted key order, which makes equal
    dictionaries hash identically regardless of insertion order, and every
    key/value is length-prefixed so that adjacent strings cannot run
    together (``{'ab': 'c'}`` and ``{'a': 'bc'}`` get different digests).

    The digest is intended for de-duplication only, not for security.
    """
    out = hashlib.md5()
    for key, value in sorted(d.items(), key=lambda item: str(item[0])):
        for part in (key, value):
            encoded = str(part).encode('utf-8')
            # The length prefix keeps key/value boundaries unambiguous.
            out.update(b'%d:' % len(encoded))
            out.update(encoded)
    return out.hexdigest()


#### Get unique dictionaries in a list

In [4]:
def unique_dicts(dictionary_list):
    """Return the dictionaries of *dictionary_list* without duplicates.

    Two dictionaries count as duplicates when ``dict_hash`` gives them the
    same digest.
    """
    # Keyed by content hash: a repeated hash overwrites the stored
    # dictionary, so the last occurrence is kept, at the position where the
    # hash was first seen.
    by_hash = {dict_hash(entry): entry for entry in dictionary_list}
    return [*by_hash.values()]


#### Extract subset of key-value pairs from Python dictionary object

In [5]:
def subdict_list(dict_list, keys_list, exclude = False):
    """Project every dictionary of *dict_list* onto a subset of its keys.

    With ``exclude=False`` (default) each result holds exactly the keys in
    *keys_list*; a dictionary lacking one of them raises ``KeyError``.
    With ``exclude=True`` each result holds every key that is *not* in
    *keys_list*.

    Returns a new list of new dictionaries, one per input dictionary.
    """
    if exclude:
        return [
            {name: record[name] for name in record if name not in keys_list}
            for record in dict_list
        ]

    return [{name: record[name] for name in keys_list} for record in dict_list]




#### Select the dicts in a list whose value for a given key equals a given value

In [6]:
def select_dict(dict_list, k, v):
    """Return the dictionaries of *dict_list* whose value under key *k* equals *v*.

    The result is a new list that preserves the input order. Every
    dictionary must contain *k*; otherwise ``KeyError`` is raised.
    """
    return [entry for entry in dict_list if entry[k] == v]

## Read list of country profiles

In [7]:
# Read the 'Country Profiles' sheet. The context manager closes the workbook
# as soon as the sheet has been loaded instead of leaving the file open.
with pd.ExcelFile('All documents.xlsx') as xls:
    x = pd.read_excel(xls, 'Country Profiles').to_dict('index')

# `x` maps each row index to a {column: value} dictionary; keep only the row
# dictionaries, in row order.
country_profiles = list(x.values())

# Preview a few entries (displayed by the notebook).
country_profiles[1:4]

[{'DocumentNo': 7,
  'Country': 'Albania',
  'LastModified': Timestamp('2009-02-06 00:00:00'),
  'DocumentLink': 'https://unstats.un.org/unsd/dnss/docViewer.aspx?docID=563'},
 {'DocumentNo': 11,
  'Country': 'Algeria',
  'LastModified': Timestamp('2009-09-06 00:00:00'),
  'DocumentLink': 'https://unstats.un.org/unsd/dnss/docViewer.aspx?docID=564'},
 {'DocumentNo': 25,
  'Country': 'Andorra',
  'LastModified': Timestamp('2011-10-11 00:00:00'),
  'DocumentLink': 'https://unstats.un.org/unsd/dnss/docViewer.aspx?docID=558'}]

In [8]:
def get_doc(url, parser='html.parser'):
    """Download *url* and return it parsed as a ``BeautifulSoup`` document.

    Every ``<br>`` tag is replaced by a newline character so that line
    breaks are kept when the text of an element is extracted later.

    Args:
        url: Address of the page, fetched with the module-level ``http``
            pool manager.
        parser: Name of the parser handed to ``BeautifulSoup``. Naming it
            explicitly (the standard-library ``'html.parser'`` by default)
            avoids the "no parser was explicitly specified" warning and
            keeps the parse tree from depending on which optional parsers
            happen to be installed.

    Returns:
        The parsed document.
    """
    response = http.request('GET', url)
    doc = BeautifulSoup(response.data, parser)
    for br in doc.find_all("br"):
        br.replace_with("\n")
    return doc


In [9]:
def get_groups(doc):
    """Return the ``catGroupPanel`` ``<div>`` elements found in *doc*."""
    return doc.find_all('div', {'class': ['catGroupPanel']})

In [10]:
def parse_groups(groups):
    """Turn the category panels of a profile page into plain dictionaries.

    For each element of *groups* the function collects

    * the text of its ``catGroupTitle`` div (the last one if there are
      several),
    * the text of every ``docTitlePanel`` div (the sub-titles), and
    * the text of every div matched by ``{'class': None}`` (presumably the
      detail divs, which carry no class attribute -- verify against the
      page markup),

    and pairs sub-titles with details by position.

    Returns:
        A list with one ``{'title': ..., 'content': {subtitle: detail}}``
        dictionary per group. ``title`` is ``None`` when the group has no
        title div, and a sub-title without a matching detail maps to
        ``None``.
    """
    content = []

    for g in groups:
        # Reset for every group so that a panel without a title neither
        # fails on an unbound name nor inherits the previous panel's title.
        title_text = None
        for t in g.find_all('div', {'class': ['catGroupTitle']}):
            title_text = t.text

        subtitles_text = [
            s.text for s in g.find_all('div', {"class": ['docTitlePanel']})
        ]
        subtitles_detail_text = [
            sd.text for sd in g.find_all('div', {"class": None})
        ]

        # Pad the details so a panel with fewer details than sub-titles
        # yields None entries instead of raising IndexError.
        missing = len(subtitles_text) - len(subtitles_detail_text)
        subtitles_detail_text.extend([None] * max(missing, 0))

        content.append({
            'title': title_text,
            'content': dict(zip(subtitles_text, subtitles_detail_text)),
        })

    return content

In [11]:
# Take the document link of the first country profile as a sample URL; the
# bare `url` expression makes the notebook display it.
url = country_profiles[0]['DocumentLink']
url

'https://unstats.un.org/unsd/dnss/docViewer.aspx?docID=562'

In [None]:
#doc = get_doc(url)
#groups = get_groups(doc)
#doc_content = parse_groups(groups)

In [None]:
# Batch 1 of 3: download and parse the first 80 country profiles.
CountryProfiles1 = []

for cp in country_profiles[0:80]:
    cp_dict = {
        'Country': cp['Country'],
        'LastModified': cp['LastModified'],
        'url': cp['DocumentLink'],
        'Content': parse_groups(get_groups(get_doc(cp['DocumentLink']))),
    }
    print(cp_dict['Country'])
    CountryProfiles1.append(cp_dict)

    # Short pause between consecutive requests.
    time.sleep(0.5)

print('-----finished first block---')

# Store the modification dates as dd/mm/YYYY text before writing the JSON file.
for i in CountryProfiles1:
    i['LastModified'] = i['LastModified'].strftime('%d/%m/%Y')

with open('CountryProfiles1.json', 'w') as fout:
    json.dump(CountryProfiles1, fout, indent=4)

# Wait 20 minutes before continuing.
time.sleep(1200)


In [None]:
# Batch 2 of 3: download and parse country profiles 80 to 159.
CountryProfiles2 = []

for cp in country_profiles[80:160]:
    cp_dict = {}
    cp_dict['Country'] = cp['Country']
    cp_dict['LastModified'] = cp['LastModified']
    cp_dict['url'] = cp['DocumentLink']
    cp_dict['Content'] = parse_groups(get_groups(get_doc(cp['DocumentLink'])))
    print(cp_dict['Country'])
    print('lenght:', len(cp_dict['Content']))

    CountryProfiles2.append(cp_dict)

    # Short pause between consecutive requests.
    time.sleep(0.5)

print('-----finished second block---')

# Store the modification dates as dd/mm/YYYY text before writing the JSON file.
for i in CountryProfiles2:
    i['LastModified'] = i['LastModified'].strftime('%d/%m/%Y')

# This batch gets its own file: writing to 'CountryProfiles1.json' would
# overwrite the first batch, and the later merge step reads
# 'CountryProfiles2.json'.
with open('CountryProfiles2.json', 'w') as fout:
    json.dump(CountryProfiles2, fout, indent = 4)

#wait 20 minutes
time.sleep(1200)


In [12]:
# Batch 3 of 3: download and parse country profiles 160 to 239.
CountryProfiles3 = []

for cp in country_profiles[160:240]:
    cp_dict = {
        'Country': cp['Country'],
        'LastModified': cp['LastModified'],
        'url': cp['DocumentLink'],
        'Content': parse_groups(get_groups(get_doc(cp['DocumentLink']))),
    }
    print(cp_dict['Country'])
    print('lenght:', len(cp_dict['Content']))
    CountryProfiles3.append(cp_dict)

    # Short pause between consecutive requests.
    time.sleep(0.5)

print('-----finished third block---')

# Store the modification dates as dd/mm/YYYY text before writing the JSON file.
for i in CountryProfiles3:
    i['LastModified'] = i['LastModified'].strftime('%d/%m/%Y')

with open('CountryProfiles3.json', 'w') as fout:
    json.dump(CountryProfiles3, fout, indent=4)
    

Slovenia
lenght: 7
Solomon Islands
lenght: 4
South Africa
lenght: 6
Spain
lenght: 7
Sri Lanka
lenght: 7
Sudan
lenght: 6
Suriname
lenght: 7
Swaziland
lenght: 6
Sweden
lenght: 7
Switzerland
lenght: 7
Syrian Arab Republic
lenght: 6
Tajikistan
lenght: 5
Thailand
lenght: 7
The former Yugoslav Republic of Macedonia
lenght: 7
Timor-Leste
lenght: 7
Togo
lenght: 6
Tonga
lenght: 7
Trinidad and Tobago
lenght: 7
Tunisia
lenght: 7
Turkey
lenght: 7
Turkmenistan
lenght: 5
Tuvalu
lenght: 6
Uganda
lenght: 7
Ukraine
lenght: 7
United Kingdom
lenght: 6
United Republic of Tanzania
lenght: 6
United States of America
lenght: 7
Uruguay
lenght: 6
Uzbekistan
lenght: 6
Vanuatu
lenght: 7
Venezuela
lenght: 7
Viet Nam, Socialist Republic of
lenght: 6
Yemen
lenght: 7
Zambia
lenght: 4
Zimbabwe
lenght: 5
-----finished third block---


In [14]:
# Reload the three scraped batches from their JSON files, one after another.
with open('CountryProfiles1.json') as batch_file:
    CountryProfiles1 = json.load(batch_file)

with open('CountryProfiles2.json') as batch_file:
    CountryProfiles2 = json.load(batch_file)

with open('CountryProfiles3.json') as batch_file:
    CountryProfiles3 = json.load(batch_file)



In [15]:
# Merge the three batches into one list, keeping the batch order.
CountryProfiles = [*CountryProfiles1, *CountryProfiles2, *CountryProfiles3]

In [16]:
# Save the merged list of country profiles as a single JSON file.
with open('CountryProfiles.json', mode='w') as merged_file:
    json.dump(CountryProfiles, merged_file, indent=4)
