# Query narratives

In this notebook we will:
 1. Obtain a list of all narrative titles by thematic area
 2. Obtain a list of all "tag" references, together with the narrative and section in which each one appears

## Load required libraries

In [1]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import sys
from copy import deepcopy
import getpass
from arcgis.gis import GIS

# Make the project's helper modules importable from this notebook.
# see: https://stackoverflow.com/questions/4383571/importing-files-from-different-folder
# Insert at position 1: position 0 is the script path (or '' in a REPL) and
# must keep priority during module lookup.
sys.path.insert(1, '../scripts')

# Project-local helpers; must be imported after the sys.path tweak above.
import utils


# Ask IPython to run every top-level expression of a cell interactively
# instead of only the last one.
# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

  pd.datetime,


## Set path

In [2]:
# Root folder of the narratives, relative to this notebook. It is scanned for
# one sub-folder per thematic area, and the Excel outputs of this notebook are
# written to the same location.
path = "../narratives/"

## Get list of story maps in ArcGIS Online

In [3]:
# Portal to log in to.
online_connection = "https://www.arcgis.com"

# Credentials are requested interactively so that they are never stored in
# the notebook; getpass keeps the password from being echoed.
online_username = input('Username: ')
online_password = getpass.getpass('Password: ')

# Authenticated session used by the cells below.
gis_online_connection = GIS(
    url=online_connection,
    username=online_username,
    password=online_password,
)



Username:  unstats_admin
Password:  ············


In [4]:
# Build `arcgis_narratives`: one dict per item found in the ArcGIS Online
# folder that holds the narratives, then export the list to Excel.
#
# Every record carries the same keys (type, narrative_id, narrative_title,
# narrative_theme, narrative_sdgs, narrative_beijing, title, url, item_id,
# tags). Metadata that cannot be read from the item is stored as None, so
# later lookups on 'narrative_id' never hit a missing key.

# NOTE(review): presumably the id of the folder queried below; it is not used
# in this cell - confirm whether it is still needed.
narratives_folder='f92458348e234be39392ee8c6515c4ae'

# The gallery application lives in the same folder but is not a narrative.
gallery_url = 'https://undesa.maps.arcgis.com/apps/FilterGallery/index.html?appid=2501891d28164237b903c1a5ff31621f'
# Narrative URLs consist of this prefix followed by the application id.
map_journal_prefix = 'https://undesa.maps.arcgis.com/apps/MapJournal/index.html?appid='

# Each item description is expected to hold an HTML list whose entries appear
# in this order, each starting with the given label.
description_fields = (
    ('narrative_id', "Narrative ID: "),
    ('narrative_title', "Narrative Title: "),
    ('narrative_theme', "Theme: "),
    ('narrative_sdgs', "SDG indicators: "),
    ('narrative_beijing', "Beijing objectives: "),
)

user = gis_online_connection.users.get('unstats_admin')
user_items = user.items(folder="World's Women 2020 Narratives", max_items=800)

arcgis_narratives = []
for item in user_items:

    if item.url == gallery_url:
        continue

    d = dict()
    d['type'] = item.type

    # Items without a description, or with fewer list entries than expected,
    # get None for the missing fields instead of raising IndexError or
    # leaving the keys out of the record.
    desc_list = []
    if item.description:
        description = BeautifulSoup(item.description, "html.parser")
        desc_list = description.find_all('li')

    for position, (key, label) in enumerate(description_fields):
        if position < len(desc_list):
            d[key] = desc_list[position].get_text().replace(label, "")
        else:
            d[key] = None

    d['title'] = item.title
    d['url'] = item.url
    # Some items have no URL at all; avoid calling .replace on None.
    d['item_id'] = item.url.replace(map_journal_prefix, '') if item.url else None
    d['tags'] = item.tags

    arcgis_narratives.append(d)


pd.DataFrame(arcgis_narratives).to_excel(
    os.path.join(path, 'arcgis_narratives.xlsx'), index=False)

## Walk through narrative folders

In [5]:

# Build `narrative_catalogue`: one record per narrative folder, i.e. per
# sub-folder of each thematic-area folder found under `path`.

# Every directory directly under `path` is a thematic area, except the
# templates folder. Filtering (rather than list.remove) keeps the cell
# working when '_templates' does not exist.
thematic_areas = [f.name for f in os.scandir(path)
                  if f.is_dir() and f.name != '_templates']
print(thematic_areas)

narrative_catalogue = []

for ta in thematic_areas:
    # os.path.join avoids the doubled separator that plain concatenation
    # produces when `path` already ends with '/'.
    for n in [f.name for f in os.scandir(os.path.join(path, ta)) if f.is_dir()]:
        narrative_catalogue.append({
            'thematic_area_id': ta,
            'narrative_id': n,
        })
        
        

['ND', 'NE', 'NH', 'NP', 'NV', 'NW']


In [6]:
# Parse a single narrative (ND1) so that its structure can be inspected
# interactively before processing the whole catalogue.
file = '../narratives/ND/ND1/index.html'

# Read the document first, then parse it once the file is closed.
with open(file, encoding="utf8") as f:
    html_text = f.read()

narrative = BeautifulSoup(html_text, "html.parser")

# Uncomment to look at the first <div> of the document:
#print(narrative.div)
        


## Extract list of narrative references

In [7]:
# Build `narrative_references`: one record per <a class="narrative-ref"> link
# found in any narrative, enriched with the ArcGIS item of the narrative the
# link points to, then export the list to Excel.

narrative_references = []

# (output column, key in the matching `arcgis_narratives` record). All of
# these columns are None when the referenced narrative has no ArcGIS item.
arcgis_ref_columns = (
    ('n_ref_id2', 'narrative_id'),
    ('narrative_title2', 'narrative_title'),
    ('narrative_theme', 'narrative_theme'),
    ('item_id', 'item_id'),
    ('url', 'url'),
)

for n in narrative_catalogue:
    # Built from `path` so this cell reads from the same root folder that was
    # walked to create the catalogue.
    file = os.path.join(path, n['thematic_area_id'], n['narrative_id'], 'index.html')

    # Only the read needs the file; it is closed before any processing.
    with open(file, 'r', encoding="utf8") as f:
        narrative = BeautifulSoup(f.read(), "html.parser")

    # find() returns None when the narrative has no title <div>; record a
    # missing title instead of aborting the whole export.
    title_div = narrative.find('div', {'class':'title'})
    n_title = title_div.get_text() if title_div is not None else None

    # Find all narrative references:
    for n_ref in narrative.find_all('a', {'class':'narrative-ref'}):

        d = dict()
        d['thematic_area_id'] = n['thematic_area_id']
        d['narrative_id'] = n['narrative_id']
        d['narrative_title'] = n_title
        d['n_ref_id'] = n_ref.attrs['href']
        d['n_ref_text'] = n_ref.get_text()

        # The href has the form '#<narrative id>'.
        # NOTE(review): utils.select_dict presumably returns the records whose
        # fields equal the given values - confirm against scripts/utils.py.
        ref_arcgis = utils.select_dict(arcgis_narratives, {'narrative_id':n_ref.attrs['href'].replace('#','')})
        match = ref_arcgis[0] if len(ref_arcgis)>0 else None

        for column, key in arcgis_ref_columns:
            d[column] = match[key] if match is not None else None

        narrative_references.append(d)

pd.DataFrame(narrative_references).to_excel(
    os.path.join(path, 'narrative_references.xlsx'), index=False)
            

## Extract list of tagged paragraphs

In [8]:
# Build `tags`: one record per <span class="label-ref"> found in any
# narrative, together with an HTML excerpt of the element that contains it,
# then export the list to Excel.

tags = []

for n in narrative_catalogue:
    # Built from `path` so this cell reads from the same root folder that was
    # walked to create the catalogue.
    file = os.path.join(path, n['thematic_area_id'], n['narrative_id'], 'index.html')

    # Only the read needs the file; it is closed before any processing.
    with open(file, 'r', encoding="utf8") as f:
        narrative = BeautifulSoup(f.read(), "html.parser")

    # find() returns None when the narrative has no title <div>; record a
    # missing title instead of aborting the whole export.
    title_div = narrative.find('div', {'class':'title'})
    n_title = title_div.get_text() if title_div is not None else None

    # The narrative's own URL is the same for all of its tags: look it up once.
    # NOTE(review): utils.select_dict presumably returns the records whose
    # fields equal the given values - confirm against scripts/utils.py.
    narrative_data = utils.select_dict(arcgis_narratives, {'narrative_id': n['narrative_id']})
    narrative_url = narrative_data[0]['url'] if len(narrative_data)>0 else None

    # Find all tagged spans:
    for n_tag in narrative.find_all('span', {'class':'label-ref'}):
        tag_text = n_tag.get_text()

        d = dict()
        d['thematic_area_id'] = n['thematic_area_id']
        d['narrative_id'] = n['narrative_id']
        d['narrative_title'] = n_title
        d['tag_id'] = n_tag.attrs['tag']
        d['n_ref_text'] = tag_text
        d['url'] = narrative_url

        # Work on a copy of the enclosing element so that the clean-up below
        # does not alter the parsed narrative used for the remaining tags.
        parent = deepcopy(n_tag.parent)

        # remove footnotes:
        for footnote in parent.find_all('span', {'class':'footnote-index'}):
            footnote.decompose()

        # remove <strong> </strong> tags (their content is kept):
        for strong in parent.find_all('strong'):
            strong.unwrap()

        # highlight the current tag: wrap its text in <strong>
        for span in parent.find_all('span', {'class':'label-ref'}):
            if span.string == tag_text:
                span.string.wrap(narrative.new_tag("strong"))

        # remove all label-ref <span> </span> wrappers (their content is kept):
        for span in parent.find_all('span', {'class':'label-ref'}):
            span.unwrap()

        # ------------  link to actual narrative's url -----------------
        # Re-parse the inner HTML of the cleaned element, then point every
        # in-document narrative link ('#<narrative id>') to the URL of the
        # corresponding ArcGIS item, when one is known.
        excerpt = BeautifulSoup(''.join(str(i) for i in parent.contents), "html.parser")

        for r in excerpt.find_all('a', {'class':'narrative-ref'}):

            ref_arcgis = utils.select_dict(arcgis_narratives, {'narrative_id':r.attrs['href'].replace('#','')})

            if len(ref_arcgis)>0:
                r.attrs['href'] = ref_arcgis[0]['url']

        # Store the excerpt as an HTML string: a spreadsheet cell cannot hold
        # a BeautifulSoup object.
        d['excerpt'] = str(excerpt)

        tags.append(d)

pd.DataFrame(tags).to_excel(
    os.path.join(path, 'narrative_tags.xlsx'), index=False)
            