In [1]:
import os
import json
import time
import requests
import contextlib
import collections

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

from datetime import datetime

import pandas as pd
import numpy as np

import re
import urllib.parse

import matplotlib.pyplot as plt
import requests

In [2]:
from html import unescape
import unicodedata

In [3]:
def clean(text: str) -> str:
    """Decode HTML entities, apply NFKC normalization and tidy whitespace.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. Unescape HTML entities repeatedly until the text stops changing,
         so nested escapes such as ``&amp;amp;`` are fully decoded.
      3. Normalize to Unicode NFKC.
      4. Run a fixed sequence of regex substitutions: ``\\r`` becomes
         ``\\n``; runs of newlines collapse to one; spaces/tabs directly
         after a newline are dropped; runs of spaces/tabs become a single
         space; three or more newlines become two.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    decoded = text.strip()
    previous = None
    # Keep decoding until a pass leaves the text unchanged (fixed point).
    while previous != decoded:
        previous = decoded
        decoded = unescape(decoded)
    result = unicodedata.normalize("NFKC", decoded)
    # Order matters: every substitution consumes the previous one's output.
    whitespace_rules = (
        ("\r", "\n"),
        ("\n\n+", "\n"),
        ("\n[ \t]+", "\n"),
        ("[ \t]+", " "),
        ("\n\n\n+", "\n\n"),
    )
    for pattern, replacement in whitespace_rules:
        result = re.sub(pattern, replacement, result)
    return result

def strip_html(text: str) -> str:
    """Remove HTML tags from ``text``, turning ``<br>`` variants into newlines.

    The input is stripped of surrounding whitespace first. ``<br>``,
    ``<br/>`` and ``<br />`` (lowercase, as matched by the pattern) become
    ``"\\n"``. Every remaining tag is then deleted; the tag pattern treats
    quoted attribute values as a unit, so a ``>`` inside quotes does not
    end the tag early.

    Args:
        text: The HTML fragment.

    Returns:
        The text with tags removed.
    """
    with_line_breaks = re.sub(r"<br\s*/?\s*>", "\n", text.strip())
    return re.sub(
        r"<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>",
        "",
        with_line_breaks)

In [4]:
# Path of the JSON configuration file that get_config() reads (and creates
# from a template when it does not exist yet).
CONFIG_PATH = "config.json"
# CONFIG_PATH = "config_local.json"
# Optional cap on the number of rows handled by the ingestion cells;
# None disables the cap.
LIMIT = None
# LIMIT = 10
# Passed as `echo` to sa.create_engine() in get_engine().
VERBOSE = False

In [5]:
# Module-level caches shared by the helper functions below. Re-running this
# cell resets all of them.
# Parsed configuration; filled lazily by get_config().
CONFIG = None
# dbname -> (engine, MetaData) tuple; filled by get_engine().
ENGINES = {}
# (dbname, tablename) -> reflected sa.Table; filled by get_table().
TABLES = {}
# sa.Table -> engine; handed to the sessionmaker as `binds` in get_session().
BINDS = {}
# Cached sessionmaker; get_table() resets it to None when it adds a table.
SESSION = None


def config_template():
    """Build the placeholder configuration written when no config file exists.

    Returns:
        A dict with two sections: ``"dbs"`` holds one independent copy of
        the placeholder connection settings for each of ``login``, ``sm``,
        ``exp``, ``ap`` and ``blogs``; ``"nlpapi"`` holds placeholder host
        and credential values.
    """
    placeholder_conn = {
        "dialect": "postgresql",
        "host": "localhost",
        "port": 5432,
        "dbname": "INVALID",
        "schema": "public",
        "user": "INVALID",
        "passwd": "INVALID",
    }
    db_names = ("login", "sm", "exp", "ap", "blogs")
    nlpapi_section = {
        "host": "localhost",
        "token": "INVALID",
        "write_access": "INVALID",
    }
    return {
        # Each database gets its own dict so they can be edited separately.
        "dbs": {db_name: placeholder_conn.copy() for db_name in db_names},
        "nlpapi": nlpapi_section,
    }


def get_config():
    """Load the configuration from ``CONFIG_PATH`` and cache it in ``CONFIG``.

    On the first successful call the file is parsed, a copy with all
    secrets masked is printed, and the parsed dict is cached; later calls
    return the cached dict.

    The cache is only populated after the required sections have been
    accessed successfully. A file that lacks one of them therefore raises
    on every call instead of being cached half-validated and returned
    silently by subsequent calls.

    Returns:
        The parsed configuration dict (with real, unmasked secrets).

    Raises:
        ValueError: If the config file does not exist. A template file is
            written to ``CONFIG_PATH`` first so it can be filled in.
        KeyError: If the ``dbs`` section lacks one of ``login``, ``sm``,
            ``exp``, ``ap``, ``blogs`` or the ``nlpapi`` section is absent.
    """
    global CONFIG

    if CONFIG is not None:
        return CONFIG
    if not os.path.exists(CONFIG_PATH):
        with open(CONFIG_PATH, "w") as fout:
            print(json.dumps(config_template(), indent=4, sort_keys=True), file=fout)
        raise ValueError(
            f"config file missing. new file was created at '{CONFIG_PATH}'. "
            "please correct values in file and run again")
    with open(CONFIG_PATH, "r") as fin:
        loaded = json.load(fin)
    # Build a printable copy; indexing here doubles as validation of the
    # required sections and happens before anything is cached.
    config_out = {
        "dbs": {
            db_name: loaded["dbs"][db_name].copy()
            for db_name in ("login", "sm", "exp", "ap", "blogs")
        },
        "nlpapi": loaded["nlpapi"].copy(),
    }
    # Never echo credentials into the notebook output.
    for db_conf in config_out["dbs"].values():
        db_conf["passwd"] = "*****"
    config_out["nlpapi"]["token"] = "*****"
    config_out["nlpapi"]["write_access"] = "*****"
    print(f"loaded config\n{json.dumps(config_out, indent=2, sort_keys=True)}")
    CONFIG = loaded
    return CONFIG


def get_engine(dbname):
    """Return the cached ``(engine, MetaData)`` pair for a configured database.

    The connection URL is built with ``sqlalchemy.engine.URL.create`` from
    the raw config values rather than by string formatting, so user names,
    passwords, hosts and database names containing URL-special characters
    (``@``, ``/``, ``:``, ...) need no manual escaping.

    Args:
        dbname: Key of the database inside the ``dbs`` config section.

    Returns:
        A tuple ``(engine, metadata)``. The engine has a
        ``schema_translate_map`` that maps the default schema (``None``)
        to the configured ``schema``; ``metadata`` is a fresh
        ``sa.MetaData`` created together with the engine.

    Raises:
        KeyError: If ``dbname`` or one of its connection settings is
            missing from the configuration.
    """
    cached = ENGINES.get(dbname)
    if cached is not None:
        return cached
    db = get_config()["dbs"][dbname]
    url = sa.engine.URL.create(
        drivername=db["dialect"],
        username=db["user"],
        password=db["passwd"],
        host=db["host"],
        port=db["port"],
        database=db["dbname"],
    )
    engine = sa.create_engine(url, echo=VERBOSE)
    engine = engine.execution_options(
        schema_translate_map={None: db["schema"]})
    res = engine, sa.MetaData()
    ENGINES[dbname] = res
    return res


def get_table(dbname, tablename):
    """Return the reflected ``sa.Table`` for ``tablename`` in ``dbname``.

    Tables are cached per ``(dbname, tablename)``. On a cache miss the
    table is reflected through the database's engine, registered in
    ``BINDS`` against that engine, and the cached session factory is
    discarded so that get_session() creates a new one.

    Args:
        dbname: Key of the database inside the ``dbs`` config section.
        tablename: Name of the table to reflect.

    Returns:
        The ``sa.Table`` object.
    """
    global SESSION

    cache_key = (dbname, tablename)
    table = TABLES.get(cache_key)
    if table is None:
        # A new table changes the bind mapping; drop the old session factory.
        SESSION = None
        engine, metadata = get_engine(dbname)
        table = sa.Table(tablename, metadata, autoload_with=engine)
        TABLES[cache_key] = table
        BINDS[table] = engine
    return table


@contextlib.contextmanager
def get_session():
    """Context manager yielding a session bound to all tables in ``BINDS``.

    The session factory is created on first use (or after get_table()
    reset it) and cached in ``SESSION``. The session itself is opened with
    a ``with`` block around the ``yield``.

    Yields:
        The session object produced by the cached session factory.
    """
    global SESSION

    factory = SESSION
    if factory is None:
        factory = sessionmaker()
        factory.configure(binds=BINDS)
        SESSION = factory
    with factory() as db_session:
        yield db_session

In [6]:
def call_nlpapi(path: str, payload: dict, *, is_write: bool, is_print: bool) -> dict:
    """POST ``payload`` as JSON to the NLP API and return the decoded response.

    The URL is ``<scheme><host><path>`` where the host comes from the
    ``nlpapi`` config section; plain HTTP is used when the host contains
    ``"localhost"``, HTTPS otherwise. The configured ``token`` is always
    added to the body and ``write_access`` is added when ``is_write`` is
    true; both override same-named keys in ``payload``.

    Timeouts and HTTP error statuses are retried up to six times, sleeping
    10s, 20s, ... 60s between attempts; the seventh failure is re-raised.

    Args:
        path: API path appended to the host, e.g. ``"/api/add_embed"``.
        payload: JSON-serializable request fields.
        is_write: Whether to send the write-access credential.
        is_print: Whether to print the request URL.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.exceptions.Timeout, requests.exceptions.HTTPError: When
            the request still fails after all retries.
    """
    api_conf = get_config()["nlpapi"]
    host = api_conf["host"]
    scheme = "http://" if "localhost" in host else "https://"
    url = f"{scheme}{host}{path}"
    if is_print:
        print(url)
    max_retries = 6
    failures = 0
    while True:
        try:
            body = {**payload, "token": api_conf["token"]}
            if is_write:
                body["write_access"] = api_conf["write_access"]
            response = requests.post(url, json=body, timeout=900)
            response.raise_for_status()
            return response.json()
        except (requests.exceptions.Timeout, requests.exceptions.HTTPError):
            failures += 1
            if failures > max_retries:
                raise
            # Linear back-off: wait longer after every failed attempt.
            time.sleep(failures * 10)

In [7]:
# global tables
# NOTE: every get_table() call reflects the table from the live database
# (sa.Table(..., autoload_with=engine)), so this cell needs database access.
t_tags = get_table("login", "tags")
t_users = get_table("login", "users")

# solution mapping tables
t_sm_pads = get_table("sm", "pads")
t_sm_tagging = get_table("sm", "tagging")

# action plan tables
t_ap_pads = get_table("ap", "pads")
t_ap_tagging = get_table("ap", "tagging")

# experiments tables
t_exp_pads = get_table("exp", "pads")
t_exp_tagging = get_table("exp", "tagging")

loaded config
{
  "dbs": {
    "ap": {
      "dbname": "action_plans_platform",
      "dialect": "postgresql",
      "host": "acclabs-global-login.postgres.database.azure.com",
      "passwd": "*****",
      "port": 5432,
      "schema": "public",
      "user": "acclabshqadmin@acclabs-global-login"
    },
    "blogs": {
      "dbname": "blogs",
      "dialect": "postgresql",
      "host": "acclabs.postgres.database.azure.com",
      "passwd": "*****",
      "port": 5432,
      "schema": "public",
      "user": "undpacclab@acclabs"
    },
    "exp": {
      "dbname": "experiments_platform",
      "dialect": "postgresql",
      "host": "acclabs-global-login.postgres.database.azure.com",
      "passwd": "*****",
      "port": 5432,
      "schema": "public",
      "user": "acclabshqadmin@acclabs-global-login"
    },
    "login": {
      "dbname": "postgres",
      "dialect": "postgresql",
      "host": "acclabs-global-login.postgres.database.azure.com",
      "passwd": "*****",
      "port

  res = sa.Table(


KeyboardInterrupt: 

In [None]:
# Ad-hoc read-only query against /api/query_embed: asks the "main" db for up
# to 10 matches for "africa" with score_threshold 0.2, filtered to iso3 EGY.
# The request URL is printed (is_print=True).
call_nlpapi("/api/query_embed", {
    "input": "africa",
    "offset": 0,
    "limit": 10,
    "db": "main",
    "score_threshold": 0.2,
    "filters": {"iso3": ["EGY"]},
}, is_write=False, is_print=True)

In [None]:
# Lookup of tag names keyed by "<type>-<id>", read from the login db.
tags = {}
with get_session() as session:
    stmt = sa.select(t_tags.c.id, t_tags.c.name, t_tags.c.type)
    for tag_id, tag_name, tag_type in session.execute(stmt):
        tags[f"{tag_type}-{tag_id}"] = tag_name

In [None]:
# Labels for numeric pad status codes. The pad ingestion loop selects rows
# with status > 1 and indexes this dict directly, so a status code that is
# not listed here raises KeyError there.
status_map = {
    2: "preview",
    3: "public",
}

In [None]:
# Number of leading rows the ingestion loop counts but does not upload;
# set to a non-zero value to skip that many rows, e.g. when resuming a run.
# skip_ahead = 5250
skip_ahead = 0

In [None]:
# Upload all pads with status > 1 from the solutions, action plans and
# experiments databases to the NLP API via /api/add_embed.
#
# `count` runs across all three tables; the first `skip_ahead` rows are
# counted but not uploaded. LIMIT (if set) caps the rows read per table.
# Progress and average timing are printed for every row whose running count
# is a multiple of 100.
#
# `destination`, `names` and `dbs` are reused by later cells.
with get_session() as session:
    destination = "main"  # "test"
    names = ["solution", "actionplan", "experiment"]
    urls = [
        "https://solutions.sdg-innovation-commons.org/en/view/pad?id=",
        "https://learningplans.sdg-innovation-commons.org/en/view/pad?id=",
        "https://experiments.sdg-innovation-commons.org/en/view/pad?id=",
    ]
    dbs = [
        t_sm_pads,
        t_ap_pads,
        t_exp_pads,
    ]
    doc_types = [
        "solution",
        "action plan",
        "experiment",
    ]
    count = 0
    start_total_time = time.monotonic()
    try:
        for (name, url_prefix, pad_db, doc_type) in zip(names, urls, dbs, doc_types):
            stmt = sa.select(pad_db.c.status, pad_db.c.full_text, pad_db.c.id, pad_db.c.update_at, pad_db.c.title)
            stmt = stmt.where(pad_db.c.status > 1)
            stmt = stmt.order_by(pad_db.c.id)
            if LIMIT is not None:
                stmt = stmt.limit(LIMIT)
            start_time = time.monotonic()
            time_count = 0
            print(f"processing {name}")
            # Unpack by name instead of indexing the row positionally.
            for status_code, full_text, pad_id, update_at, title in session.execute(stmt):
                if count < skip_ahead:
                    count += 1
                    continue
                # NOTE(review): status codes other than 2 and 3 raise KeyError
                # here and abort the run -- confirm that is intended.
                status = status_map[int(status_code)]
                is_progress_row = count % 100 == 0
                if is_progress_row:
                    print(f"adding #{count} {name} {pad_id} {status} {update_at}")
                url = f"{url_prefix}{int(pad_id)}"
                if update_at:
                    date = datetime.fromisoformat(f"{update_at}").isoformat()
                else:
                    date = None
                # NOTE(review): a NULL full_text is sent as the literal string
                # "None"; the blog ingestion skips empty content instead --
                # confirm which behavior is wanted here.
                call_nlpapi("/api/add_embed", {
                    "input": f"{full_text}",
                    "base": f"{name}",
                    "doc_id": int(pad_id),
                    "url": url,
                    "title": title,
                    "meta": {
                        "status": status,
                        "date": date,
                        "doc_type": doc_type,
                    },
                    "db": destination,
                }, is_write=True, is_print=is_progress_row)
                time_count += 1
                if is_progress_row:
                    print(
                        f"took avg {(time.monotonic() - start_time) / time_count if time_count > 0 else '?'}s for {time_count}")
                    # Restart the timing window for the next batch.
                    start_time = time.monotonic()
                    time_count = 0
                count += 1
    finally:
        # Always report totals, even when the run is interrupted or fails.
        duration = time.monotonic() - start_total_time
        print(f"processed: {count} in {duration}s avg time {duration / (count - skip_ahead) if (count - skip_ahead) > 0 else '?'}s")

In [None]:
# Call /api/build_index with write access for the db named by `destination`
# (set in the pad ingestion cell above).
call_nlpapi(
    "/api/build_index", 
    {
        "db": destination,
    },
    is_write=True,
    is_print=True)

In [None]:
# Debug helper: print action plan pad 1883 and display what /api/extract
# returns for its full text (location and language modules).
# Relies on `names` and `dbs` from the pad ingestion cell.
with get_session() as session:
    for name, pad_db in zip(names, dbs):
        if name != "actionplan":
            continue
        stmt = sa.select(pad_db.c.status, pad_db.c.full_text, pad_db.c.id, pad_db.c.update_at, pad_db.c.title)
        stmt = stmt.where(pad_db.c.id == 1883)
        for pad_status, pad_text, pad_id, pad_update_at, pad_title in session.execute(stmt):
            print(f"name: {name}")
            print(f"id: {pad_id}")
            print(f"title: {pad_title}")
            print(f"status: {pad_status}")
            print(f"update_at: {pad_update_at}")
            print(f"full_text: {pad_text}")
            full_text = f"{pad_text}"
            extract_request = {
                "modules": [{"name": "location"}, {"name": "language"}],
                "input": full_text,
            }
            extract_result = call_nlpapi(
                "/api/extract", extract_request, is_write=False, is_print=True)
            display(extract_result)

In [None]:
# blog tables (reflected from the "blogs" database)
t_articles = get_table("blogs", "articles")
t_article_content = get_table("blogs", "article_content")
t_raw_html = get_table("blogs", "raw_html")

In [None]:
# Reset the skip counter for the blog ingestion below; it counts only the
# rows that pass the relevance/content filters.
# skip_ahead = 3699
skip_ahead = 0

In [None]:
# Upload blog articles to the NLP API via /api/add_embed.
#
# Articles are joined with their extracted content and raw HTML. Rows with
# relevance <= 1 or empty content are skipped and not counted. Of the
# remaining rows the first `skip_ahead` are counted but not uploaded, and
# LIMIT (if set) stops the run once `count` reaches it.
#
# NOTE: `destination` is NOT set in this cell (the assignment below is
# commented out); the value from the pad ingestion cell is used, so that
# cell must have been executed first.
# destination = "main"  # "test"
count = 0
start_total_time = time.monotonic()
try:
    with get_session() as session:
        start_time = time.monotonic()
        time_count = 0
        stmt = sa.select(
            t_articles.c.id,
            t_articles.c.url,
            t_articles.c.title,
            t_articles.c.posted_date,
            t_articles.c.article_type,
            t_articles.c.relevance,
            t_article_content.c.article_id,
            t_article_content.c.content,
            t_articles.c.posted_date_str,
            t_raw_html.c.article_id,
            t_raw_html.c.raw_html)
        stmt = stmt.where(sa.and_(t_article_content.c.article_id == t_articles.c.id, t_raw_html.c.article_id == t_articles.c.id))
        stmt = stmt.order_by(t_articles.c.id)
        # Unpack by name (same order as the select above) instead of
        # indexing the row positionally.
        for (
                article_id,
                article_url,
                article_title,
                posted_date,
                article_type,
                relevance,
                _content_article_id,
                raw_content,
                posted_date_str,
                _html_article_id,
                raw_html) in session.execute(stmt):
            # NOTE(review): a NULL relevance would raise TypeError here --
            # confirm the column cannot be NULL.
            if relevance <= 1:
                # print(f"skip {article_id} -- relevance <= 1 {relevance}")
                continue
            if not raw_content:
                continue
            content = f"{raw_content}".strip()
            if not content:
                continue
            if count < skip_ahead:
                count += 1
                continue
            if LIMIT is not None and count >= LIMIT:
                break
            status = "public"
            is_progress_row = count % 100 == 0
            if is_progress_row:
                print(f"adding #{count} blog {article_title} {status} {posted_date}")
            url = f"{article_url}".strip()
            if posted_date:
                date = datetime.fromisoformat(f"{posted_date}").isoformat()
            else:
                # No stored date: let the API derive one from the raw HTML
                # and the textual date.
                dates = call_nlpapi(
                    "/api/date",
                    {
                        "raw_html": raw_html,
                        "posted_date_str": posted_date_str,
                        "language": None,
                        "use_date_str": True,
                    },
                    is_write=False,
                    is_print=False)
                date = dates["date"]
            title = article_title
            if title:
                # Prepend the title so it is part of the embedded text.
                content = f"{title}\n\n{content}"
            call_nlpapi("/api/add_embed", {
                "input": content,
                "base": "blog",
                "doc_id": int(article_id),
                "url": url,
                "title": title,
                "meta": {
                    "status": status,
                    "date": date,
                    "doc_type": f"{article_type}",
                },
                "db": destination,
            }, is_write=True, is_print=is_progress_row)
            time_count += 1
            if is_progress_row:
                print(
                    f"took avg {(time.monotonic() - start_time) / time_count if time_count > 0 else '?'}s for {time_count}")
                # Restart the timing window for the next batch.
                start_time = time.monotonic()
                time_count = 0
            count += 1
finally:
    # Always report totals, even when the run is interrupted or fails.
    duration = time.monotonic() - start_total_time
    print(f"processed: {count} in {duration}s avg time {duration / (count - skip_ahead) if (count - skip_ahead) > 0 else '?'}s")

In [None]:
# Call /api/build_index with write access for the db named by `destination`
# after the blog ingestion.
call_nlpapi(
    "/api/build_index", 
    {
        "db": destination,
    },
    is_write=True,
    is_print=True)

In [None]:
# Debug helper: print article 14091 and the /api/snippify results for its
# text -- once for the full content, then twice for the first returned
# snippet (small snippets, and custom chunk size/padding).
with get_session() as session:
    stmt = sa.select(
        t_articles.c.id,
        t_articles.c.url,
        t_articles.c.title,
        t_articles.c.posted_date,
        t_articles.c.article_type,
        t_articles.c.relevance,
        t_article_content.c.article_id,
        t_article_content.c.content)
    stmt = stmt.where(sa.and_(t_article_content.c.article_id == t_articles.c.id, t_articles.c.id == 14091))
    stmt = stmt.order_by(t_articles.c.id)
    for (
            article_id,
            raw_url,
            raw_title,
            posted_date,
            article_type,
            relevance,
            _content_article_id,
            raw_content) in session.execute(stmt):
        # Values are stringified, so NULLs show up as "None"; treat those
        # (and "null") like empty strings.
        content = f"{raw_content}".strip()
        if not content or content.lower() in ("none", "null"):
            content = ""
        url = f"{raw_url}".strip()
        title = f"{raw_title}".strip()
        if not title or title.lower() in ("none", "null"):
            title = url
        content = f"{title}\n\n{content}"
        print(f"id: {article_id}")
        print(f"url: {url}")
        print(f"title: {title}")
        print(f"date: {posted_date}")
        print(f"type: {article_type}")
        print(f"relevance: {relevance}")
        print(f"content: {content[:100]}")
        # print(
        #     json.dumps(
        #         call_nlpapi(
        #             "/api/extract",
        #             {"modules": [{"name": "location"}, {"name": "language"}], "input": content},
        #             is_write=False,
        #             is_print=True),
        #         sort_keys=True,
        #         indent=2))
        snippets = call_nlpapi(
            "/api/snippify",
            {"input": content},
            is_write=False,
            is_print=True)
        print(json.dumps(snippets, sort_keys=True, indent=2))
        first_snippet = snippets["snippets"][0]
        shorts = call_nlpapi(
            "/api/snippify",
            {"input": first_snippet, "small_snippets": True},
            is_write=False,
            is_print=True)
        print(json.dumps(shorts, sort_keys=True, indent=2))
        stest = call_nlpapi(
            "/api/snippify",
            {"input": first_snippet, "chunk_size": 60, "chunk_padding": 20},
            is_write=False,
            is_print=True)
        print(json.dumps(stest, sort_keys=True, indent=2))

In [None]:
# Collect the stripped URL of every article.
# NOTE: this rebinds `urls`, which the pad ingestion cell used for its
# URL prefixes.
urls = []
with get_session() as session:
    stmt = sa.select(t_articles.c.url)
    urls.extend(f"{url_row[0]}".strip() for url_row in session.execute(stmt))
len(urls)

In [None]:
# Map URL-style country slugs (lowercase, spaces replaced by dashes) to the
# stripped iso3 value from countries.csv.
countries = pd.read_csv("countries.csv")
country_map = {
    country_name.lower().replace(" ", "-"): iso3.strip()
    for country_name, iso3 in zip(countries["name"], countries["iso3"])
}
len(country_map)

In [None]:
def process_url(country_set, found_countries, url):
    """Classify an article URL by the first path segment after the site root.

    The ``https://www.undp.org/`` prefix and at most one leading language
    segment are removed. The first remaining path segment is then handled
    as follows:

    * no ``/`` left in the path: the URL is ignored;
    * a known site section or regional hub: the URL is ignored;
    * a key of the module-level ``country_map``: the mapped value is
      added to ``found_countries``;
    * anything else: the original URL is appended to
      ``country_set[segment]`` for manual inspection.

    URLs on other hosts keep their scheme, so their first segment is
    ``"https:"``, which is in the ignore list.

    Args:
        country_set: Mapping from unrecognized segment to a list of URLs;
            must create missing lists on access (e.g. a
            ``defaultdict(list)``).
        found_countries: Set collecting the values of matched countries.
        url: The article URL.
    """
    site_root = "https://www.undp.org/"
    language_codes = (
        "ar", "az", "bs", "cnr", "da", "de", "es", "fi", "fr", "id", "ja", "ka", "kk", "km", "ko", "ku", "ky", "no",
        "pt", "ro", "ru", "sr", "sv", "tr", "uk", "uz", "vi", "zh")
    ignored_sections = {
        "https:",
        "about-us",
        "acceleratorlabs",
        "accountability",
        "careers",
        "energy",
        "publications",
        "events",
        "executive-board",
        "approvisionnement",
        "blog",
        "press-releases",
        "speeches",
        "policy-centre",
        "seoul-policy-centre",
        "node",
        "papp",
        "procurement",
        "rolhr",
        "romecentre",
        "sgtechcentre",
        "sites",
        "stories",
    }
    regional_sections = {
        "africa", "arab-states", "asia-pacific", "latin-america", "eurasia", "european-union", "geneva", "pacific"}

    path = url[len(site_root):] if url.startswith(site_root) else url
    # Drop only the first matching language prefix.
    for code in language_codes:
        lang_prefix = f"{code}/"
        if path.startswith(lang_prefix):
            path = path[len(lang_prefix):]
            break
    first_segment, slash, _rest = path.partition("/")
    if not slash:
        return
    if first_segment in ignored_sections or first_segment in regional_sections:
        return
    if first_segment in country_map:
        found_countries.add(country_map[first_segment])
    else:
        country_set[first_segment].append(url)

# Classify every collected article URL. `country_set` ends up holding the
# URLs whose first path segment was neither ignored nor found in
# country_map; it is displayed as the cell output for inspection.
country_set = collections.defaultdict(list)
found_countries = set()
for url in urls:
    process_url(country_set, found_countries, url)
country_set

In [None]:
# Number of distinct country_map values matched by the article URLs.
len(found_countries)

In [None]:
# Debug helper: print article 14091 and compare what /api/date returns
# with the stored date string and language versus with both set to None.
with get_session() as session:
    stmt = sa.select(
        t_articles.c.id,
        t_articles.c.url,
        t_articles.c.title,
        t_articles.c.posted_date_str,
        t_articles.c.language,
        t_articles.c.relevance,
        t_raw_html.c.article_id,
        t_raw_html.c.raw_html)
    stmt = stmt.where(sa.and_(t_raw_html.c.article_id == t_articles.c.id, t_articles.c.id == 14091))
    stmt = stmt.order_by(t_articles.c.id)
    for (
            article_id,
            article_url,
            article_title,
            posted_date_str,
            language,
            relevance,
            _html_article_id,
            raw_html) in session.execute(stmt):
        print(f"id: {article_id}")
        print(f"url: {article_url}")
        print(f"title: {article_title}")
        print(f"date: {posted_date_str}")
        print(f"language: {language}")
        print(f"relevance: {relevance}")
        print(f"content length: {len(raw_html)}")
        # First with the stored hints (either can be None), then without.
        hint_variants = (
            (posted_date_str, language),
            (None, None),
        )
        for date_hint, language_hint in hint_variants:
            dates = call_nlpapi(
                "/api/date",
                {
                    "raw_html": raw_html,
                    "posted_date_str": date_hint,
                    "language": language_hint,
                    "use_date_str": True,
                },
                is_write=False,
                is_print=True)
            print(json.dumps(dates, sort_keys=True, indent=2))