In [1]:
import os
import json
import time
import requests
import contextlib

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

import pandas as pd
import numpy as np

import re
import urllib.parse

import matplotlib.pyplot as plt
import requests

In [2]:
from html import unescape
import unicodedata

In [3]:
def clean(text: str) -> str:
    """Normalize free text: unescape HTML entities, apply NFKC, tidy whitespace.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. Repeatedly HTML-unescape until the text stops changing, so doubly
         escaped entities such as ``&amp;amp;`` are fully decoded.
      3. Apply Unicode NFKC normalization.
      4. Run the whitespace substitutions listed below, one after another.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.strip()

    # Keep decoding until a pass leaves the text untouched.
    previous = None
    while previous != text:
        previous = text
        text = unescape(text)

    text = unicodedata.normalize("NFKC", text)

    # The order matters: each substitution works on the output of the one
    # before it.
    whitespace_passes = (
        ("\r", "\n"),          # carriage returns become newlines
        ("\n\n+", "\n"),       # runs of newlines collapse to one
        ("\n[ \t]+", "\n"),    # drop indentation right after a newline
        ("[ \t]+", " "),       # runs of spaces/tabs collapse to one space
        ("\n\n\n+", "\n\n"),   # cap newline runs re-created above at two
    )
    for pattern, replacement in whitespace_passes:
        text = re.sub(pattern, replacement, text)
    return text

def strip_html(text: str) -> str:
    """Remove HTML tags from ``text``, turning ``<br>`` variants into newlines.

    The input is stripped of surrounding whitespace first. Lowercase ``<br>``,
    ``<br/>`` and ``<br />`` become ``"\\n"``; every remaining tag is deleted.
    The tag pattern allows ``>`` characters inside quoted attribute values.

    Args:
        text: A string that may contain HTML markup.

    Returns:
        The text with tags removed.
    """
    line_break = r"<br\s*/?\s*>"
    any_tag = r"<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"

    with_newlines = re.sub(line_break, "\n", text.strip())
    return re.sub(any_tag, "", with_newlines)

In [4]:
# Location of the JSON file holding database and NLP API settings.
CONFIG_PATH: str = "config.json"
# Passed as ``echo`` to ``sa.create_engine`` when engines are created.
VERBOSE: bool = False

In [5]:
# Lazily populated module-level caches used by the helpers below.
# CONFIG:  parsed config file (set by get_config)
# SESSION: sessionmaker instance (set by get_session, reset by get_table)
CONFIG = SESSION = None
# ENGINES: dbname -> (engine, MetaData)
# TABLES:  (dbname, tablename) -> reflected sa.Table
# BINDS:   sa.Table -> engine, handed to the sessionmaker
ENGINES, TABLES, BINDS = {}, {}, {}


def config_template():
    """Build the placeholder configuration written when no config file exists.

    Returns:
        A dict with a ``"dbs"`` section (one independent connection dict per
        database name) and an ``"nlpapi"`` section. Values that must be
        filled in by the user are set to ``"INVALID"``.
    """
    db_names = ("login", "sm", "exp", "ap", "blogs")
    placeholder_conn = dict(
        dialect="postgresql",
        host="localhost",
        port=5432,
        dbname="INVALID",
        schema="public",
        user="INVALID",
        passwd="INVALID",
    )
    template = {}
    # Each database gets its own copy so entries can be edited separately.
    template["dbs"] = {db: dict(placeholder_conn) for db in db_names}
    template["nlpapi"] = dict(
        host="localhost",
        token="INVALID",
        write_access="INVALID",
    )
    return template


def get_config():
    """Load the configuration from ``CONFIG_PATH``, caching it in ``CONFIG``.

    On the first successful load a summary of the configuration is printed
    with every database password and both NLP API secrets replaced by
    ``"*****"``. Later calls return the cached dict without printing.

    If the file does not exist, a template is written to ``CONFIG_PATH`` and
    a ``ValueError`` asks the user to fill it in.

    Returns:
        The parsed configuration dict (unmasked).

    Raises:
        ValueError: if the config file was missing (a template is created).
        KeyError: if the file lacks the ``"dbs"`` or ``"nlpapi"`` section.
    """
    global CONFIG

    if CONFIG is not None:
        return CONFIG
    if not os.path.exists(CONFIG_PATH):
        with open(CONFIG_PATH, "w") as fout:
            print(json.dumps(config_template(), indent=4, sort_keys=True), file=fout)
        raise ValueError(
            f"config file missing. new file was created at '{CONFIG_PATH}'. "
            "please correct values in file and run again")
    with open(CONFIG_PATH, "r") as fin:
        config = json.load(fin)

    mask = "*****"
    # Mask the password of *every* configured database. A hard-coded list of
    # names is easy to leave incomplete, which would print a real password.
    config_out = {
        "dbs": {
            name: {**conn, "passwd": mask}
            for name, conn in config["dbs"].items()
        },
        "nlpapi": {**config["nlpapi"], "token": mask, "write_access": mask},
    }
    print(f"loaded config\n{json.dumps(config_out, indent=2, sort_keys=True)}")

    # Cache only after the config proved well-formed, so a broken file is
    # reported on every call instead of being returned silently next time.
    CONFIG = config
    return CONFIG


def get_engine(dbname):
    """Return the cached ``(engine, metadata)`` pair for ``dbname``.

    On first use the connection settings are read from the ``"dbs"`` section
    of the config, an engine is created and paired with a fresh
    ``sa.MetaData``; the pair is stored in ``ENGINES`` for later calls.

    Args:
        dbname: Key of the database in the config's ``"dbs"`` section.

    Returns:
        A tuple ``(engine, metadata)``.
    """
    cached = ENGINES.get(dbname)
    if cached is not None:
        return cached

    db = get_config()["dbs"][dbname]
    # Credentials may contain characters such as '@', so they are
    # percent-encoded before being placed in the URL.
    user = urllib.parse.quote_plus(db["user"])
    passwd = urllib.parse.quote_plus(db["passwd"])
    url = f"{db['dialect']}://{user}:{passwd}@{db['host']}:{db['port']}/{db['dbname']}"

    # Tables declared without a schema are translated to the configured one.
    schema_map = {None: db['schema']}
    engine = sa.create_engine(url, echo=VERBOSE).execution_options(
        schema_translate_map=schema_map)

    pair = (engine, sa.MetaData())
    ENGINES[dbname] = pair
    return pair


def get_table(dbname, tablename):
    """Return the reflected table ``tablename`` from database ``dbname``.

    Tables are reflected once and cached in ``TABLES``. When a new table is
    loaded, it is registered in ``BINDS`` against its engine and the cached
    session factory in ``SESSION`` is discarded so that ``get_session``
    builds a new one.

    Args:
        dbname: Key of the database in the config's ``"dbs"`` section.
        tablename: Name of the table to reflect.

    Returns:
        The ``sa.Table`` object.
    """
    global SESSION

    cache_key = (dbname, tablename)
    table = TABLES.get(cache_key)
    if table is not None:
        return table

    # Reset the session factory before reflecting the new table.
    SESSION = None
    engine, metadata = get_engine(dbname)
    table = sa.Table(tablename, metadata, autoload_with=engine)
    TABLES[cache_key] = table
    BINDS[table] = engine
    return table


@contextlib.contextmanager
def get_session():
    """Context manager yielding a session bound to all loaded tables.

    The session factory is created on demand, configured with ``BINDS`` and
    cached in ``SESSION``; subsequent calls reuse it until it is reset.

    Yields:
        An open session; it is closed when the ``with`` block exits.
    """
    global SESSION

    factory = SESSION
    if factory is None:
        factory = sessionmaker()
        factory.configure(binds=BINDS)
        SESSION = factory
    with factory() as active_session:
        yield active_session

In [6]:
def call_nlpapi(path: str, payload: dict, *, is_write: bool) -> dict:
    """POST ``payload`` to the NLP API and return the decoded JSON response.

    The request body is ``payload`` plus the configured ``token``; when
    ``is_write`` is true the configured ``write_access`` value is added too.
    The target URL is printed before the request is sent.

    Args:
        path: API path appended to ``https://<host>``, e.g. ``"/api/query_embed"``.
        payload: JSON-serializable request fields.
        is_write: Whether to include the ``write_access`` credential.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: if the response status indicates an error.
    """
    api = get_config()["nlpapi"]
    url = f"https://{api['host']}{path}"
    print(url)

    # Credentials are written after the payload so they win on key clashes.
    body = dict(payload)
    body["token"] = api["token"]
    if is_write:
        body["write_access"] = api["write_access"]

    response = requests.post(url, json=body, timeout=120)
    response.raise_for_status()
    return response.json()

In [7]:
# global tables
t_tags = get_table("login", "tags")
t_users = get_table("login", "users")

# solution mapping tables
t_sm_pads = get_table("sm", "pads")
t_sm_tagging = get_table("sm", "tagging")

# action plan tables
t_ap_pads = get_table("ap", "pads")
t_ap_tagging = get_table("ap", "tagging")

# experiments tables
t_exp_pads = get_table("exp", "pads")
t_exp_tagging = get_table("exp", "tagging")

loaded config
{
  "dbs": {
    "ap": {
      "dbname": "action_plans_platform",
      "dialect": "postgresql",
      "host": "acclabs-global-login.postgres.database.azure.com",
      "passwd": "*****",
      "port": 5432,
      "schema": "public",
      "user": "acclabshqadmin@acclabs-global-login"
    },
    "blogs": {
      "dbname": "blogs",
      "dialect": "postgresql",
      "host": "acclabs.postgres.database.azure.com",
      "passwd": "*****",
      "port": 5432,
      "schema": "public",
      "user": "undpacclab@acclabs"
    },
    "exp": {
      "dbname": "experiments_platform",
      "dialect": "postgresql",
      "host": "acclabs-global-login.postgres.database.azure.com",
      "passwd": "*****",
      "port": 5432,
      "schema": "public",
      "user": "acclabshqadmin@acclabs-global-login"
    },
    "login": {
      "dbname": "postgres",
      "dialect": "postgresql",
      "host": "acclabs-global-login.postgres.database.azure.com",
      "passwd": "*****",

  res = sa.Table(


In [8]:
# Smoke test of the query endpoint against the "test" db, filtered to iso3 "egy".
call_nlpapi(
    path="/api/query_embed",
    payload=dict(
        input="africa",
        offset=0,
        limit=10,
        db="test",
        score_threshold=0.2,
        filters=dict(iso3=["egy"]),
    ),
    is_write=False,
)

https://acclabs-nlpapi.azurewebsites.net/api/query_embed


{'hits': [{'base': 'blog',
   'doc_id': 1,
   'main_id': 'blog:1',
   'meta': {'language': ['fr'], 'status': 'public'},
   'score': 0.62715155,
   'snippets': ['https://www.undp.org/africa/press-releases/africa-border-\nlands-centre-reimagining-africa%E2%80%99s-borderlands\n-and-empowering-its-voices\nhttps://www.undp.org/africa/press-releases/through-pande-\nmic-and-beyond-new-report-reveals-regional-partnerships-a\nre-key-covid-19-recovery-africa \nhttps://www.undp.org/africa/press-releases/peer-lear-\nning-and-cross-regional-dialogues-create-opportunities-\nentrepreneurs-one-african-market \nhttps://www.undp.org/africa/press-releases/cross-bor-\nder-trade-driver-peace-liptako-gourma-region',
    '. Box 60130\nAddis Ababa, Éthiopie\n\nhttp://www.africa.undp.org\nrp.africa@undp.org \nTwitter @UNDPAfrica',
    'n Union\nhttps://www.undp.org/africa/press-releases/afri-\ncan-union-and-undp-launch-%E2%80%9Cau-20%E2%80%9D-s\ntudy-review-progress-ahead-au%E2%80%99s-20th-anniversary \nhttps:

In [9]:
# Build a lookup of tag names keyed by "<type>-<id>".
tags = {}
with get_session() as session:
    stmt = sa.select(t_tags.c.id, t_tags.c.name, t_tags.c.type)
    for tag_id, tag_name, tag_type in session.execute(stmt):
        tags[f"{tag_type}-{tag_id}"] = tag_name

In [10]:
# Human-readable label for each numeric pad status that gets uploaded.
status_map = dict((
    (2, "preview"),
    (3, "public"),
))

In [11]:
# Upload up to 100 pads with status > 1 to the NLP API's "test" db and time
# each request.
with get_session() as session:
    # Only the solution-mapping pads are processed; the loop over all
    # platforms is currently disabled:
    # for (name, pad_db) in [("sm", t_sm_pads), ("ap", t_ap_pads), ("exp", t_exp_pads)]:
    name = "sm"
    pad_db = t_sm_pads
    url_prefix = "https://solutions.sdg-innovation-commons.org/en/view/pad?id="
    stmt = (
        sa.select(pad_db.c.status, pad_db.c.full_text, pad_db.c.id, pad_db.c.update_at)
        .where(pad_db.c.status > 1)
        .limit(100)
    )
    for row in session.execute(stmt):
        status_code, full_text, pad_id, updated_at = row
        start_time = time.monotonic()
        status = status_map[int(status_code)]
        print(f"adding {name} {pad_id} {status} {updated_at}")
        doc_id = int(pad_id)
        embed_request = {
            "input": f"{full_text}",
            "base": f"{name}",
            "doc_id": doc_id,
            "url": f"{url_prefix}{doc_id}",
            "meta": {
                "status": status,
                "date": f"{updated_at}",
            },
            "db": "test",
        }
        call_nlpapi("/api/add_embed", embed_request, is_write=True)
        print(f"took {time.monotonic() - start_time}s")

adding sm 4364 public 2023-09-04 10:18:44.457811+00:00
https://acclabs-nlpapi.azurewebsites.net/api/add_embed
took 0.9506647910457104s
adding sm 4571 public 2023-01-16 08:33:43.843095+00:00
https://acclabs-nlpapi.azurewebsites.net/api/add_embed
took 1.0893373750150204s
adding sm 4145 preview 2023-01-16 08:33:43.843095+00:00
https://acclabs-nlpapi.azurewebsites.net/api/add_embed
took 0.9359839579556137s
adding sm 4366 public 2023-09-04 10:20:36.926975+00:00
https://acclabs-nlpapi.azurewebsites.net/api/add_embed
took 2.1341014580102637s
adding sm 4361 public 2023-09-04 10:26:32.678102+00:00
https://acclabs-nlpapi.azurewebsites.net/api/add_embed
took 2.7181810409529135s
adding sm 4358 public 2023-09-04 10:40:22.430835+00:00
https://acclabs-nlpapi.azurewebsites.net/api/add_embed
took 2.2990672920132056s
adding sm 4939 public 2023-01-16 08:33:43.843095+00:00
https://acclabs-nlpapi.azurewebsites.net/api/add_embed
took 2.6617935840040445s
adding sm 5053 public 2023-01-16 08:33:43.843095+00:00