In [1]:
import os
import json
import time
import requests
import contextlib

import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

import pandas as pd
import numpy as np

import re
import urllib.parse

import matplotlib.pyplot as plt
import requests

In [2]:
from html import unescape
import unicodedata

In [3]:
def clean(text: str) -> str:
    """Decode HTML entities, apply NFKC normalisation and tidy whitespace.

    Steps, in order:

    1. Strip leading and trailing whitespace.
    2. Apply ``html.unescape`` until the text stops changing, so entities
       that were escaped more than once (e.g. ``&amp;amp;``) are fully decoded.
    3. Apply Unicode NFKC normalisation.
    4. Run the whitespace substitutions listed in ``passes``.

    :param text: the raw input string.
    :return: the cleaned string.
    """
    result = text.strip()
    decoded = unescape(result)
    while decoded != result:
        result = decoded
        decoded = unescape(result)
    result = unicodedata.normalize("NFKC", result)
    # Order matters: every pass works on the output of the previous one.
    passes = (
        ("\r", "\n"),         # carriage returns become newlines
        ("\n\n+", "\n"),      # runs of newlines collapse to a single one
        ("\n[ \t]+", "\n"),   # drop spaces/tabs that directly follow a newline
        ("[ \t]+", " "),      # runs of spaces/tabs collapse to one space
        ("\n\n\n+", "\n\n"),  # cap any remaining newline run at two
    )
    for pattern, replacement in passes:
        result = re.sub(pattern, replacement, result)
    return result

def strip_html(text: str) -> str:
    """Remove HTML tags from ``text``, turning ``<br>`` tags into newlines.

    The input is stripped of surrounding whitespace first. ``<br>`` variants
    (lower-case only, optionally self-closing) are replaced by ``"\\n"``;
    every other tag is deleted. Quoted attribute values may contain ``>``.

    :param text: the HTML fragment.
    :return: the text with all tags removed.
    """
    trimmed = text.strip()
    with_line_breaks = re.sub(r"<br\s*/?\s*>", "\n", trimmed)
    tag_pattern = r"<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"
    return re.sub(tag_pattern, "", with_line_breaks)

In [4]:
# Path of the JSON file with the database and NLP API settings. get_config()
# reads it and writes a template to this path when the file does not exist.
CONFIG_PATH = "config.json"
# Passed as `echo` to sa.create_engine() in get_engine().
VERBOSE = False

In [5]:
# Parsed configuration; populated by the first successful get_config() call.
CONFIG = None
# dbname -> (engine, MetaData) pairs created by get_engine().
ENGINES = {}
# (dbname, tablename) -> reflected sa.Table, filled by get_table().
TABLES = {}
# sa.Table -> engine; handed to the session factory as `binds` in get_session().
BINDS = {}
# Cached session factory; get_table() resets it to None whenever it
# registers a new table so that get_session() builds a fresh one.
SESSION = None


def config_template():
    """Build the placeholder configuration written when no config file exists.

    The result has a ``"dbs"`` section with one connection entry per known
    database (``login``, ``sm``, ``exp``, ``ap``, ``blogs``) and an
    ``"nlpapi"`` section. Every database entry is its own dict, so editing
    one does not affect the others.

    :return: a new nested dict with placeholder values.
    """
    placeholder_conn = {
        "dialect": "postgresql",
        "host": "localhost",
        "port": 5432,
        "dbname": "INVALID",
        "schema": "public",
        "user": "INVALID",
        "passwd": "INVALID",
    }
    db_names = ("login", "sm", "exp", "ap", "blogs")
    nlpapi_section = {
        "host": "localhost",
        "token": "INVALID",
        "write_access": "INVALID",
    }
    return {
        "dbs": {db_name: dict(placeholder_conn) for db_name in db_names},
        "nlpapi": nlpapi_section,
    }


def get_config():
    """Load the configuration from ``CONFIG_PATH`` and cache it in ``CONFIG``.

    On the first successful call the file is parsed, a redacted summary is
    printed (every database ``passwd`` and the NLP API ``token`` and
    ``write_access`` values are replaced by ``*****``) and the parsed dict is
    cached. Later calls return the cached dict without printing.

    If the file does not exist, a template produced by ``config_template()``
    is written to ``CONFIG_PATH`` and a ``ValueError`` is raised so the user
    can fill in real values.

    :return: the parsed configuration dict.
    :raises ValueError: when the config file was missing.
    :raises KeyError: when the ``"dbs"`` or ``"nlpapi"`` section is missing.
    """
    global CONFIG

    if CONFIG is not None:
        return CONFIG
    if not os.path.exists(CONFIG_PATH):
        # JSON files are UTF-8 by convention; do not depend on the locale.
        with open(CONFIG_PATH, "w", encoding="utf-8") as fout:
            print(json.dumps(config_template(), indent=4, sort_keys=True), file=fout)
        raise ValueError(
            f"config file missing. new file was created at '{CONFIG_PATH}'. "
            "please correct values in file and run again")
    with open(CONFIG_PATH, "r", encoding="utf-8") as fin:
        config = json.load(fin)
    # Redact every configured database rather than a fixed list of names, so
    # a config with fewer or additional databases still loads and prints.
    config_out = {
        "dbs": {
            name: {**conn, "passwd": "*****"}
            for name, conn in config["dbs"].items()
        },
        "nlpapi": {
            **config["nlpapi"],
            "token": "*****",
            "write_access": "*****",
        },
    }
    print(f"loaded config\n{json.dumps(config_out, indent=2, sort_keys=True)}")
    # Cache only after the summary was built, so a malformed file keeps
    # raising on later calls instead of being served from the cache.
    CONFIG = config
    return CONFIG


def get_engine(dbname):
    """Return the cached ``(engine, MetaData)`` pair for a configured database.

    The pair is created on first use from the ``dbs[dbname]`` section of the
    configuration and stored in ``ENGINES``. The engine carries a
    ``schema_translate_map`` that maps the default (``None``) schema to the
    configured ``schema``.

    :param dbname: key of the database in the ``"dbs"`` config section.
    :return: tuple of the engine and the ``sa.MetaData`` used for its tables.
    :raises KeyError: when ``dbname`` or a connection field is not configured.
    """
    cached = ENGINES.get(dbname)
    if cached is not None:
        return cached
    db = get_config()["dbs"][dbname]
    # Pass the URL parts as separate, unescaped components instead of
    # interpolating them into a string; host and database name were
    # previously inserted without any quoting.
    url = sa.engine.URL.create(
        drivername=db["dialect"],
        username=db["user"],
        password=db["passwd"],
        host=db["host"],
        port=db["port"],
        database=db["dbname"],
    )
    engine = sa.create_engine(url, echo=VERBOSE)
    engine = engine.execution_options(
        schema_translate_map={None: db["schema"]})
    res = engine, sa.MetaData()
    ENGINES[dbname] = res
    return res


def get_table(dbname, tablename):
    """Return the reflected table ``tablename`` of database ``dbname``.

    Tables are reflected once via ``autoload_with`` and cached in ``TABLES``.
    Registering a new table also records its engine in ``BINDS`` and clears
    the cached session factory (``SESSION``) so the next ``get_session()``
    call builds a new one.

    :param dbname: key of the database in the ``"dbs"`` config section.
    :param tablename: name of the table to reflect.
    :return: the ``sa.Table`` object.
    """
    global SESSION

    cache_key = (dbname, tablename)
    cached_table = TABLES.get(cache_key)
    if cached_table is None:
        # Invalidate the session factory before the bind map changes.
        SESSION = None
        engine, metadata = get_engine(dbname)
        table = sa.Table(tablename, metadata, autoload_with=engine)
        TABLES[cache_key] = table
        BINDS[table] = engine
        return table
    return cached_table


@contextlib.contextmanager
def get_session():
    """Context manager yielding a session bound to all registered tables.

    The session factory is cached in ``SESSION``; when no factory is cached
    a new one is created and configured with the ``BINDS`` mapping. The
    yielded session is managed by its own ``with`` block.
    """
    global SESSION

    factory = SESSION
    if factory is None:
        factory = sessionmaker()
        factory.configure(binds=BINDS)
        SESSION = factory
    with factory() as active_session:
        yield active_session

In [6]:
def call_nlpapi(path: str, payload: dict, *, is_write: bool, is_print: bool) -> dict:
    """POST ``payload`` as JSON to the configured NLP API and return the reply.

    The configured ``token`` is always added to the request body (overriding
    any ``"token"`` key in ``payload``); ``write_access`` is added only when
    ``is_write`` is true. The request uses a 120 second timeout.

    :param path: URL path appended to ``https://<host>``.
    :param payload: JSON-serialisable request fields.
    :param is_write: whether to send the ``write_access`` credential.
    :param is_print: whether to print the request URL before sending.
    :return: the decoded JSON response.
    :raises requests.HTTPError: when the response status indicates an error.
    """
    api_config = get_config()["nlpapi"]
    endpoint = f"https://{api_config['host']}{path}"
    if is_print:
        print(endpoint)
    body = dict(payload)
    body["token"] = api_config["token"]
    if is_write:
        body["write_access"] = api_config["write_access"]
    response = requests.post(endpoint, json=body, timeout=120)
    response.raise_for_status()
    return response.json()

In [7]:
# Reflect the tables used by the cells below. Every get_table() call loads
# the table definition from the named database (autoload), so this cell
# needs working connections to the "login", "sm", "ap" and "exp" databases.

# global tables (in the "login" database)
t_tags = get_table("login", "tags")
t_users = get_table("login", "users")

# solution mapping tables
t_sm_pads = get_table("sm", "pads")
t_sm_tagging = get_table("sm", "tagging")

# action plan tables
t_ap_pads = get_table("ap", "pads")
t_ap_tagging = get_table("ap", "tagging")

# experiments tables
t_exp_pads = get_table("exp", "pads")
t_exp_tagging = get_table("exp", "tagging")

loaded config
{
  "dbs": {
    "ap": {
      "dbname": "action_plans_platform",
      "dialect": "postgresql",
      "host": "acclabs-global-login.postgres.database.azure.com",
      "passwd": "*****",
      "port": 5432,
      "schema": "public",
      "user": "acclabshqadmin@acclabs-global-login"
    },
    "blogs": {
      "dbname": "blogs",
      "dialect": "postgresql",
      "host": "acclabs.postgres.database.azure.com",
      "passwd": "*****",
      "port": 5432,
      "schema": "public",
      "user": "undpacclab@acclabs"
    },
    "exp": {
      "dbname": "experiments_platform",
      "dialect": "postgresql",
      "host": "acclabs-global-login.postgres.database.azure.com",
      "passwd": "*****",
      "port": 5432,
      "schema": "public",
      "user": "acclabshqadmin@acclabs-global-login"
    },
    "login": {
      "dbname": "postgres",
      "dialect": "postgresql",
      "host": "acclabs-global-login.postgres.database.azure.com",
      "passwd": "*****",
      "port

  res = sa.Table(


In [8]:
# Sample read-only request (is_write=False) to /api/query_embed against the
# "test" db; the returned JSON is shown as the cell output.
call_nlpapi("/api/query_embed", {
    "input": "africa",
    "offset": 0,
    "limit": 10,
    "db": "test",
    "score_threshold": 0.2,
    "filters": {"iso3": ["egy"]},
}, is_write=False, is_print=True)

https://acclabs-nlpapi.azurewebsites.net/api/query_embed


{'hits': [], 'status': 'ok'}

In [9]:
# Map "<type>-<id>" keys to tag names for every row of the tags table.
tags = {}
with get_session() as session:
    stmt = sa.select(t_tags.c.id, t_tags.c.name, t_tags.c.type)
    for tag_id, tag_name, tag_type in session.execute(stmt):
        tags[f"{tag_type}-{tag_id}"] = tag_name

In [10]:
# Labels for the numeric pad status values; used as the "status" meta field
# when pads are uploaded. Only 2 and 3 are mapped, so looking up any other
# status value raises KeyError.
status_map = {
    2: "preview",
    3: "public",
}

In [11]:
# Number of rows (counted across all pad tables, in processing order) that
# the upload loop skips before it starts sending documents.
skip_ahead = 5250

In [12]:
# Upload every pad with status > 1 from the solution, action plan and
# experiment tables to /api/add_embed. The first `skip_ahead` rows (counted
# across all three tables) are skipped. Progress and timing are printed for
# every row whose running count is a multiple of 100, and a summary is
# printed even if the loop is interrupted.
with get_session() as session:
    destination = "main"  # "test"
    names = ["solution", "actionplan", "experiment"]
    urls = [
        "https://solutions.sdg-innovation-commons.org/en/view/pad?id=",
        "https://learningplans.sdg-innovation-commons.org/en/view/pad?id=",
        "https://experiments.sdg-innovation-commons.org/en/view/pad?id=",
    ]
    dbs = [
        t_sm_pads,
        t_ap_pads,
        t_exp_pads,
    ]
    doc_types = [
        "solution",
        "action plan",
        "experiment",
    ]
    count = 0
    start_total_time = time.monotonic()
    try:
        for (name, url_prefix, pad_db, doc_type) in zip(names, urls, dbs, doc_types):
            stmt = sa.select(pad_db.c.status, pad_db.c.full_text, pad_db.c.id, pad_db.c.update_at, pad_db.c.title)
            stmt = stmt.where(pad_db.c.status > 1)
            start_time = time.monotonic()
            time_count = 0
            print(f"processing {name}")
            for row in session.execute(stmt):
                if count < skip_ahead:
                    count += 1
                    continue
                status = status_map[int(row[0])]
                if count % 100 == 0:
                    print(f"adding #{count} {name} {row[2]} {status} {row[3]}")
                url = f"{url_prefix}{int(row[2])}"
                # A NULL title must fall back to the URL; formatting None
                # directly would produce the literal string "None".
                title = "" if row[4] is None else f"{row[4]}".strip()
                if not title:
                    title = url
                # NOTE(review): a NULL full_text is still sent as the string
                # "None" - confirm whether such rows should be skipped.
                call_nlpapi("/api/add_embed", {
                    "input": f"{row[1]}",
                    "base": f"{name}",
                    "doc_id": int(row[2]),
                    "url": url,
                    "title": title,
                    "meta": {
                        "status": status,
                        "date": f"{row[3]}",
                        "doc_type": doc_type,
                    },
                    "db": destination,
                }, is_write=True, is_print=count % 100 == 0)
                time_count += 1
                if count % 100 == 0:
                    print(
                        f"took avg {(time.monotonic() - start_time) / time_count if time_count > 0 else '?'}s for {time_count}")
                    # Restart the timing window for the next batch.
                    start_time = time.monotonic()
                    time_count = 0
                count += 1
    finally:
        duration = time.monotonic() - start_total_time
        print(f"processed: {count} in {duration}s avg time {duration / (count - skip_ahead) if (count - skip_ahead) > 0 else '?'}s")

processing solution
processing actionplan
processing experiment
processed: 5250 in 5.703836207976565s avg time ?s


In [13]:
# Print the action plan pad with id 1883 and keep its text in `full_text`.
# Relies on `names` and `dbs` defined by the upload cell. A fresh session is
# opened here because the session of the upload cell has already been closed
# by its `with` block.
with get_session() as session:
    for name, pad_db in zip(names, dbs):
        if name not in {"actionplan"}:
            continue
        stmt = sa.select(pad_db.c.status, pad_db.c.full_text, pad_db.c.id, pad_db.c.update_at, pad_db.c.title)
        stmt = stmt.where(pad_db.c.id == 1883)
        for row in session.execute(stmt):
            print(f"name: {name}")
            print(f"id: {row[2]}")
            print(f"title: {row[4]}")
            print(f"status: {row[0]}")
            print(f"update_at: {row[3]}")
            print(f"full_text: {row[1]}")
            full_text = f"{row[1]}"

name: actionplan
id: 1883
title: Collective Intelligence to Accelerate the Just Energy Transition in South A…
status: 3
update_at: 2024-03-25 14:48:44.097060+00:00
full_text: Collective Intelligence to Accelerate the Just Energy Transition in South A…

Title
null
Please provide a name for your action learning plan.
Using Collective Intelligence to Accelerate the Just Energy Transition in South Africa
Challenge statement
null
Challenge type: If you are working on multiple challenges, please indicate if this is your "big bet" or "exploratory" challenge. 

Please note: we ask you to only submit a maximum of 3 challenges - 1x Big Bet, 2x Exploratory. Each challenge must be submitted individually.
BIG BET

Challenge statement: What is your challenge? (Please answer in specific terms: "Our challenge is that...”.)
Our challenge is that the most vulnerable groups who will be affected by the Just Transition, namely coal miners, women, and youth, have not been sufficiently consulted regarding th

In [14]:
# Send `full_text` (set by the previous cell) to /api/extract, requesting the
# "location" and "language" modules; the JSON reply is the cell output.
call_nlpapi("/api/extract", {"modules": [{"name": "location"}, {"name": "language"}], "input": full_text}, is_write=False, is_print=True)

https://acclabs-nlpapi.azurewebsites.net/api/extract


{'language': {'languages': [{'count': 10,
    'lang': 'en',
    'score': 0.9999968743886795}]},
 'location': {'country': 'ZAF',
  'entities': [{'contexts': ['…Energy Transition in *South Africa*\nChallenge statement…',
     '…effects does it produce?\n*South Africa* generates about 80 percent…',
     '…conducted by COP26, *South Africa* is the world’s 13th…',
     '…zero CO2 emissions in *South Africa* by 2050.\nThe local social…',
     '…the EU Delegation to *South Africa*, South African Institute…'],
    'count': 5,
    'location': {'country': 'ZAF',
     'formatted': 'South Africa',
     'lat': -28.8166236,
     'lng': 24.991639,
     'relevance': 1.0},
    'query': 'South Africa',
    'spans': [[217, 229],
     [1050, 1062],
     [1203, 1215],
     [1355, 1367],
     [7901, 7913]],
    'status': 'cache_hit'},
   {'contexts': ['…state the name of the *Parter*:\niSpani\nWhat sector…'],
    'count': 1,
    'location': {'country': 'DEU',
     'formatted': 'Parterre, 06785 Oranienbaum, 

In [15]:
# Reflect the blog tables; requires valid credentials for the "blogs"
# database in the config file.
t_articles = get_table("blogs", "articles")
t_article_content = get_table("blogs", "article_content")

OperationalError: (psycopg2.OperationalError) connection to server at "acclabs.postgres.database.azure.com" (13.69.105.208), port 5432 failed: FATAL:  password authentication failed for user "undpacclab"
connection to server at "acclabs.postgres.database.azure.com" (13.69.105.208), port 5432 failed: FATAL:  password authentication failed for user "undpacclab"

(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [None]:
# Preview two articles joined with their content rows.
stmt = sa.select(
    t_articles.c.id,
    t_articles.c.url,
    t_articles.c.title,
    t_articles.c.posted_date,
    t_articles.c.article_type,
    t_articles.c.relevance,
    t_article_content.c.article_id,
    t_article_content.c.content)
stmt = stmt.where(t_article_content.c.article_id == t_articles.c.id)
stmt = stmt.limit(2)
# Open a dedicated session instead of reusing the `session` variable left
# over from an earlier cell, whose `with` block has already closed it. The
# blog tables were also registered after that session was created.
with get_session() as session:
    for row in session.execute(stmt):
        print(row)