In [None]:
# Get vote dataset
# For each vote, pull event minutes item and minutes item files, read docs
from multiprocessing import set_start_method

set_start_method("spawn")

In [None]:
from cdp_data import CDPInstances, datasets

votes = datasets.get_vote_dataset(
    CDPInstances.Seattle,
    start_datetime="2022-11-25",
)
votes

In [None]:
subset = votes[["id", "person_name", "decision", "event_minutes_item_key", "matter_key"]].sample(30)
subset

In [None]:
from typing import Callable
from uuid import uuid4
from cdp_backend.utils import file_utils
from pathlib import Path
from functools import partial

def fetch_process_return_text(uri: str, processor: Callable) -> list[str]:
    # Create temp store path
    temp_store_path = Path(f"temp_{uuid4()}")
    
    # Store remote to local
    file_utils.resource_copy(uri, dst=temp_store_path, overwrite=True)
    
    # Process and get text
    text = processor(temp_store_path)
    
    # Remove temp file
    temp_store_path.unlink(missing_ok=True)
    
    return text

In [None]:
import docx
from pathlib import Path

def docx_processor(doc_path: Path) -> list[str]:
    # Read and process
    doc = docx.Document(doc_path)
    text = []
    for para in doc.paragraphs:
        this_text = para.text.strip()
        if len(this_text) > 0:
            text.append(this_text)
    
    return text

read_docx = partial(fetch_process_return_text, processor=docx_processor)

# read_docx("https://legistar2.granicus.com/seattle/attachments/9cd74a40-f12a-401a-90f6-e5047715f12c.docx")

In [None]:
from PyPDF2 import PdfReader
from pathlib import Path

def pdf_processor(doc_path: Path) -> list[str]:
    # Read and process
    reader = PdfReader(doc_path)
    text = []
    for page in reader.pages:
        this_page_text = page.extract_text().strip()
        for para in this_page_text.split("\n"):
            para_cleaned = para.strip()
            if len(para_cleaned) > 0:
                text.append(para_cleaned)
            
    return text

read_pdf = partial(fetch_process_return_text, processor=pdf_processor)

# read_pdf("https://legistar2.granicus.com/seattle/attachments/b7be6254-f3e0-468c-babc-f81ee65fd305.pdf")

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L12-v2")

def process_file(uri: str, embedding_model=model):
    try:
        text = read_docx(uri)
    except:
        try:
            text = read_pdf(uri)
        except Exception as e:
            return None
    
    if len(" ".join(text).strip()) == 0:
        return None
    
    vecs = model.encode(text)
    mean_vec = vecs.mean(axis=0)
    return mean_vec, " ".join(text)

In [None]:
mean_vec, text = process_file("https://legistar2.granicus.com/seattle/attachments/9cd74a40-f12a-401a-90f6-e5047715f12c.docx")
mean_vec.shape, text[:100]

In [None]:
from cdp_backend.database import models as db_models
from cdp_data.utils import connect_to_infrastructure
import numpy as np
from tqdm import tqdm
import pandas as pd
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

vote_rows = [row for _, row in subset.iterrows()]

def process_vote_row(row):
    connect_to_infrastructure(CDPInstances.Seattle)
    
    docs1 = [
        emif.uri for emif in
        db_models.EventMinutesItemFile.collection.filter(
            "event_minutes_item_ref", "==", row.event_minutes_item_key
        ).fetch()
    ]
    docs2 = [
        mf.uri for mf in
        db_models.MatterFile.collection.filter(
            "matter_ref", "==", row.matter_key
        ).fetch()
    ]
    docs = set([
        *docs1,
        *docs2,
    ])
    
    # encode decision
    decision_lut = {
        "Approve": 1,
        "Reject": -1,
    }
    encoded_decision = decision_lut.get(row.decision, 0)
    
    for doc in docs:
        ret_val = process_file(doc)
        if ret_val is not None:
            vec, text = ret_val

            return {
                "person_name": row.person_name,
                "decision": encoded_decision,
                "text": text,
                **{
                    f"text_feat_{i}": val
                    for i, val in enumerate(vec.tolist())
                },
            }
            
doc_vote_rows = [process_vote_row(row) for row in tqdm(vote_rows)]
doc_votes = pd.DataFrame(doc_vote_rows)
doc_votes

In [1]:
import pandas as pd

fake_df = pd.DataFrame([
    {
        "person": "eva",
        "text": "climate",
        "decision": 1,
        "text_feat_0": 0.6,
        "text_feat_1": 0.7,
        "text_feat_2": -0.1,
        "text_feat_3": 0.2,
    },
    {
        "person": "nic",
        "text": "climate",
        "decision": 1,
        "text_feat_0": 0.6,
        "text_feat_1": 0.7,
        "text_feat_2": -0.1,
        "text_feat_3": 0.2,
    },
    {
        "person": "lindsey",
        "text": "climate",
        "decision": -1,
        "text_feat_0": 0.6,
        "text_feat_1": 0.7,
        "text_feat_2": -0.1,
        "text_feat_3": 0.2,
    },
    {
        "person": "richard",
        "text": "climate",
        "decision": 1,
        "text_feat_0": 0.6,
        "text_feat_1": 0.7,
        "text_feat_2": -0.1,
        "text_feat_3": 0.2,
    },
    {
        "person": "eva",
        "text": "housing",
        "decision": 1,
        "text_feat_0": 0.3,
        "text_feat_1": 0.1,
        "text_feat_2": -0.4,
        "text_feat_3": -0.9,
    },
    {
        "person": "nic",
        "text": "housing",
        "decision": 1,
        "text_feat_0": 0.3,
        "text_feat_1": 0.1,
        "text_feat_2": -0.4,
        "text_feat_3": -0.9,
    },
    {
        "person": "lindsey",
        "text": "housing",
        "decision": -1,
        "text_feat_0": 0.3,
        "text_feat_1": 0.1,
        "text_feat_2": -0.4,
        "text_feat_3": -0.9,
    },
    {
        "person": "richard",
        "text": "housing",
        "decision": -1,
        "text_feat_0": 0.6,
        "text_feat_1": 0.7,
        "text_feat_2": -0.1,
        "text_feat_3": 0.2,
    },
    {
        "person": "eva",
        "text": "transportation",
        "decision": 1,
        "text_feat_0": 0.9,
        "text_feat_1": 0.8,
        "text_feat_2": 0.1,
        "text_feat_3": 0.3,
    },
    {
        "person": "nic",
        "text": "transportation",
        "decision": 1,
        "text_feat_0": 0.9,
        "text_feat_1": 0.8,
        "text_feat_2": 0.1,
        "text_feat_3": 0.3,
    },
    {
        "person": "lindsey",
        "text": "transportation",
        "decision": -1,
        "text_feat_0": 0.9,
        "text_feat_1": 0.8,
        "text_feat_2": 0.1,
        "text_feat_3": 0.3,
    },
    {
        "person": "richard",
        "text": "transportation",
        "decision": -1,
        "text_feat_0": 0.9,
        "text_feat_1": 0.8,
        "text_feat_2": 0.1,
        "text_feat_3": 0.3,
    },
    {
        "person": "eva",
        "text": "budget",
        "decision": -1,
        "text_feat_0": -0.3,
        "text_feat_1": -0.6,
        "text_feat_2": 0.1,
        "text_feat_3": -0.9,
    },
    {
        "person": "nic",
        "text": "budget",
        "decision": -1,
        "text_feat_0": -0.3,
        "text_feat_1": -0.6,
        "text_feat_2": 0.1,
        "text_feat_3": -0.9,
    },
    {
        "person": "lindsey",
        "text": "budget",
        "decision": 1,
        "text_feat_0": -0.3,
        "text_feat_1": -0.6,
        "text_feat_2": 0.1,
        "text_feat_3": -0.9,
    },
    {
        "person": "richard",
        "text": "budget",
        "decision": -1,
        "text_feat_0": -0.3,
        "text_feat_1": -0.6,
        "text_feat_2": 0.1,
        "text_feat_3": -0.9,
    },
    {
        "person": "eva",
        "text": "parks",
        "decision": 1,
        "text_feat_0": 0.5,
        "text_feat_1": 0.5,
        "text_feat_2": 0.5,
        "text_feat_3": -0.1,
    },
    {
        "person": "nic",
        "text": "parks",
        "decision": 1,
        "text_feat_0": 0.5,
        "text_feat_1": 0.5,
        "text_feat_2": 0.5,
        "text_feat_3": -0.1,
    },
    {
        "person": "lindsey",
        "text": "parks",
        "decision": -1,
        "text_feat_0": 0.5,
        "text_feat_1": 0.5,
        "text_feat_2": 0.5,
        "text_feat_3": -0.1,
    },
    {
        "person": "richard",
        "text": "parks",
        "decision": -1,
        "text_feat_0": 0.5,
        "text_feat_1": 0.5,
        "text_feat_2": 0.5,
        "text_feat_3": -0.1,
    },
    {
        "person": "eva",
        "text": "school",
        "decision": -1,
        "text_feat_0": 0.2,
        "text_feat_1": 0.1,
        "text_feat_2": -0.3,
        "text_feat_3": 0.1,
    },
    {
        "person": "nic",
        "text": "school",
        "decision": -1,
        "text_feat_0": 0.2,
        "text_feat_1": 0.1,
        "text_feat_2": -0.3,
        "text_feat_3": 0.1,
    },
    {
        "person": "lindsey",
        "text": "school",
        "decision": 1,
        "text_feat_0": 0.2,
        "text_feat_1": 0.1,
        "text_feat_2": -0.3,
        "text_feat_3": 0.1,
    },
    {
        "person": "richard",
        "text": "school",
        "decision": -1,
        "text_feat_0": 0.2,
        "text_feat_1": 0.1,
        "text_feat_2": -0.3,
        "text_feat_3": 0.1,
    },
])
fake_df

Unnamed: 0,person,text,decision,text_feat_0,text_feat_1,text_feat_2,text_feat_3
0,eva,climate,1,0.6,0.7,-0.1,0.2
1,nic,climate,1,0.6,0.7,-0.1,0.2
2,lindsey,climate,-1,0.6,0.7,-0.1,0.2
3,richard,climate,1,0.6,0.7,-0.1,0.2
4,eva,housing,1,0.3,0.1,-0.4,-0.9
5,nic,housing,1,0.3,0.1,-0.4,-0.9
6,lindsey,housing,-1,0.3,0.1,-0.4,-0.9
7,richard,housing,-1,0.6,0.7,-0.1,0.2
8,eva,transportation,1,0.9,0.8,0.1,0.3
9,nic,transportation,1,0.9,0.8,0.1,0.3


In [2]:
data = fake_df[fake_df.columns[~fake_df.columns.isin(["person", "text"])]]
data

Unnamed: 0,decision,text_feat_0,text_feat_1,text_feat_2,text_feat_3
0,1,0.6,0.7,-0.1,0.2
1,1,0.6,0.7,-0.1,0.2
2,-1,0.6,0.7,-0.1,0.2
3,1,0.6,0.7,-0.1,0.2
4,1,0.3,0.1,-0.4,-0.9
5,1,0.3,0.1,-0.4,-0.9
6,-1,0.3,0.1,-0.4,-0.9
7,-1,0.6,0.7,-0.1,0.2
8,1,0.9,0.8,0.1,0.3
9,1,0.9,0.8,0.1,0.3


In [3]:
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show
from umap import UMAP

umap = UMAP()
xtfm = umap.fit_transform(data)

fit_data = pd.DataFrame()
fit_data["x"] = xtfm[:, 0]
fit_data["y"] = xtfm[:, 1]
fit_data["person"] = fake_df["person"]
fit_data["text"] = fake_df["text"]
fit_data["display_text"] = fake_df["person"] + "--" + fake_df["text"]
fit_data

Unnamed: 0,x,y,person,text,display_text
0,-4.123665,-0.993135,eva,climate,eva--climate
1,-4.838696,-2.00545,nic,climate,nic--climate
2,7.421381,10.796131,lindsey,climate,lindsey--climate
3,-4.494793,-1.316948,richard,climate,richard--climate
4,-2.94662,-1.491961,eva,housing,eva--housing
5,-2.986534,-1.760503,nic,housing,nic--housing
6,6.53051,9.862044,lindsey,housing,lindsey--housing
7,7.824811,11.162517,richard,housing,richard--housing
8,-4.726674,-1.596422,eva,transportation,eva--transportation
9,-4.901511,-1.318839,nic,transportation,nic--transportation


In [4]:
import bokeh.models as bmo
from bokeh.palettes import d3

source = ColumnDataSource(fit_data)


DEFAULT_TOOLTIP_FORMATTER = """
<div style="max-width: 400px; word-wrap: break-word;">
    <span style="color: blue;font-weight: bold;">text: </span>@display_text
</div>
"""

palette = d3['Category10'][len(fake_df["person"].unique())]
color_map = bmo.CategoricalColorMapper(factors=fake_df["person"].unique(),
                                   palette=palette)

p = figure(width=800, height=800, tooltips=DEFAULT_TOOLTIP_FORMATTER)
p.scatter(x="x", y="y", source=source, size=3, alpha=0.8, color={'field': 'person', 'transform': color_map}, legend_field="person")
p.xaxis[0].axis_label = "X"
p.yaxis[0].axis_label = "Y"

output_notebook()
show(p)