In [1]:
# from cdp_data import CDPInstances, datasets

# votes = datasets.get_vote_dataset(
#     CDPInstances.Seattle,
#     start_datetime="2022-11-27",
# )
# votes

In [2]:
from cdp_backend.utils.file_utils import resource_copy
import docx
from pathlib import Path
from uuid import uuid4
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

def process_document(doc_path):
    """
    Extract the non-empty text fragments from a docx or PDF file.

    The file is first opened with ``docx.Document``. Only if that attempt
    raises is the file retried with ``PdfReader``.

    Parameters
    ----------
    doc_path:
        Location of the local file to read; passed unchanged to
        ``docx.Document`` and ``PdfReader``.

    Returns
    -------
    list of str or None
        The stripped, non-empty paragraphs (docx) or lines (PDF) in document
        order. ``None`` when the file parsed but contained no text, or when
        neither parser could read it.
    """
    # Try docx
    try:
        doc = docx.Document(doc_path)
        stripped = (para.text.strip() for para in doc.paragraphs)
        texts = [text for text in stripped if len(text) > 0]

        # If we didn't find text, it was still a docx file
        # that we just failed to parse
        # No need to try anything else in that case, just return
        return texts if len(texts) > 0 else None

    # Catch ``Exception`` rather than everything so that KeyboardInterrupt
    # and SystemExit still stop a long-running batch
    except Exception:
        pass

    # Try pdf
    try:
        reader = PdfReader(doc_path)
        texts = []
        for page in reader.pages:
            # A page without extractable text contributes nothing instead of
            # discarding the text found on the other pages
            this_page_text = page.extract_text() or ""
            for line in this_page_text.split("\n"):
                line_cleaned = line.strip()
                if len(line_cleaned) > 0:
                    texts.append(line_cleaned)

        # If we didn't find text, it was still a pdf file
        # that we just failed to parse
        return texts if len(texts) > 0 else None

    # It wasn't either file format
    except Exception:
        return None

# Loaded SentenceTransformer models keyed by model name, so the model is
# constructed once per kernel instead of once per matter file
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODEL_CACHE[model_name]


def process_matter_file(matter_file):
    """
    Copy a matter file locally, extract its text, and embed it.

    Parameters
    ----------
    matter_file:
        Object whose ``uri`` attribute is handed to ``resource_copy``.

    Returns
    -------
    dict or None
        ``{"vec": ..., "text": ...}`` where ``vec`` is the mean over axis 0
        of the model's encoding of every extracted text fragment and ``text``
        is the first 256 characters of the fragments joined with spaces.
        ``None`` when ``process_document`` could not extract any text.

    Notes
    -----
    Errors raised while copying or encoding propagate to the caller. The
    local copy is removed in every case.
    """
    # Create temp store path
    # Bound before the ``try`` so the cleanup below can always refer to it
    temp_store_path = Path(f"temp_{uuid4()}")

    try:
        # Store remote to local
        temp_store_path = Path(
            resource_copy(matter_file.uri, dst=temp_store_path, overwrite=True),
        )

        # Get document content
        texts = process_document(temp_store_path)

        # Handle file not supported
        if texts is None:
            return None

        # Otherwise proceed
        # Encode all texts and then get the average vector
        model = _get_embedding_model("all-MiniLM-L12-v2")
        vec = model.encode(texts).mean(axis=0)

        return {"vec": vec, "text": " ".join(texts)[:256]}

    # Always clear file
    finally:
        temp_store_path.unlink(missing_ok=True)

In [3]:
from cdp_data import CDPInstances
from cdp_data.utils import connect_to_infrastructure
from cdp_backend.database import models as db_models
from tqdm import tqdm

# Ignore resource copy warnings
import warnings
warnings.filterwarnings("ignore", message=".*Unverified HTTPS request is being made to host.*")

# Point the database models at the Seattle instance before querying
connect_to_infrastructure(CDPInstances.Seattle)

# Materialize the fetched matter files so tqdm can report a total
matter_files = list(db_models.MatterFile.collection.fetch(400))

# matter id -> list of per-file results ({"vec": ..., "text": ...})
matter_id_vec_lut = {}
for matter_file in tqdm(matter_files, desc="Processing matter files"):
    file_result = process_matter_file(matter_file)

    # Files that produced no text contribute nothing
    if file_result is None:
        continue

    # Start the list on the first encounter of a matter, then add the result
    matter_key = matter_file.matter_ref.ref.id
    matter_id_vec_lut.setdefault(matter_key, []).append(file_result)

Processing matter files:  37%|██████████████████████████████████████████████▎                                                                               | 147/400 [06:46<19:27,  4.62s/it]unknown widths : 
[0, IndirectObject(67, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(70, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(73, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(76, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(79, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(82, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(85, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(88, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(91, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(94, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(97, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(100, 0, 140450741186144)]
unknown widths : 
[0, IndirectObject(103, 0, 140450741186144)]
un

In [19]:
import numpy as np

# Collapse every matter's per-document results into one vec and one text
reduced_matter_id_vec_lut = {}
for matter_id, doc_results in matter_id_vec_lut.items():
    if len(doc_results) <= 1:
        # Single document: pass its data straight through
        only_result = doc_results[0]
        vec = only_result["vec"]
        text = only_result["text"]
    else:
        # Several documents: average their vecs ...
        vec = np.stack(
            [doc_result["vec"] for doc_result in doc_results],
            axis=0,
        ).mean(axis=0)

        # ... and keep the longest text (the earliest one wins a tie)
        text = max(
            (doc_result["text"] for doc_result in doc_results),
            key=len,
        )

    reduced_matter_id_vec_lut[matter_id] = {"vec": vec, "text": text}

In [26]:
import pandas as pd

# Replace decision with integers
# Any decision that is not listed here is recorded as 0
decision_lut = {
    "Approve": 1,
    "Reject": -1,
}

# One row per vote: person, numeric decision, document text, document features
vote_rows = []
# person name -> {matter id -> numeric decision}
votes_as_record = {}
# person document id -> person name, so each person is fetched only once
person_name_lut = {}

for matter_id, vec_and_text in tqdm(reduced_matter_id_vec_lut.items()):
    # The feature columns are the same for every vote on this matter
    doc_features = {
        f"doc_text_feat_{i}": val
        for i, val in enumerate(vec_and_text["vec"].tolist())
    }

    # Get all votes that have a matter ref of the matter id
    related_votes = list(db_models.Vote.collection.filter(
        "matter_ref", "==", f"matter/{matter_id}"
    ).fetch())
    for vote in related_votes:
        # Resolving the reference is a remote read; do it once per person
        person_id = vote.person_ref.ref.id
        if person_id not in person_name_lut:
            person_name_lut[person_id] = vote.person_ref.get().name
        person_name = person_name_lut[person_id]

        decision = decision_lut.get(vote.decision, 0)

        vote_rows.append({
            "person": person_name,
            "decision": decision,
            "text": vec_and_text["text"],
            **doc_features,
        })
        votes_as_record.setdefault(person_name, {})[matter_id] = decision

votes = pd.DataFrame(vote_rows)
votes

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 243/243 [02:42<00:00,  1.50it/s]


Unnamed: 0,person,decision,text,doc_text_feat_0,doc_text_feat_1,doc_text_feat_2,doc_text_feat_3,doc_text_feat_4,doc_text_feat_5,doc_text_feat_6,...,doc_text_feat_374,doc_text_feat_375,doc_text_feat_376,doc_text_feat_377,doc_text_feat_378,doc_text_feat_379,doc_text_feat_380,doc_text_feat_381,doc_text_feat_382,doc_text_feat_383
0,Teresa Mosqueda,1,City of Seattle Boards & Commissions Notice of...,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
1,Andrew Lewis,1,City of Seattle Boards & Commissions Notice of...,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
2,Debora Juarez,1,City of Seattle Boards & Commissions Notice of...,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
3,Lisa Herbold,1,City of Seattle Boards & Commissions Notice of...,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
4,Andrew Lewis,1,City of Seattle Boards & Commissions Notice of...,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1825,Dan Strauss,1,"ATTACHMENT A Upon Recording, Please Return To:...",-0.008266,0.064521,0.027700,0.006945,-0.006280,0.023736,0.020383,...,0.027454,-0.003894,0.004577,-0.008063,0.002817,-0.008203,0.015461,0.012413,-0.001186,0.024733
1826,Andrew Lewis,1,"ATTACHMENT A Upon Recording, Please Return To:...",-0.008266,0.064521,0.027700,0.006945,-0.006280,0.023736,0.020383,...,0.027454,-0.003894,0.004577,-0.008063,0.002817,-0.008203,0.015461,0.012413,-0.001186,0.024733
1827,Alex Pedersen,1,"ATTACHMENT A Upon Recording, Please Return To:...",-0.008266,0.064521,0.027700,0.006945,-0.006280,0.023736,0.020383,...,0.027454,-0.003894,0.004577,-0.008063,0.002817,-0.008203,0.015461,0.012413,-0.001186,0.024733
1828,Kshama Sawant,1,"ATTACHMENT A Upon Recording, Please Return To:...",-0.008266,0.064521,0.027700,0.006945,-0.006280,0.023736,0.020383,...,0.027454,-0.003894,0.004577,-0.008063,0.002817,-0.008203,0.015461,0.012413,-0.001186,0.024733


In [21]:
# Keep only the document feature columns by masking out the metadata ones
non_feature_columns = ["person", "text", "decision"]
is_feature_column = ~votes.columns.isin(non_feature_columns)
feature_columns = votes.columns[is_feature_column]
data = votes[feature_columns]
data

Unnamed: 0,doc_text_feat_0,doc_text_feat_1,doc_text_feat_2,doc_text_feat_3,doc_text_feat_4,doc_text_feat_5,doc_text_feat_6,doc_text_feat_7,doc_text_feat_8,doc_text_feat_9,...,doc_text_feat_374,doc_text_feat_375,doc_text_feat_376,doc_text_feat_377,doc_text_feat_378,doc_text_feat_379,doc_text_feat_380,doc_text_feat_381,doc_text_feat_382,doc_text_feat_383
0,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,0.002510,-0.029309,0.011589,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
1,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,0.002510,-0.029309,0.011589,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
2,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,0.002510,-0.029309,0.011589,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
3,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,0.002510,-0.029309,0.011589,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
4,0.005913,0.015398,-0.002368,-0.000722,-0.003859,0.021779,0.010424,0.002510,-0.029309,0.011589,...,0.036363,0.000296,-0.006038,-0.010985,-0.016543,-0.018927,0.011327,-0.005457,-0.003206,0.035538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1825,-0.008266,0.064521,0.027700,0.006945,-0.006280,0.023736,0.020383,-0.009808,-0.008763,0.014350,...,0.027454,-0.003894,0.004577,-0.008063,0.002817,-0.008203,0.015461,0.012413,-0.001186,0.024733
1826,-0.008266,0.064521,0.027700,0.006945,-0.006280,0.023736,0.020383,-0.009808,-0.008763,0.014350,...,0.027454,-0.003894,0.004577,-0.008063,0.002817,-0.008203,0.015461,0.012413,-0.001186,0.024733
1827,-0.008266,0.064521,0.027700,0.006945,-0.006280,0.023736,0.020383,-0.009808,-0.008763,0.014350,...,0.027454,-0.003894,0.004577,-0.008063,0.002817,-0.008203,0.015461,0.012413,-0.001186,0.024733
1828,-0.008266,0.064521,0.027700,0.006945,-0.006280,0.023736,0.020383,-0.009808,-0.008763,0.014350,...,0.027454,-0.003894,0.004577,-0.008063,0.002817,-0.008203,0.015461,0.012413,-0.001186,0.024733


In [22]:
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show
from umap import UMAP

# Project the document feature vectors down to two dimensions.
# UMAP is stochastic; fixing random_state keeps the layout repeatable
# across kernel restarts
umap = UMAP(random_state=42)
xtfm = umap.fit_transform(data)

# Row i of xtfm belongs to row i of votes, so pair them by position
fit_data = pd.DataFrame({
    "x": xtfm[:, 0],
    "y": xtfm[:, 1],
    "person": votes["person"].to_numpy(),
    "text": votes["text"].to_numpy(),
})
fit_data

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


Unnamed: 0,x,y,person,text
0,-13.338323,17.682674,Teresa Mosqueda,City of Seattle Boards & Commissions Notice of...
1,-13.437538,17.585686,Andrew Lewis,City of Seattle Boards & Commissions Notice of...
2,-13.419210,17.603979,Debora Juarez,City of Seattle Boards & Commissions Notice of...
3,-13.391037,17.630669,Lisa Herbold,City of Seattle Boards & Commissions Notice of...
4,-13.482527,17.540808,Andrew Lewis,City of Seattle Boards & Commissions Notice of...
...,...,...,...,...
1825,-8.246597,-14.832593,Dan Strauss,"ATTACHMENT A Upon Recording, Please Return To:..."
1826,-8.264042,-14.801157,Andrew Lewis,"ATTACHMENT A Upon Recording, Please Return To:..."
1827,-8.271081,-14.797844,Alex Pedersen,"ATTACHMENT A Upon Recording, Please Return To:..."
1828,-8.231905,-14.843483,Kshama Sawant,"ATTACHMENT A Upon Recording, Please Return To:..."


In [23]:
import bokeh.models as bmo
from bokeh.palettes import d3

source = ColumnDataSource(fit_data)

# Hover tooltip: who cast the vote and which document text it relates to
DEFAULT_TOOLTIP_FORMATTER = """
<div style="max-width: 400px; word-wrap: break-word;">
    <span style="color: blue;font-weight: bold;">person: </span>@person
    <br>
    <span style="color: blue;font-weight: bold;">text: </span>@text
</div>
"""

# One color per person.
# The d3 palettes are only keyed for 3..10 (Category10) and 3..20
# (Category20) colors, so take the largest palette that fits and slice it,
# cycling through the colors if there are more people than colors
people = votes["person"].unique().tolist()
base_palette = d3["Category10"][10] if len(people) <= 10 else d3["Category20"][20]
palette = [base_palette[i % len(base_palette)] for i in range(len(people))]
color_map = bmo.CategoricalColorMapper(factors=people, palette=palette)

p = figure(width=800, height=800, tooltips=DEFAULT_TOOLTIP_FORMATTER)
p.scatter(
    x="x",
    y="y",
    source=source,
    size=3,
    alpha=0.8,
    color={"field": "person", "transform": color_map},
    legend_field="person",
)
p.xaxis[0].axis_label = "X"
p.yaxis[0].axis_label = "Y"

output_notebook()
show(p)

In [30]:
# Pivot the records so each row is a person and each column a matter id;
# a matter the person has no recorded vote on is filled with 0
person_by_matter = pd.DataFrame(votes_as_record).transpose()
person_by_matter.fillna(0)

Unnamed: 0,e9e7aa472cb9,7a6f1f18b2a3,67e08644a87b,bfb5753cb180,065014bc904d,d5ee609a46ec,9f804134acb9,cc6bdfb9a093,942c668f5dd2,375de35443c3,...,f450ae35082f,76c8489dbe0c,3830bddb1948,dde9b4850029,a50c0ba55971,046ebe1c4e0e,ef5892795426,569adb11639c,6a75f47d55ca,67b0a99c5a86
Teresa Mosqueda,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Andrew Lewis,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,0.0,0.0,0.0
Debora Juarez,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,0.0,0.0,0.0
Lisa Herbold,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,-1.0,1.0,1.0,1.0,1.0
Dan Strauss,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,-1.0,1.0,1.0,0.0,1.0
Tammy J. Morales,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Alex Pedersen,1.0,1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,-1.0,1.0,1.0,1.0,1.0
M. Lorena González,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
Sara Nelson,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0,0.0
Kshama Sawant,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0


In [31]:
import hdbscan

# Cluster people by their vote records (rows: people, columns: matter ids,
# missing votes filled with 0)
vote_matrix = pd.DataFrame(votes_as_record).transpose().fillna(0)

clusterer = hdbscan.HDBSCAN()
clusterer.fit(vote_matrix)

# One label per person
clusterer.labels_

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [33]:
# Show the votes frame with one column per vote and one row per field
votes.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1820,1821,1822,1823,1824,1825,1826,1827,1828,1829
person,Teresa Mosqueda,Andrew Lewis,Debora Juarez,Lisa Herbold,Andrew Lewis,Dan Strauss,Tammy J. Morales,Alex Pedersen,Debora Juarez,M. Lorena González,...,Tammy J. Morales,Lisa Herbold,Kshama Sawant,Teresa Mosqueda,Debora Juarez,Dan Strauss,Andrew Lewis,Alex Pedersen,Kshama Sawant,Alex Pedersen
decision,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
text,City of Seattle Boards & Commissions Notice of...,City of Seattle Boards & Commissions Notice of...,City of Seattle Boards & Commissions Notice of...,City of Seattle Boards & Commissions Notice of...,City of Seattle Boards & Commissions Notice of...,City of Seattle Boards & Commissions Notice of...,City of Seattle Boards & Commissions Notice of...,City of Seattle Boards & Commissions Notice of...,City of Seattle Boards & Commissions Notice of...,City of Seattle Boards & Commissions Notice of...,...,"ATTACHMENT A Upon Recording, Please Return To:...","ATTACHMENT A Upon Recording, Please Return To:...","ATTACHMENT A Upon Recording, Please Return To:...","ATTACHMENT A Upon Recording, Please Return To:...","ATTACHMENT A Upon Recording, Please Return To:...","ATTACHMENT A Upon Recording, Please Return To:...","ATTACHMENT A Upon Recording, Please Return To:...","ATTACHMENT A Upon Recording, Please Return To:...","ATTACHMENT A Upon Recording, Please Return To:...","ATTACHMENT A Upon Recording, Please Return To:..."
doc_text_feat_0,0.005913,0.005913,0.005913,0.005913,0.005913,0.005913,0.005913,0.005913,0.005913,0.005913,...,-0.008266,-0.008266,-0.008266,-0.008266,-0.008266,-0.008266,-0.008266,-0.008266,-0.008266,-0.008266
doc_text_feat_1,0.015398,0.015398,0.015398,0.015398,0.015398,0.015398,0.015398,0.015398,0.015398,0.015398,...,0.064521,0.064521,0.064521,0.064521,0.064521,0.064521,0.064521,0.064521,0.064521,0.064521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
doc_text_feat_379,-0.018927,-0.018927,-0.018927,-0.018927,-0.018927,-0.018927,-0.018927,-0.018927,-0.018927,-0.018927,...,-0.008203,-0.008203,-0.008203,-0.008203,-0.008203,-0.008203,-0.008203,-0.008203,-0.008203,-0.008203
doc_text_feat_380,0.011327,0.011327,0.011327,0.011327,0.011327,0.011327,0.011327,0.011327,0.011327,0.011327,...,0.015461,0.015461,0.015461,0.015461,0.015461,0.015461,0.015461,0.015461,0.015461,0.015461
doc_text_feat_381,-0.005457,-0.005457,-0.005457,-0.005457,-0.005457,-0.005457,-0.005457,-0.005457,-0.005457,-0.005457,...,0.012413,0.012413,0.012413,0.012413,0.012413,0.012413,0.012413,0.012413,0.012413,0.012413
doc_text_feat_382,-0.003206,-0.003206,-0.003206,-0.003206,-0.003206,-0.003206,-0.003206,-0.003206,-0.003206,-0.003206,...,-0.001186,-0.001186,-0.001186,-0.001186,-0.001186,-0.001186,-0.001186,-0.001186,-0.001186,-0.001186
