In [1]:
from cdp_backend.utils.file_utils import resource_copy
import docx
from pathlib import Path
from uuid import uuid4
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

def _extract_docx_paragraphs(doc_path):
    """Return the stripped, non-empty paragraph texts of the docx at ``doc_path``."""
    doc = docx.Document(doc_path)
    stripped_paragraphs = (para.text.strip() for para in doc.paragraphs)
    return [text for text in stripped_paragraphs if text]


def _extract_pdf_lines(doc_path):
    """Return the stripped, non-empty text lines of every page of the pdf at ``doc_path``."""
    reader = PdfReader(doc_path)
    texts = []
    for page in reader.pages:
        # Guard against a falsy extraction result so that a single page
        # without text does not discard the whole document
        page_text = page.extract_text() or ""
        for line in page_text.split("\n"):
            line_cleaned = line.strip()
            if line_cleaned:
                texts.append(line_cleaned)
    return texts


def process_document(doc_path):
    """
    Extract the text content of a document, trying docx first and pdf second.

    Parameters
    ----------
    doc_path:
        Path to the local document to read.

    Returns
    -------
    list of str or None
        The non-empty text pieces found (docx paragraphs, or pdf lines).
        None when the file could be read as neither format, or when it was
        read successfully but contained no text.

    Notes
    -----
    A file that opens as docx but yields no text is NOT retried as pdf.
    Only ``Exception`` subclasses are treated as "wrong format"; things like
    ``KeyboardInterrupt`` propagate so a long-running loop can be interrupted.
    """
    # Try docx
    try:
        texts = _extract_docx_paragraphs(doc_path)

    # Not a readable docx, try pdf
    except Exception:
        try:
            texts = _extract_pdf_lines(doc_path)

        # It wasn't either file format
        except Exception:
            return None

    # The file parsed in one of the formats; no text still means "nothing usable"
    return texts if texts else None

# Loaded SentenceTransformer instances keyed by model name, so the model is
# constructed once instead of once per processed file.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def process_matter_file(matter_file, model_name="all-MiniLM-L12-v2"):
    """
    Copy a matter file locally, extract its text, and embed it.

    Parameters
    ----------
    matter_file:
        Object exposing a ``uri`` attribute that ``resource_copy`` can fetch.
    model_name: str
        Name of the sentence-transformers model used for encoding.
        Defaults to the model this notebook has always used.

    Returns
    -------
    dict or None
        ``{"vec": <mean of the encoded texts over axis 0>,
        "text": <the texts joined by spaces, cut to 256 characters>}``,
        or None when ``process_document`` could not get text from the file.

    Notes
    -----
    The local copy is always deleted, whether processing succeeds or raises.
    Errors raised by ``resource_copy`` or by encoding are not caught here.
    """
    # Create the temp store path before the try block so that the cleanup in
    # ``finally`` always has a path to work with
    temp_store_path = Path(f"temp_{uuid4()}")
    try:
        # Store remote to local (resource_copy returns the path it wrote)
        temp_store_path = Path(
            resource_copy(matter_file.uri, dst=temp_store_path, overwrite=True),
        )

        # Get document content
        texts = process_document(temp_store_path)

        # Handle file not supported
        if texts is None:
            return None

        # Encode all texts and then get the average vector
        model = _get_sentence_model(model_name)
        vec = model.encode(texts).mean(axis=0)

        return {"vec": vec, "text": " ".join(texts)[:256]}

    # Always clear file
    finally:
        temp_store_path.unlink(missing_ok=True)

In [2]:
from cdp_data import CDPInstances
from cdp_data.utils import connect_to_infrastructure
from cdp_backend.database import models as db_models
from tqdm import tqdm

# Silence the unverified-HTTPS warnings emitted during resource copy
import warnings
warnings.filterwarnings("ignore", message=".*Unverified HTTPS request is being made to host.*")

connect_to_infrastructure(CDPInstances.Seattle)

matter_files = list(db_models.MatterFile.collection.fetch(800))

# Group every successfully processed file under the id of the matter it belongs to
matter_id_vec_lut = {}
for matter_file in tqdm(matter_files, desc="Processing matter files"):
    process_result = process_matter_file(matter_file)

    # Files that could not be processed yield None and are skipped
    if process_result is None:
        continue

    # Start the matter's list on first encounter, then add the new result
    matter_id = matter_file.matter_ref.ref.id
    matter_id_vec_lut.setdefault(matter_id, []).append(process_result)

Processing matter files:  18%|██████████████████████▊                                                                                                     | 147/800 [07:57<1:13:49,  6.78s/it]unknown widths : 
[0, IndirectObject(67, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(70, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(73, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(76, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(79, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(82, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(85, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(88, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(91, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(94, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(97, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(100, 0, 140108220165152)]
unknown widths : 
[0, IndirectObject(103, 0, 140108220165152)]
un

In [3]:
import numpy as np

# Collapse each matter's per-document results into a single vec and text
reduced_matter_id_vec_lut = {}
for matter_id, vecs_and_texts in matter_id_vec_lut.items():
    if len(vecs_and_texts) <= 1:
        # Only one document for this matter: pass its data through untouched
        only_entry = vecs_and_texts[0]
        vec = only_entry["vec"]
        text = only_entry["text"]
    else:
        # Several documents: average their vecs element-wise ...
        vec = np.stack(
            [entry["vec"] for entry in vecs_and_texts],
            axis=0,
        ).mean(axis=0)

        # ... and keep the longest text (the earliest entry wins a tie)
        text = max((entry["text"] for entry in vecs_and_texts), key=len)

    reduced_matter_id_vec_lut[matter_id] = {"vec": vec, "text": text}

In [4]:
import pandas as pd

# For every matter, look up the votes referencing it and file the matter's
# vec under voter name -> decision
person_decision_lut = {}
for matter_id, vec_and_text in tqdm(reduced_matter_id_vec_lut.items()):
    vote_query = db_models.Vote.collection.filter(
        "matter_ref", "==", f"matter/{matter_id}"
    )
    for vote in list(vote_query.fetch()):
        person_name = vote.person_ref.get().name

        # Create the per-person and per-decision containers on first encounter
        decisions_for_person = person_decision_lut.setdefault(person_name, {})
        vecs_for_decision = decisions_for_person.setdefault(vote.decision, [])

        vecs_for_decision.append(vec_and_text["vec"])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 408/408 [04:59<00:00,  1.36it/s]

Teresa Mosqueda
Approve
309
Reject
4
Andrew Lewis
Approve
378
Absent (Non-Voting)
2
Reject
6
Debora Juarez
Approve
309
Reject
6
Absent (Non-Voting)
2
Lisa Herbold
Approve
371
Reject
4
Dan Strauss
Approve
354
Absent (Non-Voting)
1
Reject
3
Tammy J. Morales
Approve
354
Reject
6
Absent (Non-Voting)
4
Alex Pedersen
Approve
377
Reject
20
Absent (Non-Voting)
6
M. Lorena González
Approve
256
Reject
4
Sara Nelson
Reject
3
Approve
80
Kshama Sawant
Approve
249
Reject
16
Absent (Non-Voting)
1





In [5]:
# One row per councilmember: the element-wise mean vec of the matters they
# approved and of the matters they rejected, flattened into feature columns
semantic_voting_records = []
for person_name, decision_vec_lists in person_decision_lut.items():
    person_record = {"councilmember": person_name}

    for decision, vec_list in decision_vec_lists.items():
        # Only approvals and rejections become features
        if decision not in ("Approve", "Reject"):
            continue

        mean_vec = np.stack(vec_list, axis=0).mean(axis=0)
        person_record.update(
            (f"{decision.lower()}-mean-feat-{i}", vec_val)
            for i, vec_val in enumerate(mean_vec.tolist())
        )

    semantic_voting_records.append(person_record)

# Features missing for a councilmember (a decision they never cast) become 0
semantic_voting_records = pd.DataFrame(semantic_voting_records).fillna(0)
semantic_voting_records

Unnamed: 0,councilmember,approve-mean-feat-0,approve-mean-feat-1,approve-mean-feat-2,approve-mean-feat-3,approve-mean-feat-4,approve-mean-feat-5,approve-mean-feat-6,approve-mean-feat-7,approve-mean-feat-8,...,reject-mean-feat-374,reject-mean-feat-375,reject-mean-feat-376,reject-mean-feat-377,reject-mean-feat-378,reject-mean-feat-379,reject-mean-feat-380,reject-mean-feat-381,reject-mean-feat-382,reject-mean-feat-383
0,Teresa Mosqueda,0.001982,0.032942,0.023369,0.007702,0.017713,0.013749,-0.018754,-0.00385,-0.026018,...,0.030395,0.013953,-0.010459,0.002192,-0.006338,0.013867,0.030037,0.017509,0.03001,0.021047
1,Andrew Lewis,0.001401,0.031658,0.021423,0.006764,0.015862,0.013073,-0.015913,-0.003291,-0.025537,...,0.029708,0.018104,-0.010169,-0.007402,-0.002512,-0.001977,0.025562,0.002217,0.015605,0.019461
2,Debora Juarez,0.001241,0.032066,0.021689,0.006278,0.013713,0.012736,-0.014832,-0.003442,-0.025426,...,0.037037,0.019057,-0.007286,-0.007396,-0.010908,-3.5e-05,0.022189,0.003269,0.021367,0.023388
3,Lisa Herbold,0.000818,0.032211,0.021954,0.005889,0.017154,0.011714,-0.01664,-0.003096,-0.025672,...,0.035262,0.01605,-0.0078,-0.006602,-0.014414,-0.010947,0.012068,0.003404,0.026246,0.022827
4,Dan Strauss,0.001649,0.032927,0.022574,0.006685,0.016316,0.011744,-0.017387,-0.003458,-0.026075,...,0.032177,0.020541,-0.015554,-0.012352,-0.00826,0.002144,0.038533,0.001926,0.006408,0.017901
5,Tammy J. Morales,0.00049,0.031545,0.02109,0.005982,0.015594,0.012213,-0.016053,-0.003278,-0.025637,...,0.029718,0.010728,-0.004571,-0.008294,-0.007166,-0.003567,0.019652,0.014827,0.026052,0.023451
6,Alex Pedersen,0.001032,0.031568,0.021411,0.00591,0.016091,0.011641,-0.016031,-0.003041,-0.025636,...,0.029285,0.022584,-0.002351,-0.007642,-0.01273,-0.007849,0.004689,0.009127,0.019913,0.028919
7,M. Lorena González,0.001345,0.032071,0.020315,0.006592,0.017635,0.012254,-0.013659,-0.002479,-0.025076,...,0.026042,0.020806,-0.001754,-0.011131,-0.013727,0.009272,0.027148,0.012744,0.002716,0.029996
8,Sara Nelson,0.000125,0.032247,0.024119,0.006439,0.015833,0.011397,-0.023138,-0.004593,-0.028393,...,0.044653,0.005545,-0.010688,-0.01808,-0.020111,-0.008999,-0.005761,-0.00044,0.016686,0.028121
9,Kshama Sawant,0.000957,0.032272,0.021061,0.005914,0.013862,0.012838,-0.014283,-0.003658,-0.024434,...,0.034618,0.014415,-0.002223,-0.004331,-0.015538,-0.004068,0.009545,0.007467,0.022999,0.022902


In [6]:
# Feature matrix: every column except the councilmember label
is_feature_column = ~semantic_voting_records.columns.isin(["councilmember"])
data = semantic_voting_records.loc[:, is_feature_column]

In [7]:
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show
from umap import UMAP

# Project each councilmember's feature row down to two dimensions.
# UMAP is stochastic; fixing random_state makes the coordinates (and so the
# plot built from them) repeatable across kernel restarts.
umap = UMAP(random_state=42)
xtfm = umap.fit_transform(data)

# Pair the 2D coordinates back up with the councilmember names
fit_data = pd.DataFrame(
    {
        "x": xtfm[:, 0],
        "y": xtfm[:, 1],
        "councilmember": semantic_voting_records["councilmember"],
    }
)
fit_data

  warn(


Unnamed: 0,x,y,councilmember
0,1.814717,2.817305,Teresa Mosqueda
1,-0.08188,2.862837,Andrew Lewis
2,0.570736,3.198642,Debora Juarez
3,-0.103741,4.57416,Lisa Herbold
4,0.32367,2.174594,Dan Strauss
5,1.376674,3.374542,Tammy J. Morales
6,-1.001942,3.415123,Alex Pedersen
7,1.134363,2.2678,M. Lorena González
8,-0.51847,3.894322,Sara Nelson
9,0.377295,3.993452,Kshama Sawant


In [8]:
import bokeh.models as bmo
from bokeh.palettes import d3

source = ColumnDataSource(fit_data)

DEFAULT_TOOLTIP_FORMATTER = """
<div style="max-width: 400px; word-wrap: break-word;">
    @councilmember
</div>
"""

# One color per councilmember.
# The sized d3 palettes only exist for a limited range of sizes, so indexing
# them directly by the number of names fails for very small (or large)
# groups. Slice the largest palette instead, falling back to Category20 when
# more than ten colors are needed.
councilmember_names = list(semantic_voting_records["councilmember"].unique())
n_councilmembers = len(councilmember_names)
if n_councilmembers <= 10:
    palette = d3["Category10"][10][:n_councilmembers]
elif n_councilmembers <= 20:
    palette = d3["Category20"][20][:n_councilmembers]
else:
    raise ValueError(
        f"Cannot assign distinct colors to {n_councilmembers} councilmembers "
        "(at most 20 are supported)."
    )

color_map = bmo.CategoricalColorMapper(
    factors=councilmember_names,
    palette=palette,
)

# Scatter of the 2D projection, colored and labelled by councilmember
p = figure(width=800, height=800, tooltips=DEFAULT_TOOLTIP_FORMATTER)
p.scatter(
    x="x",
    y="y",
    source=source,
    size=10,
    color={'field': 'councilmember', 'transform': color_map},
    legend_field="councilmember",
)
p.xaxis[0].axis_label = "X"
p.yaxis[0].axis_label = "Y"

# Route bokeh output to the notebook before rendering
output_notebook()
show(p)

In [11]:
import hdbscan

# Cluster the councilmember feature rows with HDBSCAN's default settings and
# show the label assigned to each row (HDBSCAN uses -1 for noise points)
clusterer = hdbscan.HDBSCAN().fit(data)
clusterer.labels_

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1])