In [1]:
from cdp_backend.utils.file_utils import resource_copy
import docx
from pathlib import Path
from uuid import uuid4
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

def process_document(doc_path):
    # Try docx
    try:
        doc = docx.Document(doc_path)
        texts = []
        for para in doc.paragraphs:
            this_text = para.text.strip()
            if len(this_text) > 0:
                texts.append(this_text)

        # If we found text
        # This was a docx file
        if len(texts) > 0:
            return texts

        # If we didn't find text, it was still a docx file
        # that we just failed to parse
        # No need to try anything else in that case, just return
        return None

    # Try pdf
    except:
        try:
            reader = PdfReader(doc_path)
            texts = []
            for page in reader.pages:
                this_page_text = page.extract_text().strip()
                for para in this_page_text.split("\n"):
                    para_cleaned = para.strip()
                    if len(para_cleaned) > 0:
                        texts.append(para_cleaned)
            
            # If we found text
            # This was a pdf file
            if len(texts) > 0:
                return texts

            # If we didn't find text, it was still a pdf file
            # That we just failed to parse
            return None
        
        # It wasn't either file format
        except:
            return None

def process_matter_file(matter_file):
    try:
        # Create temp store path
        temp_store_path = Path(f"temp_{uuid4()}")

        # Store remote to local
        temp_store_path = Path(
            resource_copy(matter_file.uri, dst=temp_store_path, overwrite=True),
        )

        # Get document content
        texts = process_document(temp_store_path)

        # Handle file not supported
        if texts is None:
            return None

        # Otherwise proceed
        # Encode all texts and then get the average vector
        model = SentenceTransformer("all-MiniLM-L12-v2")
        vec = model.encode(texts).mean(axis=0)

        return {"vec": vec, "text": " ".join(texts)[:256]}
    
    # Always clear file
    finally:
        temp_store_path.unlink(missing_ok=True)

In [2]:
from cdp_data import CDPInstances
from cdp_data.utils import connect_to_infrastructure
from cdp_backend.database import models as db_models
from tqdm import tqdm

# Ignore resource copy warnings
import warnings
warnings.filterwarnings("ignore", message=".*Unverified HTTPS request is being made to host.*")

connect_to_infrastructure(CDPInstances.Louisville)

matter_files = list(db_models.MatterFile.collection.fetch(800))

matter_id_vec_lut = {}
for matter_file in tqdm(matter_files, desc="Processing matter files"):
    # Process the file
    process_result = process_matter_file(matter_file)
    
    # Add the result to the LUT
    if process_result is not None:
        # Create the vec list on the first encounter
        if matter_file.matter_ref.ref.id not in matter_id_vec_lut:
            matter_id_vec_lut[matter_file.matter_ref.ref.id] = []
        
        # Add the new vec
        matter_id_vec_lut[matter_file.matter_ref.ref.id].append(process_result)

Processing matter files:   6%|███████▊                                                                                                                     | 50/800 [02:34<1:02:12,  4.98s/it]unknown widths : 
[0, IndirectObject(91, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(97, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(103, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(109, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(115, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(121, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(9, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(15, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(21, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(27, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(33, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(39, 0, 139965539525984)]
unknown widths : 
[0, IndirectObject(45, 0, 139965539525984)]
u

In [3]:
import numpy as np

# Reduce (if needed) the matter ids
reduced_matter_id_vec_lut = {}
for matter_id, vecs_and_texts in matter_id_vec_lut.items():
    # If matter id had multiple docs, combine
    if len(vecs_and_texts) > 1:
        # Stack the vecs and mean
        stacked_vecs = np.stack(
            [vec_and_text["vec"] for vec_and_text in vecs_and_texts],
            axis=0,
        )
        vec = stacked_vecs.mean(axis=0)
        
        # Find the longest text
        text = vecs_and_texts[0]["text"]
        for vec_and_text in vecs_and_texts:
            if len(vec_and_text["text"]) > len(text):
                text = vec_and_text["text"]
    
    # Just passthrough data
    else:
        vec = vecs_and_texts[0]["vec"]
        text = vecs_and_texts[0]["text"]
    
    reduced_matter_id_vec_lut[matter_id] = {"vec": vec, "text": text}

In [4]:
import pandas as pd

# Get all votes that have a matter ref of the matter id
person_decision_lut = {}
for matter_id, vec_and_text in tqdm(reduced_matter_id_vec_lut.items()):
    related_votes = list(db_models.Vote.collection.filter(
        "matter_ref", "==", f"matter/{matter_id}"
    ).fetch())
    for vote in related_votes:
        person_name = vote.person_ref.get().name
        if person_name not in person_decision_lut:
            person_decision_lut[person_name] = {}
        
        if vote.decision not in person_decision_lut[person_name]:
            person_decision_lut[person_name][vote.decision] = []
        
        person_decision_lut[person_name][vote.decision].append(
            vec_and_text["vec"],
        )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 346/346 [03:38<00:00,  1.58it/s]


In [47]:
semantic_voting_records = []
for person_name, decision_vec_lists in person_decision_lut.items():
    person_record = {
        "councilmember": person_name,
    }
    for decision, vec_list in decision_vec_lists.items():
        if decision in ["Approve", "Reject"]:
            mean_vec = np.stack(vec_list, axis=0).mean(axis=0)
            for i, vec_val in enumerate(mean_vec.tolist()):
                person_record[f"{decision.lower()}-mean-feat-{i}"] = vec_val
    
    semantic_voting_records.append(person_record)
            
semantic_voting_records = pd.DataFrame(semantic_voting_records).fillna(0)
semantic_voting_records

Unnamed: 0,councilmember,approve-mean-feat-0,approve-mean-feat-1,approve-mean-feat-2,approve-mean-feat-3,approve-mean-feat-4,approve-mean-feat-5,approve-mean-feat-6,approve-mean-feat-7,approve-mean-feat-8,...,reject-mean-feat-374,reject-mean-feat-375,reject-mean-feat-376,reject-mean-feat-377,reject-mean-feat-378,reject-mean-feat-379,reject-mean-feat-380,reject-mean-feat-381,reject-mean-feat-382,reject-mean-feat-383
0,Markus Winkler (D-17),-0.000534,0.034048,0.011378,0.007166,0.013672,0.003644,-0.001373,-0.003448,-0.025827,...,0.029505,0.003680,0.003755,-0.014315,-0.019407,-0.015735,-0.005778,0.010369,0.013626,0.020480
1,Kevin Kramer (R-11),-0.004045,0.032941,0.010603,0.003653,0.012023,0.003149,0.000935,-0.005301,-0.025217,...,0.036432,0.009273,0.004859,-0.007152,-0.017726,-0.012102,-0.004598,0.008495,0.022688,0.015769
2,Keisha Dorsey (D-3),-0.000434,0.033559,0.013494,0.008178,0.012572,0.001795,-0.002687,-0.004253,-0.025092,...,0.032149,0.005821,0.007094,-0.015494,-0.021231,-0.014743,-0.008690,0.008664,0.013342,0.016305
3,Barbara Sexton Smith (D-4),-0.000713,0.032344,0.008423,0.006301,0.012973,0.002445,0.000015,-0.006085,-0.025334,...,0.040476,0.011785,0.000557,-0.001061,-0.025336,-0.022165,-0.004386,-0.002584,0.014306,0.007235
4,Scott Reed (R-16),-0.003596,0.032405,0.012609,0.008219,0.012468,0.001928,-0.000738,-0.003302,-0.024860,...,0.031845,0.008086,0.007754,-0.013472,-0.018380,-0.011606,-0.006369,0.007722,0.015217,0.018867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Ashlyn Ackerman,-0.017922,0.027341,0.004262,0.004917,-0.002095,0.004336,0.008959,-0.004329,-0.019186,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
68,Lori Stahlgren,-0.016328,0.035641,0.004720,0.015105,0.000180,0.021451,-0.013797,-0.007267,-0.022845,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
69,Stefanie Buzan,-0.017922,0.027341,0.004262,0.004917,-0.002095,0.004336,0.008959,-0.004329,-0.019186,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
70,Morgan Ward,-0.017922,0.027341,0.004262,0.004917,-0.002095,0.004336,0.008959,-0.004329,-0.019186,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [50]:
primary_council_voting_records = semantic_voting_records[semantic_voting_records.councilmember.str.contains("\(")]
primary_council_voting_records

Unnamed: 0,councilmember,approve-mean-feat-0,approve-mean-feat-1,approve-mean-feat-2,approve-mean-feat-3,approve-mean-feat-4,approve-mean-feat-5,approve-mean-feat-6,approve-mean-feat-7,approve-mean-feat-8,...,reject-mean-feat-374,reject-mean-feat-375,reject-mean-feat-376,reject-mean-feat-377,reject-mean-feat-378,reject-mean-feat-379,reject-mean-feat-380,reject-mean-feat-381,reject-mean-feat-382,reject-mean-feat-383
0,Markus Winkler (D-17),-0.000534,0.034048,0.011378,0.007166,0.013672,0.003644,-0.001373,-0.003448,-0.025827,...,0.029505,0.00368,0.003755,-0.014315,-0.019407,-0.015735,-0.005778,0.010369,0.013626,0.02048
1,Kevin Kramer (R-11),-0.004045,0.032941,0.010603,0.003653,0.012023,0.003149,0.000935,-0.005301,-0.025217,...,0.036432,0.009273,0.004859,-0.007152,-0.017726,-0.012102,-0.004598,0.008495,0.022688,0.015769
2,Keisha Dorsey (D-3),-0.000434,0.033559,0.013494,0.008178,0.012572,0.001795,-0.002687,-0.004253,-0.025092,...,0.032149,0.005821,0.007094,-0.015494,-0.021231,-0.014743,-0.00869,0.008664,0.013342,0.016305
3,Barbara Sexton Smith (D-4),-0.000713,0.032344,0.008423,0.006301,0.012973,0.002445,1.5e-05,-0.006085,-0.025334,...,0.040476,0.011785,0.000557,-0.001061,-0.025336,-0.022165,-0.004386,-0.002584,0.014306,0.007235
4,Scott Reed (R-16),-0.003596,0.032405,0.012609,0.008219,0.012468,0.001928,-0.000738,-0.003302,-0.02486,...,0.031845,0.008086,0.007754,-0.013472,-0.01838,-0.011606,-0.006369,0.007722,0.015217,0.018867
5,Paula McCraney (D-7),-0.002509,0.034737,0.011311,0.00637,0.014475,0.003528,-0.00162,-0.003321,-0.025432,...,0.031033,0.007463,0.00609,-0.010033,-0.021208,-0.014928,-0.007068,0.014846,0.017156,0.012781
6,Cindi Fowler (D-14),-0.003474,0.034669,0.009527,0.0062,0.013066,0.002781,-0.000371,-0.00466,-0.025732,...,0.041674,0.006436,0.004462,-0.012428,-0.019393,-0.013908,-0.007082,0.003689,0.016271,0.009411
7,Anthony Piagentini (R-19),-0.001382,0.034011,0.011564,0.00566,0.013205,0.004249,-0.001708,-0.004685,-0.025268,...,0.032941,0.003634,0.00561,-0.009193,-0.022572,-0.013906,-0.006756,0.008636,0.013245,0.013917
8,Bill Hollander (D-9),-0.003519,0.033599,0.010832,0.00661,0.012539,0.002408,-0.001126,-0.004859,-0.024098,...,0.037271,0.012295,0.003493,-0.001306,-0.019023,-0.020283,-0.005181,0.004907,0.022601,0.008768
17,Madonna Flood (D-24),-0.001984,0.033639,0.01221,0.009427,0.013046,0.000839,-0.000562,-0.0049,-0.023778,...,0.032299,0.012721,0.009438,-0.014037,-0.017609,-0.011848,-0.005899,0.008463,0.01844,0.019589


In [51]:
primary_council_voting_records["councilmember"] = primary_council_voting_records.councilmember.apply(
    lambda name: name[:name.index("(") - 1] if "(" in name else name
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_council_voting_records["councilmember"] = primary_council_voting_records.councilmember.apply(


In [19]:
semantic_voting_records.to_parquet("louisville-semantic-voting-records.parquet")

In [52]:
data = primary_council_voting_records[
    primary_council_voting_records.columns[
        ~primary_council_voting_records.columns.isin(["councilmember"])
    ]
]

In [53]:
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show
from umap import UMAP

umap = UMAP()
xtfm = umap.fit_transform(data)

fit_data = pd.DataFrame()
fit_data["x"] = xtfm[:, 0]
fit_data["y"] = xtfm[:, 1]
fit_data["councilmember"] = semantic_voting_records["councilmember"]
fit_data

Unnamed: 0,x,y,councilmember
0,-2.73101,-0.782573,Markus Winkler (D-17)
1,0.165585,-0.15742,Kevin Kramer (R-11)
2,-2.807263,0.224702,Keisha Dorsey (D-3)
3,-0.510107,1.107102,Barbara Sexton Smith (D-4)
4,-2.092953,-1.164049,Scott Reed (R-16)
5,-1.041445,0.511043,Paula McCraney (D-7)
6,-1.498675,1.035133,Cindi Fowler (D-14)
7,-1.103964,0.044531,Anthony Piagentini (R-19)
8,-0.367977,1.506776,Bill Hollander (D-9)
9,-2.372614,-0.624257,Richard Carlson


In [54]:
import bokeh.models as bmo
from bokeh.palettes import d3

source = ColumnDataSource(fit_data)

DEFAULT_TOOLTIP_FORMATTER = """
<div style="max-width: 400px; word-wrap: break-word;">
    @councilmember
</div>
"""

# palette = d3['Category10'][
#     len(semantic_voting_records["councilmember"].unique())
# ]
# color_map = bmo.CategoricalColorMapper(
#     factors=semantic_voting_records["councilmember"].unique(),
#     palette=palette,
# )

p = figure(width=800, height=800, tooltips=DEFAULT_TOOLTIP_FORMATTER)
p.scatter(
    x="x",
    y="y",
    source=source,
    size=10,
    # alpha=0.8,
    # color={'field': 'councilmember', 'transform': color_map},
    # legend_field="councilmember",
)
p.xaxis[0].axis_label = "X"
p.yaxis[0].axis_label = "Y"

output_notebook()
show(p)

In [55]:
import hdbscan

clusterer = hdbscan.HDBSCAN()
clusterer.fit(data)
clusterer.labels_

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [56]:
seattle_semantic_voting_record = pd.read_parquet("seattle-semantic-voting-records.parquet")
seattle_semantic_voting_record["council"] = "seattle"
seattle_semantic_voting_record["color"] = "blue"
primary_council_voting_records["council"] = "louisville"
primary_council_voting_records["color"] = "red"
all_semantic_voting_records = pd.concat([primary_council_voting_records, seattle_semantic_voting_record]).reset_index(drop=True)
all_semantic_voting_records

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_council_voting_records["council"] = "louisville"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_council_voting_records["color"] = "red"


Unnamed: 0,councilmember,approve-mean-feat-0,approve-mean-feat-1,approve-mean-feat-2,approve-mean-feat-3,approve-mean-feat-4,approve-mean-feat-5,approve-mean-feat-6,approve-mean-feat-7,approve-mean-feat-8,...,reject-mean-feat-376,reject-mean-feat-377,reject-mean-feat-378,reject-mean-feat-379,reject-mean-feat-380,reject-mean-feat-381,reject-mean-feat-382,reject-mean-feat-383,council,color
0,Markus Winkler,-0.000534,0.034048,0.011378,0.007166,0.013672,0.003644,-0.001373,-0.003448,-0.025827,...,0.003755,-0.014315,-0.019407,-0.015735,-0.005778,0.010369,0.013626,0.02048,louisville,red
1,Kevin Kramer,-0.004045,0.032941,0.010603,0.003653,0.012023,0.003149,0.000935,-0.005301,-0.025217,...,0.004859,-0.007152,-0.017726,-0.012102,-0.004598,0.008495,0.022688,0.015769,louisville,red
2,Keisha Dorsey,-0.000434,0.033559,0.013494,0.008178,0.012572,0.001795,-0.002687,-0.004253,-0.025092,...,0.007094,-0.015494,-0.021231,-0.014743,-0.00869,0.008664,0.013342,0.016305,louisville,red
3,Barbara Sexton Smith,-0.000713,0.032344,0.008423,0.006301,0.012973,0.002445,1.5e-05,-0.006085,-0.025334,...,0.000557,-0.001061,-0.025336,-0.022165,-0.004386,-0.002584,0.014306,0.007235,louisville,red
4,Scott Reed,-0.003596,0.032405,0.012609,0.008219,0.012468,0.001928,-0.000738,-0.003302,-0.02486,...,0.007754,-0.013472,-0.01838,-0.011606,-0.006369,0.007722,0.015217,0.018867,louisville,red
5,Paula McCraney,-0.002509,0.034737,0.011311,0.00637,0.014475,0.003528,-0.00162,-0.003321,-0.025432,...,0.00609,-0.010033,-0.021208,-0.014928,-0.007068,0.014846,0.017156,0.012781,louisville,red
6,Cindi Fowler,-0.003474,0.034669,0.009527,0.0062,0.013066,0.002781,-0.000371,-0.00466,-0.025732,...,0.004462,-0.012428,-0.019393,-0.013908,-0.007082,0.003689,0.016271,0.009411,louisville,red
7,Anthony Piagentini,-0.001382,0.034011,0.011564,0.00566,0.013205,0.004249,-0.001708,-0.004685,-0.025268,...,0.00561,-0.009193,-0.022572,-0.013906,-0.006756,0.008636,0.013245,0.013917,louisville,red
8,Bill Hollander,-0.003519,0.033599,0.010832,0.00661,0.012539,0.002408,-0.001126,-0.004859,-0.024098,...,0.003493,-0.001306,-0.019023,-0.020283,-0.005181,0.004907,0.022601,0.008768,louisville,red
9,Madonna Flood,-0.001984,0.033639,0.01221,0.009427,0.013046,0.000839,-0.000562,-0.0049,-0.023778,...,0.009438,-0.014037,-0.017609,-0.011848,-0.005899,0.008463,0.01844,0.019589,louisville,red


In [57]:
data = all_semantic_voting_records[
    all_semantic_voting_records.columns[
        ~all_semantic_voting_records.columns.isin(["councilmember", "council", "color"])
    ]
]

In [58]:
umap = UMAP()
xtfm = umap.fit_transform(data)

fit_data = pd.DataFrame()
fit_data["x"] = xtfm[:, 0]
fit_data["y"] = xtfm[:, 1]
fit_data["councilmember"] = all_semantic_voting_records["councilmember"]
fit_data["council"] = all_semantic_voting_records["council"]
fit_data["color"] = all_semantic_voting_records["color"]
fit_data

Unnamed: 0,x,y,councilmember,council,color
0,9.303505,-0.549999,Markus Winkler,louisville,red
1,11.628406,-0.384761,Kevin Kramer,louisville,red
2,9.655279,-1.352985,Keisha Dorsey,louisville,red
3,11.4138,-2.364077,Barbara Sexton Smith,louisville,red
4,9.767409,-0.35589,Scott Reed,louisville,red
5,10.994226,-1.185724,Paula McCraney,louisville,red
6,10.727836,-1.934555,Cindi Fowler,louisville,red
7,11.084494,-0.544086,Anthony Piagentini,louisville,red
8,11.63562,-2.766101,Bill Hollander,louisville,red
9,9.756385,-0.698499,Madonna Flood,louisville,red


In [59]:
source = ColumnDataSource(fit_data)

DEFAULT_TOOLTIP_FORMATTER = """
<div style="max-width: 400px; word-wrap: break-word;">
    @councilmember -- @council
</div>
"""

# palette = d3['Category10'][
#     len(all_semantic_voting_records["council"].unique())
# ]
# color_map = bmo.CategoricalColorMapper(
#     factors=all_semantic_voting_records["council"].unique(),
#     palette=palette,
# )

p = figure(width=800, height=800, tooltips=DEFAULT_TOOLTIP_FORMATTER)
p.scatter(
    x="x",
    y="y",
    source=source,
    size=10,
    alpha=0.8,
    color="color",
    legend_field="council",
)
p.xaxis[0].axis_label = "X"
p.yaxis[0].axis_label = "Y"

output_notebook()
show(p)

In [46]:
fit_data[fit_data.councilmember == "Anthony Piagentini"]

Unnamed: 0,x,y,councilmember,council,color
7,13.913713,18.762556,Anthony Piagentini,louisville,red
