In [1]:
import pandas as pd

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim import corpora

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [4]:
from sklearn.cluster import KMeans

In [5]:
from collections import Counter

In [6]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [7]:
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
add_stop = ['local', 'law', 'chair', 'land', 'comittee','application','oversight','committee','application','number','land use',
            'resolution','communication','new','york','city','council','councilmember','district','amend','code','administrative',
            'relation', 'calling', 'state', 'requiring', 'department', 'pass', 'sign', 'block', 'legislature', 'charter', 'governor',
             'section', 'property', 'lot', 'pursuant', 'approving', 'upon', 'establishing', 'borough', 'certain', 'commission', 'amendment', 
             'public', 'act', 'located', 'program', 'real', 'c', 'n','legislation', 'exemption', 'report', 'submitted','introduce',
             'preconsidered','manhattan','staten','island','bronx','brooklyn','queens','borough']

In [9]:
def preprocess_data(documents):
    stop_words = set(stopwords.words('english'))
    stop_words.update(add_stop)

    tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]
    processed_docs = [[word for word in doc if word not in stop_words and word.isalpha()] for doc in tokenized_docs]
    return processed_docs

In [10]:
def top_terms_for_member(member, n=5):
    if member in tfidf_df.index:
        print(f"Top {n} terms for {member}:")
        member_tfidf = tfidf_df.loc[member]
        top_terms = member_tfidf.nlargest(n)
        for term in top_terms.index:
            print(term)
    else:
        print(f"No data for individual: {member}")

In [11]:
def join_tokens(token_list):
    return ' '.join(token_list)

#### Impot and clean legislation tables

In [12]:
try:
    leg_2024 = pd.read_csv("../data/cy2023csv.csv", encoding='ISO-8859-1')
except UnicodeDecodeError:
    try:
        leg_2024 = pd.read_csv("../data/cy2023csv.csv", encoding='cp1252')
    except UnicodeDecodeError:
        pass

In [13]:
try:
    leg_2023 = pd.read_csv('../data/cy2023csv.csv', encoding='ISO-8859-1')
except UnicodeDecodeError:
    try:
        leg_2023 = pd.read_csv('../data/cy2023csv.csv', encoding='cp1252')
    except UnicodeDecodeError:
        pass

In [14]:
try:
    leg_2022 = pd.read_csv('../data/cy2022csv.csv', encoding='ISO-8859-1')
except UnicodeDecodeError:
    try:
        leg_2022 = pd.read_csv('../data/cy2022csv.csv', encoding='cp1252')
    except UnicodeDecodeError:
        pass

In [15]:
all_leg = pd.concat([leg_2022,leg_2023, leg_2024])

In [16]:
all_leg['processed_text'] = all_leg['Title'].apply(lambda x: preprocess_data([x])[0])

In [17]:
all_leg = all_leg[['Type','Status','Committee','Prime Sponsor','processed_text']]
all_leg = all_leg.rename(columns={'Prime Sponsor':'prime_sponsor'})
all_leg = all_leg.dropna()

In [17]:
all_tokens = [token for sublist in all_leg['processed_text'] for token in sublist]
word_freq = Counter(all_tokens)

most_common_words = word_freq.most_common(40)  

In [89]:
common_words_list = [word for word, freq in most_common_words]
common_words_string = ', '.join(f"'{word}'" for word in common_words_list)
print(common_words_string)

'community', 'housing', 'zoning', 'street', 'planning', 'development', 'services', 'avenue', 'map', 'decision', 'school', 'use', 'designation', 'health', 'sections', 'area', 'education', 'private', 'tax', 'within', 'lots', 'finance', 'would', 'special', 'provide', 'preservation', 'llc', 'rezoning', 'member', 'ulurp', 'taxes', 'service', 'information', 'reporting', 'building', 'action', 'changing', 'repeal', 'prohibiting', 'urban'


In [18]:
all_leg['joined_text'] = all_leg['processed_text'].apply(join_tokens)

In [19]:
all_leg.to_csv('../data/processed_leg.csv', index=False)

#### tf-idf

In [73]:
grouped_df = all_leg.groupby('prime_sponsor')['joined_text'].apply(' '.join).reset_index()

In [20]:
tfidf = TfidfVectorizer()

In [74]:
tfidf_matrix = tfidf.fit_transform(grouped_df['joined_text'])

In [75]:
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=grouped_df['prime_sponsor'])

In [76]:
top_terms_for_member('Rita C. Joseph',5)

Top 5 terms for Rita C. Joseph:
teams
students
bathrooms
student
introduce


#### clustering 

In [96]:
num_clusters = 5 
km = KMeans(n_clusters=num_clusters)

In [97]:
km.fit(tfidf_df)

  super()._check_params_vs_input(X, default_n_init=10)


In [98]:
clusters = km.labels_.tolist()

In [101]:
name_cluster_mapping = {name: cluster for name, cluster in zip(tfidf_df.index, clusters)}

In [108]:
name = "Tiffany Cabán"  
cluster_label = name_cluster_mapping.get(name, None)
if cluster_label is not None:
    print(f"{name} belongs to cluster {cluster_label}")
else:
    print(f"{name} is not found in any cluster")

Tiffany Cabán is not found in any cluster


In [107]:
cluster_name_mapping = {}
for name, cluster in zip(tfidf_df.index, clusters):
    if cluster in cluster_name_mapping:
        cluster_name_mapping[cluster].append(name)
    else:
        cluster_name_mapping[cluster] = [name]

cluster_label_to_check = 1
if cluster_label_to_check in cluster_name_mapping:
    names_in_cluster = cluster_name_mapping[cluster_label_to_check]
    print(f"Names in cluster {cluster_label_to_check}:")
    for name in names_in_cluster:
        print(name)
else:
    print(f"No names found in cluster {cluster_label_to_check}")

Names in cluster 1:
Bill Perkins
Daniel Dromm
David M. Carr
Erik D. Bottcher
Kalman Yeger
Keith Powers
Mark Levine
Sandy Nurse
Shaun Abreu
Vickie Paladino


##### LDA

In [110]:
dictionary = corpora.Dictionary(all_leg['processed_text'])

In [111]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in all_leg['processed_text']]

In [113]:
LDA = gensim.models.ldamodel.LdaModel
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=5, random_state=100, chunksize=1000, passes=50)

In [114]:
for i, topic in lda_model.print_topics(-1):
    print(f"Topic {i}: {topic}\n")

Topic 0: 0.023*"planning" + 0.021*"decision" + 0.017*"zoning" + 0.013*"map" + 0.013*"ulurp" + 0.009*"building" + 0.009*"increase" + 0.008*"designation" + 0.008*"text" + 0.008*"training"

Topic 1: 0.038*"community" + 0.024*"development" + 0.024*"housing" + 0.018*"zoning" + 0.018*"avenue" + 0.016*"street" + 0.015*"sections" + 0.015*"area" + 0.012*"rezoning" + 0.012*"llc"

Topic 2: 0.017*"school" + 0.010*"states" + 0.010*"education" + 0.010*"president" + 0.010*"congress" + 0.009*"united" + 0.008*"parking" + 0.008*"services" + 0.007*"authority" + 0.007*"schools"

Topic 3: 0.021*"housing" + 0.017*"services" + 0.016*"private" + 0.013*"finance" + 0.013*"repeal" + 0.010*"taxes" + 0.010*"force" + 0.010*"study" + 0.009*"reporting" + 0.009*"health"

Topic 4: 0.012*"would" + 0.011*"service" + 0.011*"police" + 0.008*"provide" + 0.007*"year" + 0.007*"health" + 0.007*"community" + 0.007*"mental" + 0.007*"water" + 0.007*"management"



In [119]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, doc_term_matrix, dictionary)

In [143]:
def assign_topics(document):
    doc_topics = lda_model.get_document_topics(document)
    if doc_topics:
        max_prob_topic = max(doc_topics, key=lambda x: x[1])
        return max_prob_topic[0]  
    else:
        return -1

In [150]:
# add col to all_leg with topic number
all_leg['topic'] = all_leg['processed_text'].apply(lambda x: assign_topics(dictionary.doc2bow(x)))

#### BERT

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

model.safetensors: 100%|██████████| 440M/440M [01:00<00:00, 7.30MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [21]:
processed_texts = all_leg['joined_text']

In [23]:
bert_inputs = []

for text in processed_texts:
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    bert_inputs.append(inputs)

In [None]:
bert_outputs = []

for inputs in bert_inputs:
    outputs = model(**inputs)
    bert_outputs.append(outputs)