In [1]:
import pandas as pd
import numpy as np

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

from nltk.corpus import stopwords
import nltk

from scipy.sparse import csr_matrix

In [2]:
tweet_df = pd.read_csv('../data/tweet_df.csv')
tweet_df_fac = pd.read_csv('../data/tweet_df_fac.csv')
finance_clusters = pd.read_csv('../../council_campaign_finance/data/winning_members_clusters.csv')

In [15]:
tweet_df.head()

Unnamed: 0.1,Unnamed: 0,date,username,text,full_name
0,0,2023-03-14T23:23:01.000Z,CMCrystalHudson,Thank you Aging Chair @CMCrystalHudson for hol...,Crystal Hudson
1,1,2023-03-14T22:56:52.000Z,EricDinowitzNYC,@cmmvelaz https://t.co/Cbl2pQAWz0,Eric Dinowitz
2,2,2023-03-14T22:50:55.000Z,NYCSpeakerAdams,Creating healthier and more sustainable public...,Adrienne E. Adams
3,3,2023-03-14T22:35:06.000Z,cmmvelaz,Today I toured the Parkchester/VanNest &amp; M...,Marjorie Velázquez
4,4,2023-03-14T22:34:36.000Z,cmmvelaz,📣 RALLY ANNOUNCEMENT: NYC Council Members @CMM...,Marjorie Velázquez


In [3]:
name_list = tweet_df['full_name'].unique()

In [21]:
finance_clusters.head()

Unnamed: 0,normalized_mem,count,total_amount,mean_amount,home_district_percentage,oos_percentage,perc_IND_value,perc_total,perc_highest,cluster,finance_cluster,matched_name
0,Adrienne Adams,713,75300.08,105.61021,31.45275,16.078984,76.703696,61.530863,100.0,4,4,Adrienne E. Adams
1,Alexa Aviles,1699,78261.53,46.06329,16.746988,18.614458,86.896592,34.628641,100.0,1,1,Alexa Avilés
2,Althea V Stevens,606,37898.65,62.539026,10.252101,20.504202,71.531572,34.445552,100.0,1,1,Althea V. Stevens
3,Amanda Farias,1101,60003.68,54.499255,8.140611,10.360777,84.117395,30.875704,100.0,1,1,Amanda Farías
4,Ari Kagan,592,70489.0,119.069257,39.414802,2.237522,85.947521,50.508646,100.0,2,2,Ari Kagan


In [4]:
def match_names(name, names_list):
    match, score = process.extractOne(name, names_list, scorer=fuzz.token_sort_ratio)
    return match if score >= 90 else name

In [5]:
finance_clusters['matched_name'] = finance_clusters['normalized_mem'].apply(lambda x: match_names(x, name_list.tolist()))

In [6]:
fc = finance_clusters[['matched_name','cluster']]

In [7]:
tweet_fc = tweet_df.merge(fc, left_on='full_name', right_on='matched_name', how='left')

In [8]:
tweet_fc = tweet_fc[['date','text','matched_name','cluster']]

In [9]:
# count tweets by month
tweet_fc['date'] = pd.to_datetime(tweet_fc['date'])
tweet_fc['month'] = tweet_fc['date'].dt.to_period('M')

  tweet_fc['month'] = tweet_fc['date'].dt.to_period('M')


In [51]:
tweet_fc.dropna(subset=['matched_name'], inplace=True)

In [33]:
tweet_fc.sample(10)

Unnamed: 0,date,text,matched_name,cluster,month
15600,2022-07-09 12:34:00+00:00,Eid al-Adha teaches us the meaning of selfless...,Sandra Ung,2.0,2022-07
134469,2023-10-07 23:54:02+00:00,My statement on today’s attacks and violence i...,Sandy Nurse,1.0,2023-10
49760,2022-05-06 18:24:11+00:00,LIVE NOW in Morris Park with @NYCMayor @bronxb...,Oswald Feliz,0.0,2022-05
73618,2023-08-08 14:32:05+00:00,💃🏻I am proud to be partnering with @AcaciaNetw...,"Rafael Salamanca, Jr.",3.0,2023-08
21338,2022-04-28 13:46:08+00:00,"Wifi is a utility, not a luxury. Black New Yor...",Julie Won,1.0,2022-04
69745,2021-04-22 19:56:29+00:00,Today is #EarthDay. Here is how the City Counc...,Shaun Abreu,2.0,2021-04
33223,2023-01-16 17:50:25+00:00,"Con verdadero espíritu laboral, MLK estaba en ...",Carmen N. De La Rosa,2.0,2023-01
95199,2023-07-29 21:42:37+00:00,Unfortunately we are postponing tonight’s movi...,Sandy Nurse,1.0,2023-07
34968,2022-12-12 16:02:36+00:00,We’re drowning in single-use plastic!\n\nToday...,Shaun Abreu,2.0,2022-12
144558,2021-09-21 14:14:09+00:00,The @MTA is seeking new bus operators! These p...,Adrienne E. Adams,4.0,2021-09


In [10]:
def preprocess_text(text, stopwords):
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text.lower())  
    text = re.sub(r'[^\w\s]', '', text)  
    tokens = [word for word in text.split() if word not in stopwords]  
    return ' '.join(tokens)

In [52]:
df = tweet_fc.copy()

In [53]:
# only look at tweets before june 2021
df = df[df['date'] < '2021-07-01']

In [54]:
nltk.download('stopwords')
standard_stopwords = set(stopwords.words('english'))
custom_stopwords = {'us','day','amp','thank','new','today','us','im','get','council','district','need','great',
                    'work','nyc','city','join','support','office','proud','time','see','public','people','happy',
                    'make','help','many','thanks','first','must','year','one','communities','together','last',
                    'like','please','much','york','yorkers','every','de','colleagues','event','forward','love''members',
                     'bronx','brooklyn','manhattan','queens','staten','island','community','important','team','week',
                      'love','continue','joined','free','tomorrow', 'years','know', 'vote', 'lets','back','congratulations',
                       'campaign','endorsement', 'neighbors','honored','keep','also','good','right','sign',
                        'local', 'look', 'still','working', 'look', 'looking', 'congressman', '15','yes','go',
                         'south','hunts','point','1','everyone','come','stand','way','cant'} 
all_stopwords = standard_stopwords.union(custom_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samtg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
df['processed_text'] = df['text'].apply(lambda x: preprocess_text(x, all_stopwords))

Count words to improve stopwords

In [40]:
all_words = ' '.join(df['processed_text']).split()
word_freq = Counter(all_words)
print(word_freq.most_common(20))

[('workers', 1310), ('fight', 1118), ('justice', 942), ('housing', 781), ('vaccine', 693), ('health', 663), ('well', 661), ('dont', 649), ('pandemic', 643), ('black', 631), ('always', 612), ('families', 581), ('better', 571), ('women', 571), ('safe', 562), ('state', 548), ('park', 519), ('may', 517), ('family', 510), ('would', 509)]


tf-idf

In [56]:
df[['date','text','cluster','processed_text']].sample(10)

Unnamed: 0,date,text,cluster,processed_text
66989,2021-04-20 17:34:49+00:00,I joined several of my colleagues in the @NYCC...,4.0,several leaders department parks recreation em...
112279,2021-03-04 17:51:25+00:00,TONIGHT: NYC DOT is presenting a proposal to m...,1.0,tonight dot presenting proposal 7th 8th aves s...
67944,2021-04-04 14:51:12+00:00,"He was a formerly outcast refugee, and working...",1.0,formerly outcast refugee workingclass carpente...
90723,2021-01-22 03:03:29+00:00,By far one of my fave meme's!! LOL.,4.0,far fave memes lol
112543,2021-03-29 22:23:21+00:00,A majority of New Yorkers agree: It's time to ...,4.0,majority agree tax rich invest fund vital serv...
85620,2021-02-02 05:08:16+00:00,https://t.co/Krq3iLWiuK,2.0,
123731,2021-05-22 18:44:46+00:00,"Thank you to Doug Montgomery, St. Anastasia Ch...",2.0,doug montgomery st anastasia church douglaston...
87804,2021-02-01 18:55:12+00:00,The careers and livelihoods of arts workers ar...,2.0,careers livelihoods arts workers threatened co...
107240,2021-06-16 15:36:09+00:00,@MGrossman92 @NYCMayor @scottmstringer @KGforN...,2.0,
104207,2021-06-18 16:32:28+00:00,Honored to join @RepRitchie and the Union Comm...,0.0,union health center mobile dental unit ribbon ...


In [57]:
cluster_texts = df.groupby('cluster')['processed_text'].apply(lambda x: ' '.join(x)).reset_index()
tfidf_vectorizer = TfidfVectorizer()

In [58]:
tfidf_matrix = tfidf_vectorizer.fit_transform(cluster_texts['processed_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=cluster_texts['cluster'])

In [59]:
top_n = 10 
characteristic_words = {}
for cluster in tfidf_df.index:
    sorted_words = tfidf_df.loc[cluster].sort_values(ascending=False).head(top_n)
    characteristic_words[cluster] = sorted_words.index.tolist()

In [60]:
print(characteristic_words)

{0.0: ['workers', 'candidate', 'best', 'voters', 'better', 'deserve', 'home', 'grateful', 'fight', 'councilman'], 1.0: ['fight', 'workers', 'justice', 'black', 'housing', 'women', 'la', 'folks', 'en', 'fighting'], 2.0: ['workers', 'fight', 'voters', 'health', 'pandemic', 'hate', 'justice', '2021', 'endorsed', 'covid'], 3.0: ['testing', 'food', 'happening', 'residents', '1070', 'covid', 'appointment', 'southern', 'site', 'housing'], 4.0: ['workers', 'vaccine', 'pandemic', 'covid19', 'fight', 'families', 'park', 'health', 'june', 'street']}
