In [1]:
import pandas as pd
import numpy as np

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

from nltk.corpus import stopwords
import nltk


In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One shared VADER analyzer instance; the sentiment cells call sia.polarity_scores(...).
sia = SentimentIntensityAnalyzer()

In [3]:
# Input locations are relative to the notebook's working directory.
tweets_csv_path = '../data/tweet_df.csv'
clusters_csv_path = '../../cc_election_cleaning/clusters_with_names_dec4.csv'

# tweet_df: one row per tweet; clusters: one row per person with a k-modes cluster label
# (column usage below suggests this layout — TODO confirm against the files).
tweet_df = pd.read_csv(tweets_csv_path)
clusters = pd.read_csv(clusters_csv_path)

In [4]:
# Distinct author names found in the tweets; used as the candidate pool for fuzzy name matching.
name_list = tweet_df['full_name'].unique()

In [5]:
def match_names(name, names_list, threshold=90):
    """Map ``name`` onto its closest fuzzy match in ``names_list``.

    Uses fuzzywuzzy's ``token_sort_ratio`` scorer, so word order is ignored.

    Parameters
    ----------
    name : str
        The name to look up. Non-string values (e.g. NaN from a missing cell)
        are returned unchanged instead of being passed to the matcher.
    names_list : sequence of str
        Candidate names to match against.
    threshold : int, default 90
        Minimum score (0-100) a candidate must reach to be accepted.

    Returns
    -------
    The best-matching candidate when its score is >= ``threshold``;
    otherwise ``name`` itself.
    """
    if not isinstance(name, str):
        return name

    try:
        if len(names_list) == 0:
            return name
    except TypeError:
        # Unsized iterable (e.g. a generator): let the matcher consume it.
        pass

    best = process.extractOne(name, names_list, scorer=fuzz.token_sort_ratio)
    if best is None:
        # extractOne returns None when there is nothing to match against.
        return name

    # Index instead of unpacking: dict-like choices yield a 3-tuple (match, score, key).
    match, score = best[0], best[1]
    return match if score >= threshold else name

In [25]:
def preprocess_text(text, stopwords):
    """Normalise a tweet into a space-joined string of content tokens.

    Steps, in order:
      1. lowercase the text;
      2. strip URLs (``http...`` / ``www...``), @mentions and #hashtags;
      3. strip every remaining character that is neither a word character
         nor whitespace (so "I'm" becomes "im");
      4. split on whitespace and drop tokens found in ``stopwords``.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string input (e.g. NaN for a missing tweet)
        yields an empty string instead of raising.
    stopwords : collection of str
        Lowercase tokens to remove. Note: inside this function the parameter
        shadows the ``nltk.corpus.stopwords`` import of the same name.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text.lower())
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stopwords)

In [7]:
# Build the candidate list once instead of calling name_list.tolist() for every row.
# Non-string entries (e.g. NaN from missing full_name values) are dropped because the
# fuzzy matcher can only compare strings.
candidate_names = [candidate for candidate in name_list.tolist() if isinstance(candidate, str)]

# twitter_name: the tweet-author spelling closest to matched_name, or matched_name itself
# when no candidate clears match_names' score threshold.
clusters['twitter_name'] = clusters['matched_name'].apply(lambda x: match_names(x, candidate_names))

In [9]:
# Lookup table: twitter-side name -> k-modes cluster label.
# NOTE(review): if two cluster rows share a twitter_name, the left merge that uses this
# table will duplicate tweets — confirm twitter_name is unique here.
lookup_columns = ['twitter_name', 'kmode_cluster']
fc = clusters[lookup_columns]

In [20]:
# Attach each tweet author's cluster label; tweets whose author has no cluster row keep NaN
# in twitter_name / kmode_cluster (left join).
tweet_fc = pd.merge(
    tweet_df,
    fc,
    how='left',
    left_on='full_name',
    right_on='twitter_name',
)

In [21]:
# Keep only the columns the analysis needs. The explicit .copy() makes this an independent
# frame, so adding or overwriting columns on it later does not trigger pandas'
# SettingWithCopyWarning.
tweet_fc = tweet_fc[['date','text','twitter_name','kmode_cluster']].copy()

In [22]:
# assign() returns a new frame rather than writing columns into what may be a slice of the
# merged frame — the in-place column writes are what raise SettingWithCopyWarning.
# 'date' is parsed to datetime first; 'month' is then derived from the parsed column.
tweet_fc = tweet_fc.assign(date=lambda frame: pd.to_datetime(frame['date']))
tweet_fc = tweet_fc.assign(month=lambda frame: frame['date'].dt.to_period('M'))

  tweet_fc['month'] = tweet_fc['date'].dt.to_period('M')


In [24]:
# Drop tweets whose author could not be linked to a cluster row. Rebinding the name
# (instead of inplace=True) keeps the cell safe to re-run and avoids mutating a frame
# that may be a slice of another one.
tweet_fc = tweet_fc.dropna(subset=['twitter_name'])

TF-IDF

In [26]:
# Work on an independent copy so the TF-IDF steps do not modify tweet_fc.
df = tweet_fc.copy()

In [27]:
# Keep tweets dated before 1 July 2021 (presumably the end of the campaign window —
# TODO confirm the cutoff). The .copy() makes the filtered frame independent so the
# processed_text column can be added to it without a SettingWithCopyWarning.
df = df[df['date'] < '2021-07-01'].copy()

In [58]:
# Make sure the NLTK stopword corpus is available locally before stopwords.words() is called.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samtg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
# NLTK's English stopword list.
standard_stopwords = set(stopwords.words('english'))

# Domain-specific filler that dominates campaign tweets (greetings, place names,
# generic civic vocabulary) and would otherwise swamp the per-cluster TF-IDF rankings.
# Fix: 'love' and 'members' were previously written without a comma between them, so
# Python concatenated them into the single token 'lovemembers' and 'members' was never
# filtered. Duplicate entries were dropped; the set contents are otherwise unchanged.
custom_stopwords = {
    'us', 'day', 'amp', 'thank', 'new', 'today', 'im', 'get', 'council', 'district', 'need', 'great',
    'work', 'nyc', 'city', 'join', 'support', 'office', 'proud', 'time', 'see', 'public', 'people', 'happy',
    'make', 'help', 'many', 'thanks', 'first', 'must', 'year', 'one', 'communities', 'together', 'last',
    'like', 'please', 'much', 'york', 'yorkers', 'every', 'de', 'colleagues', 'event', 'forward', 'love', 'members',
    'bronx', 'brooklyn', 'manhattan', 'queens', 'staten', 'island', 'community', 'important', 'team', 'week',
    'continue', 'joined', 'free', 'tomorrow', 'years', 'know', 'vote', 'voter', 'voters', 'lets', 'back', 'congratulations',
    'campaign', 'endorsement', 'neighbors', 'honored', 'keep', 'also', 'good', 'right', 'sign',
    'local', 'look', 'still', 'working', 'looking', 'congressman', '15', 'yes', 'go',
    'south', 'hunts', 'point', '1', 'everyone', 'come', 'stand', 'way', 'cant', 'well', 'open', 'would',
    'always', 'dont', 'better', 'take', 'workers', 'vaccine', 'elmhurst', 'holden', 'sure',
}

# Combined stopword set passed to preprocess_text.
all_stopwords = standard_stopwords.union(custom_stopwords)

In [48]:
# Cleaned, stopword-free version of every tweet (extra keyword arguments to
# Series.apply are forwarded to preprocess_text).
df['processed_text'] = df['text'].apply(preprocess_text, stopwords=all_stopwords)

In [49]:
# Flatten every processed tweet into one token list, then count corpus-wide frequencies
# as a sanity check on what survives the stopword filter.
all_words = [token for tweet in df['processed_text'] for token in tweet.split()]
word_freq = Counter(all_words)
print(word_freq.most_common(20))

[('fight', 903), ('justice', 703), ('housing', 545), ('pandemic', 543), ('health', 499), ('women', 480), ('families', 476), ('safe', 472), ('park', 462), ('black', 442), ('food', 441), ('next', 432), ('family', 417), ('fighting', 416), ('excited', 412), ('want', 399), ('state', 398), ('may', 396), ('voting', 392), ('call', 390)]


In [50]:
# One "document" per cluster: all of that cluster's processed tweets joined by spaces.
cluster_texts = (
    df.groupby('kmode_cluster')['processed_text']
      .agg(' '.join)
      .reset_index()
)

In [51]:
# TF-IDF vectorizer with scikit-learn's default settings.
tfidf_vectorizer = TfidfVectorizer()

In [52]:
# Fit on the per-cluster documents, then expose the scores as a
# cluster (rows) x vocabulary term (columns) table.
cluster_docs = cluster_texts['processed_text']
tfidf_matrix = tfidf_vectorizer.fit_transform(cluster_docs)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=cluster_texts['kmode_cluster'],
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [53]:
# For every cluster, keep the top_n terms with the highest TF-IDF score.
top_n = 10
characteristic_words = {}
for cluster, term_scores in tfidf_df.iterrows():
    sorted_words = term_scores.sort_values(ascending=False).head(top_n)
    characteristic_words[cluster] = sorted_words.index.tolist()

In [54]:
# Transpose so each row is a cluster and the columns are rank positions (0 = highest score).
characteristic_words_df = pd.DataFrame(characteristic_words).T

In [55]:
# Display the top terms per cluster.
characteristic_words_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0.0,fight,justice,black,housing,health,pandemic,women,covid19,la,safe
1.0,cuomo,state,park,said,pandemic,governor,life,even,hate,thats
2.0,may,early,family,pandemic,voting,got,never,families,fight,hate
3.0,fight,justice,housing,park,movement,fighting,climate,schools,women,care
4.0,corona,food,families,residents,fight,pandemic,volunteers,vaccinated,testing,best


This is not telling us much, so let's look at specific topic categories and then at sentiment.

In [None]:
# Topic keyword lists. Each list is meant to be OR-joined into a case-insensitive pattern
# and matched against raw tweet text, so entries match as substrings: 'cop' also hits
# 'cops' or 'scope', and 'bus' already covers 'bus rider'.
cop_words = ['cop', 'police', 'nypd','policing','public safety','plainclothes']
transit_words = ['bus','bus rider','train','subway','mta','transit']
bike_words = ['bike','bike lane','protected bike lane','cycle','cyclist']
housing_words = ['rent','housing']
educ_words = ['education','teachers','schools','students']
# Fix: '1199seiou' was a typo for the union 1199SEIU and could never match a tweet.
unions = ['dc37','1199seiu']

In [57]:
# Tweets mentioning any policing keyword (case-insensitive, substring match — so 'cop'
# also matches inside longer words such as 'scope').
# na=False treats a missing tweet text as "no match"; without it the mask contains NaN
# and boolean indexing raises.
cop_pattern = '|'.join(cop_words)
cop_tweets = tweet_fc[tweet_fc['text'].str.contains(cop_pattern, case=False, na=False)]

In [108]:
# Keep tweets dated before 1 July 2021. The .copy() detaches the result from tweet_fc so
# that new columns can be added to cop_tweets without a SettingWithCopyWarning.
cop_tweets = cop_tweets[cop_tweets['date'] < '2021-07-01'].copy()

In [109]:
# Policing vocabulary is removed as well, so the ranking shows what is said *about*
# policing rather than the search terms themselves.
# NOTE(review): 'public safety' contains a space and so can never equal a single
# whitespace-split token; it appears to have no effect.
cop_stops = [
    'nypd', 'police', 'cop', 'cops', 'policing', 'public safety', 'safety',
    'officer', 'coney', 'safe', 'officers', 'violence', 'ty', '12',
]
all_stops_with_cop = all_stopwords | set(cop_stops)

In [110]:
# Cleaned text for the policing subset, using the extended stopword set.
cop_tweets['processed_text'] = cop_tweets['text'].map(
    lambda tweet: preprocess_text(tweet, all_stops_with_cop)
)

In [111]:
# One document per cluster, built from the policing tweets only.
cluster_texts = (
    cop_tweets.groupby('kmode_cluster')['processed_text']
              .agg(' '.join)
              .reset_index()
)

In [112]:
# Fresh vectorizer so the vocabulary and IDF weights come from the policing tweets only.
tfidf_vectorizer = TfidfVectorizer()
cluster_docs = cluster_texts['processed_text']
tfidf_matrix = tfidf_vectorizer.fit_transform(cluster_docs)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=cluster_texts['kmode_cluster'],
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [113]:
# Top-scoring TF-IDF terms per cluster for the policing subset.
top_n = 10
characteristic_words = {}
for cluster, term_scores in tfidf_df.iterrows():
    sorted_words = term_scores.sort_values(ascending=False).head(top_n)
    characteristic_words[cluster] = sorted_words.index.tolist()

# Rows = clusters, columns = rank positions.
characteristic_words_df = pd.DataFrame(characteristic_words).T

In [114]:
# Display the top policing-related terms per cluster.
characteristic_words_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0.0,black,justice,housing,ccrb,hearing,committee,development,reform,invest,george
1.0,criminals,issues,via,neighborhoods,ncos,hate,another,maspeth,veterans,antipolice
2.0,hate,young,happen,guy,feel,floyd,news,family,multilingual,responsible
3.0,black,budget,health,invest,brown,services,students,housing,justice,schools
4.0,civic,hate,attacks,residents,action,prepared,fresh,association,food,anyone


Let's pull out some example tweets.

In [102]:
# Do not truncate long cell contents, so complete tweets are visible in displayed frames.
pd.set_option('display.max_colwidth', None)

In [115]:
# Policing tweets written by members of cluster 3.
in_cluster_three = cop_tweets['kmode_cluster'] == 3
cluster_three = cop_tweets[in_cluster_three]

In [118]:
# Narrow cluster 3's policing tweets to those that also mention any search word
# (case-insensitive substring match on the raw text).
search_words = ['housing']
search_pattern = '|'.join(search_words)
mentions_search_word = cluster_three['text'].str.contains(search_pattern, case=False)
found_tweets = cluster_three[mentions_search_word]

In [119]:
# Show up to 10 example tweets. The sample size is capped at the number of matches
# (sample() raises when asked for more rows than exist) and random_state is fixed so
# the same examples appear on every run.
example_columns = ['text','processed_text','twitter_name']
found_tweets[example_columns].sample(n=min(10, len(found_tweets)), random_state=42)

Unnamed: 0,text,processed_text,twitter_name
86514,"5/5 We must respond to this tragedy by rejecting more unnecessary violence. \n\nThat means funding essential services, like housing and healthcare, and fighting for a people's budget — an economy that works for everyone, not just the wealthy, not just the cops.",55 respond tragedy rejecting unnecessary means funding essential services housing healthcare fighting peoples budget economy works wealthy,Alexa Avilés
121907,"I'm proud to be a @BoldProgressive &amp; receive the Progressive Change Campaign Committee's co-endorsement!\n\nI'll fight at the forefront of progressive change in the City Council, from divesting from NYPD to guaranteeing housing and healthcare ✊🏽\n\nJOIN US: https://t.co/gb0gFc9znx https://t.co/heYP6biAHA",receive progressive change committees coendorsement ill fight forefront progressive change divesting guaranteeing housing healthcare,Shahana K. Hanif
108549,"Yes, we are *all* concerned with public safety. \n\nThe peddled falsehood: Majority see more policing as the solution. \n\nThe truth: Majority believes community violence intervention programs, drug &amp; mental health programs, &amp; housing are more effective.\n\nhttps://t.co/RqEGtWu67x",concerned peddled falsehood majority solution truth majority believes intervention programs drug mental health programs housing effective,Tiffany Cabán
105646,We must:\n✅ Win a #GreenNewDeal\n✅ Fully fund &amp; integrate our public schools\n✅ #DefundNYPD\n✅ Guarantee healthcare and housing for all\n✅ Protect undocumented New Yorkers\n✅ and so much more.\n\nI'm committed to all of these fights. A NYC that centers dignity and care is possible.,win fully fund integrate schools guarantee healthcare housing protect undocumented committed fights centers dignity care possible,Shahana K. Hanif
121828,"True community safety doesn’t come from the NYPD, which targets, brutalizes, and murders Black and brown New Yorkers. It comes from investments in communities — affordable housing, mental health care, youth programming, accessible open spaces, &amp; food security. (2/7)",true doesnt targets brutalizes murders black brown comes investments affordable housing mental health care youth programming accessible spaces food security 27,Shahana K. Hanif
86565,"We know that Black communities are over-policed but lack vital services, like quality healthcare, housing and education.\n\nLet this year's #BlackLivesMatter protests be a reminder: Budgets show our priorities. It's time New York City is proud of ours. #DefundNYPD.\n\n5/5",black overpoliced lack vital services quality healthcare housing education let protests reminder budgets show priorities 55,Alexa Avilés
66852,"“Conviction records are lasting vestige of War on Drugs &amp; broken windows policing.”\n\n750K NYers have records, which limit govt benefits &amp; is major source of discrimination in housing &amp; employment.\n\nGreat op-Ed by @EricaMBond on need to clear old records.\n\nhttps://t.co/lgWPy0Jbf5",conviction records lasting vestige war drugs broken windows 750k nyers records limit govt benefits major source discrimination housing employment oped clear old records,Lincoln Restler
104093,"Thank you @CouragetoChange for endorsing our campaign for City Council! Together, we can reimagine our approaches to environmental justice, workers’ rights, and public safety and housing and bring transformative change to Brooklyn and our City.\n\nJoin us: https://t.co/6iKaTf1oRr https://t.co/z5Zv3lNlnA",endorsing reimagine approaches environmental justice rights housing bring transformative change,Lincoln Restler
105538,"I'd be honored to be your #1 choice for City Council so we can:\n✅ Divest from NYPD and invest in community safety\n✅ Win a #GreenNewDeal\n✅ Guarantee housing and healthcare for all\n✅ Create a feminist future\n\nPOLLS ARE OPEN TILL 9PM, find your poll site: https://t.co/VBRsyQCLA7 https://t.co/oNER2IVJ9z",id choice divest invest win guarantee housing healthcare create feminist future polls till 9pm find poll site,Shahana K. Hanif
112955,"As a Council Member, I will work to get people out of shelters and into permanent housing, end HIV/AIDS stigma, fund overdose prevention programs, win a #HomesGuarantee and a #CCNewDeal, and #DefundNYPD.\n\n@VOCALNewYork makes a caring and compassionate City possible.",member shelters permanent housing end hivaids stigma fund overdose prevention programs win makes caring compassionate possible,Shahana K. Hanif


Cop Sentiment 

In [120]:
# VADER compound score of the raw tweet text.
cop_tweets['vader_sentiment'] = cop_tweets['text'].map(
    lambda tweet: sia.polarity_scores(tweet)['compound']
)

In [121]:
# VADER compound score of the preprocessed text.
# NOTE(review): preprocessing strips punctuation, case and stopwords (NLTK's list
# includes negations such as 'not' and 'no'), all of which VADER uses as cues —
# this score may therefore be less reliable than the raw-text one; confirm before
# drawing conclusions from it.
cop_tweets['pp_vader_sentiment'] = cop_tweets['processed_text'].map(
    lambda cleaned: sia.polarity_scores(cleaned)['compound']
)

In [122]:
# Mean VADER compound score per cluster, for the preprocessed and the raw text side by side.
sentiment_columns = ['pp_vader_sentiment', 'vader_sentiment']
ct_centiment = (
    cop_tweets
    .groupby('kmode_cluster')[sentiment_columns]
    .mean()
    .rename(columns={'pp_vader_sentiment': 'pp_vader_mean', 'vader_sentiment': 'vader_mean'})
    .reset_index()
)
ct_centiment

Unnamed: 0,kmode_cluster,pp_vader_mean,vader_mean
0,0.0,-0.036439,0.091676
1,1.0,-0.108222,0.108157
2,2.0,-0.047427,0.162272
3,3.0,-0.043922,0.102156
4,4.0,0.052521,0.284464
