In [84]:
import pandas as pd
import numpy as np

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

from nltk.corpus import stopwords
import nltk
import os
import json

In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [4]:
tweet_df = pd.read_csv('../../citycouncil_tweets/data/tweet_df.csv')
clusters = pd.read_csv('../../cc_election_cleaning/clusters_with_names_dec4.csv')

In [5]:
def json_to_dataframe(json_data):
    flattened_data = []
    for item in json_data:
        if 'data' in item and 'search_by_raw_query' in item['data'] and 'search_timeline' in item['data']['search_by_raw_query']:
            timeline = item['data']['search_by_raw_query']['search_timeline']
            if 'timeline' in timeline and 'instructions' in timeline['timeline']:
                instructions = timeline['timeline']['instructions']
                for instruction in instructions:
                    if instruction['type'] == 'TimelineAddEntries' and 'entries' in instruction:
                        for entry in instruction['entries']:
                            if 'content' in entry and 'itemContent' in entry['content']:
                                tweet_content = entry['content']['itemContent']
                                if tweet_content['itemType'] == 'TimelineTweet' and 'tweet_results' in tweet_content:
                                    tweet_info = tweet_content['tweet_results']['result']
                                    if 'core' in tweet_info and 'user_results' in tweet_info['core']:
                                        user_info = tweet_info['core']['user_results']['result']

                                        tweet_id = tweet_info.get('rest_id', '')
                                        user_id = user_info.get('rest_id', '')
                                        username = user_info.get('legacy', {}).get('screen_name', '')
                                        description = user_info.get('legacy', {}).get('description', '')
                                        tweet_text = tweet_info.get('legacy', {}).get('full_text', '')
                                        tweet_date = tweet_info.get('legacy', {}).get('created_at', '') 

                                        flattened_data.append({
                                            'tweet_id': tweet_id,
                                            'user_id': user_id,
                                            'username': username,
                                            'description': description,
                                            'tweet_text': tweet_text,
                                            'tweet_date': tweet_date 
                                        })

    return pd.DataFrame(flattened_data)

def combine_json_files_to_dataframe(directory_path):
    all_dataframes = []  
    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):  
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
                df = json_to_dataframe(json_data)
                all_dataframes.append(df)

    combined_dataframe = pd.concat(all_dataframes, ignore_index=True)
    return combined_dataframe


In [7]:
directory_path = '../../citycouncil_tweets/data/losers'
loser_df = combine_json_files_to_dataframe(directory_path)

In [8]:
caban_df = tweet_df[tweet_df['full_name'] == 'Tiffany Cabán']

In [9]:
caban_df.drop(columns=['Unnamed: 0'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  caban_df.drop(columns=['Unnamed: 0'], inplace=True)


In [10]:
evie_df = loser_df[loser_df['username'] == 'evie4us']

In [11]:
evie_df = evie_df[['tweet_date','username','tweet_text']]

In [12]:
evie_df['full_name'] = 'Evie Hantzopoulos'

In [13]:
evie_df.columns = ['date', 'username', 'text', 'full_name']

In [24]:
def preprocess_text(text, stopwords):
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text.lower())  
    text = re.sub(r'[^\w\s]', '', text)  
    tokens = [word for word in text.split() if word not in stopwords]  
    return ' '.join(tokens)

In [14]:
caban_df['date'] = pd.to_datetime(caban_df['date'])
caban_df['month'] = caban_df['date'].dt.to_period('M')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  caban_df['date'] = pd.to_datetime(caban_df['date'])
  caban_df['month'] = caban_df['date'].dt.to_period('M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  caban_df['month'] = caban_df['date'].dt.to_period('M')


In [15]:
evie_df['date'] = pd.to_datetime(evie_df['date'])
evie_df['month'] = evie_df['date'].dt.to_period('M')

  evie_df['date'] = pd.to_datetime(evie_df['date'])
  evie_df['month'] = evie_df['date'].dt.to_period('M')


In [16]:
district_df = pd.concat([caban_df, evie_df], ignore_index=True)

In [18]:
district_df.to_csv('../data/caban_evie_tweets.csv')

tf-idf

In [19]:
df = district_df.copy()

In [20]:
df = df[df['date'] < '2021-07-01']

In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samtg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
standard_stopwords = set(stopwords.words('english'))
custom_stopwords = {'us','day','amp','thank','new','today','us','im','get','council','district','need','great',
                    'work','nyc','city','join','support','office','proud','time','see','public','people','happy',
                    'make','help','many','thanks','first','must','year','one','communities','together','last',
                    'like','please','much','york','yorkers','every','de','colleagues','event','forward','love''members',
                     'bronx','brooklyn','manhattan','queens','staten','island','community','important','team','week',
                      'love','continue','joined','free','tomorrow', 'years','know', 'vote','voter','voters', 'lets','back','congratulations',
                       'campaign','endorsement', 'neighbors','honored','keep','also','good','right','sign',
                        'local', 'look', 'still','working', 'look', 'looking', 'congressman', '15','yes','go',
                         'south','hunts','point','1','everyone','come','stand','way','cant','well','open','would',
                         'always','dont','better','take','workers','vaccine','elmhurst','holden','sure','22','folks',
                         'astoria','w','use','ive','want','done','candidates','ty','maybe','morning'} 
all_stopwords = standard_stopwords.union(custom_stopwords)

In [57]:
df['processed_text'] = df['text'].apply(lambda x: preprocess_text(x, all_stopwords))

In [58]:
all_words = ' '.join(df['processed_text']).split()
word_freq = Counter(all_words)
print(word_freq.most_common(20))

[('fight', 56), ('police', 53), ('safety', 50), ('violence', 42), ('justice', 41), ('health', 40), ('housing', 40), ('plan', 37), ('movement', 35), ('care', 35), ('meeting', 35), ('safe', 29), ('black', 29), ('grateful', 29), ('real', 28), ('yall', 26), ('going', 26), ('family', 26), ('got', 25), ('lives', 25)]


In [59]:
name_texts = df.groupby('full_name')['processed_text'].apply(lambda x: ' '.join(x)).reset_index()

In [60]:
tfidf_vectorizer = TfidfVectorizer()

In [41]:
tfidf_matrix =tfidf_vectorizer.fit_transform(name_texts['processed_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=name_texts['full_name'])

In [42]:
top_n = 10 
characteristic_words = {}
for cluster in tfidf_df.index:
    sorted_words = tfidf_df.loc[cluster].sort_values(ascending=False).head(top_n)
    characteristic_words[cluster] = sorted_words.index.tolist()

In [43]:
characteristic_words_df = pd.DataFrame(characteristic_words).T

In [44]:
characteristic_words_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Evie Hantzopoulos,meeting,housing,land,affordable,school,street,read,going,arts,set
Tiffany Cabán,movement,fight,police,safety,black,violence,yall,health,plan,justice


logistic

In [61]:
df['full_name'].value_counts()

full_name
Tiffany Cabán        837
Evie Hantzopoulos    807
Name: count, dtype: int64

In [62]:
X = df['processed_text']
y = df['full_name']
vectorizer = TfidfVectorizer(max_features=1000) 
X_tfidf = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [63]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [64]:
accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {accuracy:.2f}")

Model accuracy: 0.71


In [65]:
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_

if coefficients.shape[0] == 1:  # Binary classification
    importance_df = pd.DataFrame(coefficients.T, index=feature_names, columns=[model.classes_[1]])
    # Add a column for the other class (negative coefficients)
    importance_df[model.classes_[0]] = -coefficients.T
else:  # Multiclass classification
    importance_df = pd.DataFrame(coefficients.T, index=feature_names, columns=model.classes_)


In [66]:
for name in model.classes_:
    print(f"Top words for {name}:")
    print(importance_df[name].sort_values(ascending=False).head(10))
    print()

Top words for Evie Hantzopoulos:
school        1.340045
land          1.315158
developers    1.263449
lot           1.215782
affordable    1.204018
news          1.170233
street        1.129710
issue         1.125367
really        1.090846
platform      1.002170
Name: Evie Hantzopoulos, dtype: float64

Top words for Tiffany Cabán:
movement    1.928428
yall        1.832839
fight       1.631922
police      1.507899
safety      1.441504
black       1.274014
talk        1.215474
dope        1.137017
politics    1.073495
violence    1.044922
Name: Tiffany Cabán, dtype: float64



Topic specific

In [78]:
cop_words = ['cop', 'police', 'nypd','policing','public safety','plainclothes']
housing_words = ['housing', 'eviction', 'tenant', 'rent', 'affordable', 'developers','land','lot']

In [79]:
housing_tweets = df[df['processed_text'].str.contains('|'.join(housing_words))]

In [68]:
cop_tweets = df[df['processed_text'].str.contains('|'.join(cop_words))]

In [80]:
housing_tweets['full_name'].value_counts()

full_name
Evie Hantzopoulos    108
Tiffany Cabán         50
Name: count, dtype: int64

In [70]:
cop_tweets['sentiment'] = cop_tweets['text'].apply(lambda x: sia.polarity_scores(x)['compound'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cop_tweets['sentiment'] = cop_tweets['text'].apply(lambda x: sia.polarity_scores(x)['compound'])


In [81]:
housing_tweets['sentiment'] = housing_tweets['text'].apply(lambda x: sia.polarity_scores(x)['compound'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_tweets['sentiment'] = housing_tweets['text'].apply(lambda x: sia.polarity_scores(x)['compound'])


In [72]:
# change pd options to display full text
pd.set_option('display.max_colwidth', None)

In [83]:
housing_tweets[['full_name', 'text', 'sentiment']].sample(10)

Unnamed: 0,full_name,text,sentiment
3478,Evie Hantzopoulos,"""Why can't our land zoning process look more like participatory budgeting?"" asks @ka_hackett #HousingDist22",0.4201
3383,Evie Hantzopoulos,"Topic: Land Use Zoning Committee 3-24-2021 Meeting Time: March 24, 2021 06:30 PM Eastern Time (US and Canada)Join Zoom Meeting: https://t.co/GPyOxCvQ6S ID: 944 1629 9485Passcode: 858219 One tap mobile: +16465588656,,94416299485#,,,,*858219# US (New Yor",0.0
3304,Evie Hantzopoulos,@Tellythecairn Ironically it was many members of the small business community who fought it but I think they would see it differently now.,-0.1655
2329,Tiffany Cabán,"The food in Queens reigns supreme, right @jaslinforqueens? \n\nCome out and help us get on the ballot, then stay for the eats! \nQGTM and the sigs!",0.7835
1205,Tiffany Cabán,"Over $70k for just one of these NYPD robotic dogs. Meanwhile, agencies across the city that deliver essential services and programs are facing cuts. #DefundThePolice \n\nIf this isn’t absolutely abhorrent to you...🤯",-0.7645
3483,Evie Hantzopoulos,"What is a rent stabilized apt? If you’re rent stabilized, the landlord doesn’t get to decide what the rent increase will be. #HousingDist22",0.3182
1245,Tiffany Cabán,"Portland has a majority Black &amp; brown City Council for 1st time in history. Now that majority is voting to combat rising gun violence, not w more police, but w millions to community based violence interruption orgs.\n⁦@NYCCouncil⁩ ⁦@NYCMayor⁩ 👀 https://t.co/8iaaiPbzUR",-0.9306
2240,Tiffany Cabán,"@CatalinaCruzNY is a fierce fighter for immigrant justice, tenant protections, workers rights, and incredible ally to the LGBTQ community. I am honored to receive her endorsement! #cabanforcouncil https://t.co/rRfuxIcuZl",0.8858
3175,Evie Hantzopoulos,Hey #NYCSchools Parents! TODAY IS THE LAST DAY TO vote in community education council (CEC) elections! If you’re a NYC public school parent with a child in K-8 you can vote: https://t.co/QY1q3BUhGa,0.0
3740,Evie Hantzopoulos,CB1 Queens Land Use meeting tonight at 6:30 includes pre-cert presentation for new Hallets Cove rezoning 1400 units and 475 parking spaces. \nhttps://t.co/63UBedlK6z,0.0


bi-gram

In [89]:
vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
X_bigrams = vectorizer.fit_transform(df['text'])

In [90]:
bigram_df = pd.DataFrame(X_bigrams.toarray(), columns=vectorizer.get_feature_names_out())
bigram_df['full_name'] = df['full_name']
bigram_summary = bigram_df.groupby('full_name').sum()

rf

In [92]:
from sklearn.ensemble import RandomForestClassifier

In [93]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [95]:
feature_importances = rf_model.feature_importances_


In [97]:
feature_importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})

In [100]:
feature_importances_df.sort_values('importance', ascending=False).head(10)

Unnamed: 0,feature,importance
583,movement,0.017092
987,yall,0.015717
315,fight,0.014675
670,police,0.011614
773,safety,0.009169
478,land,0.008497
785,school,0.007579
883,talk,0.007557
557,meeting,0.007108
857,street,0.006597
