In [1]:
import pandas as pd
import numpy as np

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

from nltk.corpus import stopwords
import nltk
import os
import json

In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [3]:
tweet_df = pd.read_csv('../citycouncil_tweets/data/tweet_df.csv')
clusters = pd.read_csv('../cc_election_cleaning/clusters_with_names_dec4.csv')

In [4]:
def json_to_dataframe(json_data):
    flattened_data = []
    for item in json_data:
        if 'data' in item and 'search_by_raw_query' in item['data'] and 'search_timeline' in item['data']['search_by_raw_query']:
            timeline = item['data']['search_by_raw_query']['search_timeline']
            if 'timeline' in timeline and 'instructions' in timeline['timeline']:
                instructions = timeline['timeline']['instructions']
                for instruction in instructions:
                    if instruction['type'] == 'TimelineAddEntries' and 'entries' in instruction:
                        for entry in instruction['entries']:
                            if 'content' in entry and 'itemContent' in entry['content']:
                                tweet_content = entry['content']['itemContent']
                                if tweet_content['itemType'] == 'TimelineTweet' and 'tweet_results' in tweet_content:
                                    tweet_info = tweet_content['tweet_results']['result']
                                    if 'core' in tweet_info and 'user_results' in tweet_info['core']:
                                        user_info = tweet_info['core']['user_results']['result']

                                        tweet_id = tweet_info.get('rest_id', '')
                                        user_id = user_info.get('rest_id', '')
                                        username = user_info.get('legacy', {}).get('screen_name', '')
                                        description = user_info.get('legacy', {}).get('description', '')
                                        tweet_text = tweet_info.get('legacy', {}).get('full_text', '')
                                        tweet_date = tweet_info.get('legacy', {}).get('created_at', '') 

                                        flattened_data.append({
                                            'tweet_id': tweet_id,
                                            'user_id': user_id,
                                            'username': username,
                                            'description': description,
                                            'tweet_text': tweet_text,
                                            'tweet_date': tweet_date 
                                        })

    return pd.DataFrame(flattened_data)

def combine_json_files_to_dataframe(directory_path):
    all_dataframes = []  
    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):  
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
                df = json_to_dataframe(json_data)
                all_dataframes.append(df)

    combined_dataframe = pd.concat(all_dataframes, ignore_index=True)
    return combined_dataframe


In [5]:
directory_path = '../citycouncil_tweets/data/losers'
loser_df = combine_json_files_to_dataframe(directory_path)

In [6]:
caban_df = tweet_df[tweet_df['full_name'] == 'Tiffany Cabán']

In [7]:
caban_df.drop(columns=['Unnamed: 0'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  caban_df.drop(columns=['Unnamed: 0'], inplace=True)


In [8]:
evie_df = loser_df[loser_df['username'] == 'evie4us']

In [10]:
evie_df = evie_df[['tweet_date','username','tweet_text']]

In [11]:
evie_df['full_name'] = 'Evie Hantzopoulos'

In [9]:
caban_df.head(1)

Unnamed: 0,date,username,text,full_name
14,2023-03-14T21:12:27.000Z,CabanD22,📣 RALLY ANNOUNCEMENT: NYC Council Members @CM...,Tiffany Cabán


In [12]:
evie_df.head(1)

Unnamed: 0,tweet_date,username,tweet_text,full_name
0,Sat Jul 10 18:17:12 +0000 2021,evie4us,@LeylaDoss I'm speechless.,Evie Hantzopoulos


In [13]:
evie_df.columns = ['date', 'username', 'text', 'full_name']

In [5]:
name_list = tweet_df['full_name'].unique()

In [6]:
def match_names(name, names_list):
    match, score = process.extractOne(name, names_list, scorer=fuzz.token_sort_ratio)
    return match if score >= 90 else name

In [7]:
def preprocess_text(text, stopwords):
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text.lower())  
    text = re.sub(r'[^\w\s]', '', text)  
    tokens = [word for word in text.split() if word not in stopwords]  
    return ' '.join(tokens)

In [14]:
caban_df['date'] = pd.to_datetime(caban_df['date'])
caban_df['month'] = caban_df['date'].dt.to_period('M')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  caban_df['date'] = pd.to_datetime(caban_df['date'])
  caban_df['month'] = caban_df['date'].dt.to_period('M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  caban_df['month'] = caban_df['date'].dt.to_period('M')


In [15]:
evie_df['date'] = pd.to_datetime(evie_df['date'])
evie_df['month'] = evie_df['date'].dt.to_period('M')

  evie_df['date'] = pd.to_datetime(evie_df['date'])
  evie_df['month'] = evie_df['date'].dt.to_period('M')


In [16]:
district_df = pd.concat([caban_df, evie_df], ignore_index=True)

In [17]:
district_df.to_csv('../data/district_tweets.csv')

OSError: Cannot save file into a non-existent directory: '..\data'