### Use "high-ram"

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import datetime as dt
%matplotlib inline
import matplotlib.pyplot as plt

from itertools import chain
from functools import reduce

In [None]:
# Load the full tweet table from CSV. lineterminator='\n' makes the parser treat
# only '\n' as the end of a record (presumably so carriage returns embedded in
# tweet text do not split rows — TODO confirm).
df = pd.read_csv("../full_v5_text.csv", lineterminator = '\n')

In [None]:
def preprocess_dataframe(df):
    """Add date-part and numeric-sentiment columns and return a reordered subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns 'created_at' and 'category_pred' plus
        the pass-through columns listed in ``output_columns`` below.
        'created_at' is assumed to start with ``YYYY-MM`` (ISO-style
        timestamp) — TODO confirm against the raw data.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``output_columns`` in that order.
        The input frame is left unmodified.

    Notes
    -----
    'category_pred' values other than 'Positive', 'Neutral' and 'Negative'
    map to NaN in 'category_num'.
    """
    # Numeric encoding of the predicted sentiment label.
    cat_dict = {'Positive': 10, 'Neutral': 0, 'Negative': -10}

    output_columns = ['tweet_id', 'text', "category_pred", 'category_num', 'created_at', 'user_id',
                      'user_loc', 'like_count', 'retweet_count', 'GEO_ID', 'STATE_FIPS', 'year',
                      'year-month']

    created_at = df["created_at"]
    derived = {
        "year": created_at.str[:4],
        # First 7 characters are 'YYYY-MM'; slicing 10 would give a full date.
        "year-month": created_at.str[:7],
        "category_num": df["category_pred"].map(cat_dict),
    }

    # Select the pass-through columns first so only the needed data is copied,
    # then attach the derived columns on the copy instead of on the caller's frame.
    source_columns = [col for col in output_columns if col not in derived]
    return df[source_columns].assign(**derived)[output_columns]

# Rebind `df` to the preprocessed frame; the cells below work on this version.
df = preprocess_dataframe(df)

In [None]:
# Tweet counts per numeric sentiment value (10 = Positive, 0 = Neutral, -10 = Negative).
# value_counts() omits NaN by default, so labels missing from the mapping are not shown here.
df['category_num'].value_counts()

 10    5154974
 0     1824577
-10    1044958
Name: category_num, dtype: int64

### Datasets by States and Cities

In [None]:
def aggregate_byyear_bygeography(df, year, geography):
    """Summarise one year's sentiment per geographic unit, weighting users equally.

    The aggregation runs in two stages: tweets are first collapsed to one row
    per user (mean score, tweet count, last non-null geography value), and
    those user rows are then collapsed per geography. The resulting
    'sent_score' is therefore a mean of per-user means.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'year', 'user_id', 'category_num' and the column
        named by ``geography``.
    year : same type as the values in df['year']
        Rows are kept where ``df['year'] == year``.
    geography : str
        Name of the column to group by (e.g. 'GEO_ID' or 'STATE_FIPS').

    Returns
    -------
    pandas.DataFrame
        Columns [geography, 'year', 'sent_score', 'tweet_count', 'user_count'],
        sorted by 'user_count' descending, with a fresh 0..n-1 index.
    """
    output_columns = [geography, 'year', 'sent_score', 'tweet_count', 'user_count']

    # Stage 1: one row per user for the requested year.
    # 'count' only counts rows whose category_num is not null.
    per_user = (
        df[df["year"] == year]
        .groupby('user_id')
        .agg(
            score_mean=('category_num', 'mean'),
            tweet_count=('category_num', 'count'),
            **{geography: (geography, 'last')},
        )
        .reset_index()
    )

    # Stage 2: one row per geographic unit.
    per_geography = (
        per_user
        .groupby(geography)
        .agg(
            sent_score=('score_mean', 'mean'),
            tweet_count=('tweet_count', 'sum'),
            user_count=('user_id', 'count'),
        )
        .reset_index()
        .assign(year=year)
    )

    # Keep only units with at least one user, largest user base first.
    has_users = per_geography['user_count'] >= 1
    ranked = per_geography[has_users].sort_values(by='user_count', ascending=False)

    return ranked[output_columns].reset_index(drop=True)

## Cities

In [None]:
# Years to aggregate, as strings: "2013" through "2022" inclusive
years = [str(calendar_year) for calendar_year in range(2013, 2023)]

In [None]:
# City-level aggregation: one summary table per year, stacked into a single frame.
geography = "GEO_ID"

# Build every yearly table first and concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all accumulated rows on each iteration and
# needs an empty placeholder frame as a seed.
df_byyear_bycity = pd.concat(
    [aggregate_byyear_bygeography(df, year, geography) for year in years],
    ignore_index=True,
)

df_byyear_bycity.to_csv("../sent_allyears_allcities_v6.csv", index=False)

## States

In [None]:
# State-level aggregation: one summary table per year, stacked into a single frame.
geography = "STATE_FIPS"

# Build every yearly table first and concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all accumulated rows on each iteration and
# needs an empty placeholder frame as a seed.
df_byyear_bystate = pd.concat(
    [aggregate_byyear_bygeography(df, year, geography) for year in years],
    ignore_index=True,
)

# Save the final DataFrame to a CSV file
df_byyear_bystate.to_csv("../sent_allyears_allstates_v6.csv", index=False)

In [None]:
# Sanity check: total number of tweets counted across all states and years.
df_byyear_bystate.tweet_count.sum()

8024509