Step 1: import Bluesky Firehose Summer 2025 post data from Snowflake via a cursor (PFC code)

In [5]:
from datasets import Dataset
from datetime import datetime, timedelta
import os
import re
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
from time import sleep
import torch
from transformers import pipeline

### GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS 
### GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS 
### GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS 

# this basically means "smoke em if you got em" where the "em" is NVIDIA GPU
DEVICE = 0 if torch.cuda.is_available() else -1

SF_USR = os.getenv('SF_USR')
SF_ID  = os.getenv('SF_ID')
SF_WH  = os.getenv('SF_WH')
SF_DB  = os.getenv('SF_DB')
SF_SC  = os.getenv('SF_SC')
SF_RL  = os.getenv('SF_RL')

# connect to database and init a cursor for querying
xct_params = {
    "user":                 SF_USR
   ,"account":              SF_ID
   ,"warehouse":            SF_WH
   ,"database":             SF_DB
   ,"schema":               SF_SC
   ,"role":                 SF_RL
   ,"private_key_file":     os.getenv('PRIVATE_KEY_PATH')
   ,"private_key_file_pwd": os.getenv('PRIVATE_KEY_PASSPHRASE')
   ,"authenticator":        os.getenv('SF_AUTH')
}

SF_XCT = snowflake.connector.connect(**xct_params) #connection object
CSR = SF_XCT.cursor()

In [6]:
#import needed libraries
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

Step 2: Query the database. 

Our sampling methodology is an interrupted census (every post for a 5 minute interval, every 3 hours throughout the day), not a random sample, and not a continuous sample.

Therefore for timeseries analysis, we GROUP BY the day field to alow for a continuous timeseries.

In [20]:
query = f"""select
  keyword,
  topic,
  category,
  DAY_POST_CREATED_AT,
  max(MONTH_POST_CREATED_AT) as month_post_created_at,
  max(YEAR_POST_CREATED_AT) as year_post_created_at,
  COUNT(author_thread_mentions) daily_author_thread_mentions,
  SUM(total_posts_with_keyword) as daily_total_posts_keyword,
  SUM(positive_mentions) AS daily_positive_mentions,
  SUM(negative_mentions) AS daily_negative_mentions,
  SUM(neutral_mentions) AS daily_neutral_mentions,
  AVG(pct_positive) as daily_pct_positive,
 AVG(pct_negative) as daily_pct_negative,
 AVG(pct_neutral) as daily_pct_neutral,
from {SF_DB}.{SF_SC}.timeseries_nlp
group by 
keyword
,topic
,category
,day_post_created_at
order by month_post_created_at,day_post_created_at asc;""" 
CSR.execute(query)
df = CSR.fetch_pandas_all()

Step 3: Format Timestamp Column

This step creates a pandas-formatted timestamp that we can use for all the remaining tests in this notebook.

Timeseries data columns (hour, day, month, year) are stored as separate numeric columns in the dataset to allow for other groupins and analyses.

In [21]:
df['TIMESTAMP'] = pd.to_datetime(dict(year=df['YEAR_POST_CREATED_AT'], month=df['MONTH_POST_CREATED_AT'], day=df['DAY_POST_CREATED_AT']))

df = df.set_index('TIMESTAMP')

print(df.head())

           KEYWORD             TOPIC        CATEGORY  DAY_POST_CREATED_AT  \
TIMESTAMP                                                                   
2025-07-17   china  global conflicts  issue specific                   17   
2025-07-17   trump       US domestic  issue specific                   17   
2025-07-17   biden       US domestic  issue specific                   17   
2025-07-17   putin  global conflicts  issue specific                   17   
2025-07-18   putin  global conflicts  issue specific                   18   

            MONTH_POST_CREATED_AT  YEAR_POST_CREATED_AT  \
TIMESTAMP                                                 
2025-07-17                      7                  2025   
2025-07-17                      7                  2025   
2025-07-17                      7                  2025   
2025-07-17                      7                  2025   
2025-07-18                      7                  2025   

            DAILY_AUTHOR_THREAD_MENTIONS DAILY

Step 4: Determine which keyword sentiments show a trend vs a stationary. 

The Augmented Dickey-Fuller Test identifies whether a quantity has a trend within a timeseries.
This test cannot determine the direction of the trend on its own. Those keywords showing a trend will be analyzed with decomposition analysis and autocorrelation.

In [22]:
def get_adf_stats(series: pd.Series):
    """
    Performs the Augmented Dickey-Fuller test and returns statistics
    as dictionary with p-value and stationarity status.
    """
    try:
        result = adfuller(series, autolag='AIC')
        p_value = result[1]
        is_stationary = p_value <= 0.05
        return {'adf_p_value': p_value, 'is_stationary': is_stationary}
    except Exception as e:
        print(f"Error during ADF test: {e}")
        return {'adf_p_value': np.nan, 'is_stationary': np.nan}

Step 5: Perform timeseries decomposition. If there is a trend, this separates out seasonality and residual.

We will also compare the residuals from both additive and multiplicative models. The one with a more random distribution of residuals is the better match to the data.

In [23]:
def get_decomposition_stats(series: pd.Series, model: str, series_name: str):
    """
    Performs seasonal decomposition on a time series using a specified model
    and returns statistics as a dictionary.
    """
    try:
        # Default is 7 day period for weekly seasonality component.
        decomposition = seasonal_decompose(series, model=model, period=7)

        # Populate dictionary output with decomp statistics: trend, seasonal, residual
        trend_stats = {
            'trend_mean': decomposition.trend.mean(),
            'trend_std': decomposition.trend.std()
        }
        
        seasonal_stats = {
            'seasonal_range': decomposition.seasonal.max() - decomposition.seasonal.min()
        }

        residual_stats = {
            'residual_std': decomposition.resid.std()
        }

        # Combine all stats into a single dictionary
        all_decomp_stats = {**trend_stats, **seasonal_stats, **residual_stats}
        
        # Add a prefix to each key to indicate the decomposition model used
        prefixed_decomp_stats = {f"{model}_{series_name}_{key}": value for key, value in all_decomp_stats.items()}
        return prefixed_decomp_stats

    except Exception as e:
        print(f"Error during {model} decomposition for {series_name} series: {e}")
        return {}

In [None]:
#Runs the full timeseries analysis pipeline
def timeseries_analysis_pipeline(df: pd.DataFrame):
    """
    Runs a statistical analysis pipeline on the timeseries DataFrame ('df' by default here).
    It expects a DataFrame with a 'timestamp' column for daily time series index
    and the 'KEYWORD' column for grouping the data. Keywords are determined by table object in Snowflake (steets schema) prior to analysis.
    It will perform ADF tests
    and conditional decomposition on the specified data columns.
    """
    # Define the metric columns for analysis-- both percentages and thread counts.
    time_series_columns = [
        'DAILY_PERCENT_POSITIVE',
        'DAILY_PERCENT_NEGATIVE',
        'DAILY_POSITIVE_MENTIONS',
        'DAILY_NEGATIVE_MENTIONS'
    ]
    
    # Define the two model typs we are running
    decomposition_models = ['additive', 'multiplicative']
    
    # Group the DataFrame by keyword for a row-by-row analysis
    grouped = df.groupby('KEYWORD')
    
    #store results
    pipeline_results = []
    
    for KEYWORD, group_df in grouped:
        # Sort by TIMESTAMP to ensure the time series is in order
        group_df = group_df.set_index('TIMESTAMP').sort_index()
        
        # Create a base dictionary for this keyword's results
        row_stats = {'KEYWORD': KEYWORD}
        
        # Loop through each specified time series column
        for col in time_series_columns:
            series = group_df[col]
            
            # First Pass: Run the ADF test
            adf_stats = get_adf_stats(series)
            
            # Add ADF results to the dictionary with a column-specific prefix
            for key, value in adf_stats.items():
                row_stats[f"{col}_{key}"] = value

            # Second Pass: Conditionally run the decomposition tests
            if not adf_stats['is_stationary']:
                print(f"Running decomposition for '{KEYWORD}' on column '{col}'...")
                for model in decomposition_models:
                    stats = get_decomposition_stats(series, model, col)
                    row_stats.update(stats)
            else:
                print(f"Skipping decomposition for '{KEYWORD}' on column '{col}' (stationary).")
                # Create placeholder columns with NA values
                for model in decomposition_models:
                    for stat_key in ['trend_mean', 'trend_std', 'seasonal_range', 'residual_std']:
                        row_stats[f"{model}_{col}_{stat_key}"] = np.nan
        
        pipeline_results.append(row_stats)
    
    # Convert the list of dictionaries to the final summary DataFrame
    final_df = pd.DataFrame(pipeline_results)
    
    print("--- Final Summary Table with Conditional Test Results ---")
    print(final_df.to_string())
    return final_df

In [31]:
timeseries_analysis_pipeline(df, keyword_column='KEYWORD', timestamp_column='TIMESTAMP')

TypeError: timeseries_analysis_pipeline() got an unexpected keyword argument 'keyword_column'