The Augmented Dickey-Fuller (ADF) test is an important stationarity check for time series data. We will perform this test on the time series of thread-based sentiment for each keyword.

Begun 7 Sept 2025.

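An ADF test in this case tells us whether we can reject the null hypothesis that the daily sentiment series for the chosen keyword has a unit root, i.e. that it is non-stationary and its average level drifts over time. Rejecting the null (p-value at or below 0.05) is evidence that the series is stationary, meaning there is no persistent trend or change in the average sentiment of posts over time; failing to reject it means we cannot rule out such a drift.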

I want to make the keyword choice customizable from a dropdown list but for now it is on the sample keywords data.

In [None]:
from datasets import Dataset
from datetime import datetime, timedelta
import os
import re
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
from time import sleep
import torch
from transformers import pipeline

### GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS 
### GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS 
### GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS | GLOBAL VARS 

# Device index: 0 when torch reports a CUDA (NVIDIA) GPU, otherwise -1.
# NOTE(review): presumably consumed by a transformers pipeline; it is not
# used in the cells visible here.
DEVICE = -1 if not torch.cuda.is_available() else 0

# Snowflake connection settings, each read from an environment variable
# (None when the variable is unset).
SF_USR = os.environ.get('SF_USR')
SF_ID = os.environ.get('SF_ID')
SF_WH = os.environ.get('SF_WH')
SF_DB = os.environ.get('SF_DB')
SF_SC = os.environ.get('SF_SC')
SF_RL = os.environ.get('SF_RL')

# Keyword arguments for snowflake.connector.connect (key-file based login).
xct_params = {
    "user": SF_USR,
    "account": SF_ID,
    "warehouse": SF_WH,
    "database": SF_DB,
    "schema": SF_SC,
    "role": SF_RL,
    "private_key_file": os.environ.get('PRIVATE_KEY_PATH'),
    "private_key_file_pwd": os.environ.get('PRIVATE_KEY_PASSPHRASE'),
    "authenticator": os.environ.get('SF_AUTH'),
}

# Open the connection and create the cursor used by the query cells.
SF_XCT = snowflake.connector.connect(**xct_params)
CSR = SF_XCT.cursor()

In [None]:
# The time series is not sampled continuously at the minute or hour scale,
# so the ADF test is run on values aggregated to one row per calendar day.
#
# The query is modified from the standard form to aggregate per day. It
# groups on year, month AND day: DAY_POST_CREATED_AT holds only the day of
# the month, so grouping on it alone would merge e.g. 17 July and 17 August
# into a single row (and max() of month/year would then label it with the
# wrong date). Ordering by year, month, day returns rows chronologically.
#
# NOTE(review): daily_author_thread_mentions uses COUNT while the other
# daily totals use SUM -- confirm that is intended.
query = f"""select
  keyword,
  topic,
  category,
  day_post_created_at,
  month_post_created_at,
  year_post_created_at,
  COUNT(author_thread_mentions) as daily_author_thread_mentions,
  SUM(total_posts_with_keyword) as daily_total_posts_keyword,
  SUM(positive_mentions) as daily_positive_mentions,
  SUM(negative_mentions) as daily_negative_mentions,
  SUM(neutral_mentions) as daily_neutral_mentions,
  AVG(pct_positive) as daily_pct_positive,
  AVG(pct_negative) as daily_pct_negative,
  AVG(pct_neutral) as daily_pct_neutral
from {SF_DB}.{SF_SC}.timeseries_nlp
group by
  keyword,
  topic,
  category,
  year_post_created_at,
  month_post_created_at,
  day_post_created_at
order by
  year_post_created_at,
  month_post_created_at,
  day_post_created_at;"""
CSR.execute(query)
# Pull the whole result set into a pandas DataFrame.
df = CSR.fetch_pandas_all()



In [6]:
# Preview the first five rows of the daily aggregates.
print(df.head(5))

  KEYWORD             TOPIC        CATEGORY  DAY_POST_CREATED_AT  \
0   trump       US domestic  issue specific                   17   
1   china  global conflicts  issue specific                   17   
2   biden       US domestic  issue specific                   17   
3   putin  global conflicts  issue specific                   17   
4   putin  global conflicts  issue specific                   18   

   MONTH_POST_CREATED_AT  YEAR_POST_CREATED_AT  DAILY_AUTHOR_THREAD_MENTIONS  \
0                      7                  2025                            22   
1                      7                  2025                            17   
2                      7                  2025                            16   
3                      7                  2025                            17   
4                      7                  2025                            17   

  DAILY_TOTAL_POSTS_KEYWORD  DAILY_POSITIVE_MENTIONS  DAILY_NEGATIVE_MENTIONS  \
0                      3999  

In [7]:
#import needed libraries
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

In [None]:
# Build a real date from the separate year / month / day columns supplied
# by the SQL view and make it the index of the frame.
df = df.assign(
    timestamp=pd.to_datetime(
        {
            'year': df['YEAR_POST_CREATED_AT'],
            'month': df['MONTH_POST_CREATED_AT'],
            'day': df['DAY_POST_CREATED_AT'],
        }
    )
).set_index('timestamp')

print(df.head())

           KEYWORD             TOPIC        CATEGORY  DAY_POST_CREATED_AT  \
timestamp                                                                   
2025-07-17   trump       US domestic  issue specific                   17   
2025-07-17   china  global conflicts  issue specific                   17   
2025-07-17   biden       US domestic  issue specific                   17   
2025-07-17   putin  global conflicts  issue specific                   17   
2025-07-18   putin  global conflicts  issue specific                   18   

            MONTH_POST_CREATED_AT  YEAR_POST_CREATED_AT  \
timestamp                                                 
2025-07-17                      7                  2025   
2025-07-17                      7                  2025   
2025-07-17                      7                  2025   
2025-07-17                      7                  2025   
2025-07-18                      7                  2025   

            DAILY_AUTHOR_THREAD_MENTIONS DAILY

In [10]:
#Define function to run the Augmented Dickey-Fuller:

def run_adf(series, alpha=0.05):
    """
    Performs the ADF test on one series and returns the results as a dictionary.

    The ADF null hypothesis is that the series has a unit root (it is
    non-stationary). A p-value at or below 'alpha' rejects that null, which
    is reported here as 'is_stationary' = True.

    'series' can be any daily aggregate quantity (e.g. DAILY_PCT_POSITIVE for
    one keyword), given in chronological order. Non-finite entries (NaN, for
    example from NULL daily averages) are dropped before testing.

    'alpha' is the significance level; it defaults to 0.05.

    Returns a dict with keys 'ADF_Statistic', 'p_value' and 'is_stationary'.
    Errors raised by adfuller itself (NOTE(review): e.g. for constant or
    too-short input -- confirm against the statsmodels docs) are not caught.
    """
    # Coerce to a float array and discard missing values so they never reach adfuller.
    values = np.asarray(series, dtype=float)
    values = values[np.isfinite(values)]

    result = adfuller(values)
    p_value = result[1]
    return {
        'ADF_Statistic': result[0],
        'p_value': p_value,
        # Plain Python bool rather than numpy.bool_.
        'is_stationary': bool(p_value <= alpha)
    }

In [12]:
#Defining keywords for testing: every distinct keyword, in order of first appearance
keywords_for_ADF = df['KEYWORD'].unique()
ADF_results_list = []

#loop through all keywords and test each one
for keyword in keywords_for_ADF:
    #first, filter for a keyword.
    #ADF is order-sensitive, so sort the rows chronologically by the timestamp
    #index here rather than relying on the row order returned by the upstream
    #query. mergesort is stable: rows sharing a date keep their relative order,
    #and already-ordered data is left unchanged.
    keyword_df = df[df['KEYWORD'] == keyword].sort_index(kind='mergesort')

    #second, run ADF on that keyword's positive and negative daily shares
    pct_positive_results = run_adf(keyword_df['DAILY_PCT_POSITIVE'])
    pct_negative_results = run_adf(keyword_df['DAILY_PCT_NEGATIVE'])

    #third, collect one summary row per keyword
    results_dict = {
        'keyword': keyword,
        'pct_positive_p_value': pct_positive_results['p_value'],
        'pct_positive_is_stationary': pct_positive_results['is_stationary'],
        'pct_negative_p_value': pct_negative_results['p_value'],
        'pct_negative_is_stationary': pct_negative_results['is_stationary'],
    }

    ADF_results_list.append(results_dict)

#one row per keyword
keyword_ADF_results_df = pd.DataFrame(ADF_results_list)

In [13]:
# Preview the first five rows of the per-keyword ADF summary.
print(keyword_ADF_results_df.head(5))

  keyword  pct_positive_p_value  pct_positive_is_stationary  \
0   trump              0.000014                        True   
1   china              0.455396                       False   
2   biden              0.000005                        True   
3   putin              0.000098                        True   

   pct_negative_p_value  pct_negative_is_stationary  
0          2.188902e-04                        True  
1          4.169019e-04                        True  
2          4.519153e-09                        True  
3          7.675325e-04                        True  
