In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import statsmodels.tsa.stattools as smt
from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm

In [2]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and model-fitting
# warnings raised by the libraries used below are hidden as well — consider
# narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

# Helper Functions

In [3]:
def prewhiten(X, Y, ar_order=3):
    """
    Prewhiten two signals with an AR model fitted on Y.

    An AR(ar_order) model is fitted to Y; Y is replaced by the model
    residuals and the same AR filter is applied to the mean-centered X.
    No differencing is done here: callers are expected to pass series that
    are already stationary.

    Parameters:
    X, Y: input signals (1-D array-likes; converted to float numpy arrays,
          so pandas Series are handled positionally regardless of index)
    ar_order: AR model order (default=3)

    Returns:
    X_prewhite, Y_prewhite: prewhitened signals, tail-aligned to the same
    length. When X and Y have equal length n, both outputs have length
    n - ar_order (the first ar_order samples cannot be filtered).
    """
    from statsmodels.tsa.arima.model import ARIMA

    # Coerce to plain float arrays so every index below is positional.
    X_work = np.asarray(X, dtype=float)
    Y_work = np.asarray(Y, dtype=float)

    # Fit the AR model to Y; its residuals are the prewhitened Y.
    model_Y = ARIMA(Y_work, order=(ar_order, 0, 0)).fit()
    ar_params = np.asarray(model_Y.arparams, dtype=float)
    Y_prewhite = np.asarray(model_Y.resid)

    # Apply Y's AR filter to the centered X:
    #   X_f[t] = X_c[t] - sum_k ar_params[k-1] * X_c[t-k],  t >= ar_order
    # A "valid" convolution with [1, -phi_1, ..., -phi_p] yields exactly
    # those samples, i.e. the leading ar_order samples are dropped.
    X_centered = X_work - np.mean(X_work)
    if len(X_centered) > ar_order:
        ar_filter = np.r_[1.0, -ar_params]
        X_prewhite = np.convolve(X_centered, ar_filter, mode="valid")
    else:
        # Not enough samples to filter even a single point.
        X_prewhite = np.empty(0, dtype=float)

    # Keep the last min_len samples of each signal. Slicing from an explicit
    # start index (instead of [-min_len:]) stays correct when min_len is 0,
    # where a [-0:] slice would return the whole array.
    min_len = min(len(X_prewhite), len(Y_prewhite))
    X_prewhite = X_prewhite[len(X_prewhite) - min_len:]
    Y_prewhite = Y_prewhite[len(Y_prewhite) - min_len:]

    return X_prewhite, Y_prewhite

In [4]:
def ccf(x, y, lag_max = 6):

    """
    Two-sided cross-correlation between x and y, laid out like R's ccf().

    Args:
    - x (array-like or pd.Series): first time series (the topic series in this notebook)
    - y (array-like or pd.Series): second time series (the outcome series in this notebook)
    - lag_max (int): maximum lag, in samples, on each side of lag 0

    Returns:
    - ccf_output (np.ndarray): the reversed smt.ccf(y, x) values for lags
      lag_max..1, followed by the smt.ccf(x, y) values for lags 0..lag_max.
      When both inputs are long enough this has 2 * lag_max + 1 entries and
      lag 0 sits at index lag_max.
      NOTE(review): by statsmodels' definition of ccf, the first half should
      correspond to x leading y — confirm against the statsmodels docs.
    """

    n_lags = lag_max + 1

    # `adjusted` is the current name of the legacy `unbiased` keyword
    # (renamed in statsmodels 0.12); False keeps the same 1/n normalisation.
    backwards = smt.ccf(y, x, adjusted=False, nlags=n_lags)[::-1]
    forwards = smt.ccf(x, y, adjusted=False, nlags=n_lags)

    # Drop the last element of the reversed half: it is lag 0, which the
    # forward half already contributes.
    ccf_output = np.r_[backwards[:-1], forwards]

    return ccf_output

In [5]:
def cum_r(x, y, lag_max = 6):
    """
    Cross-correlate x and y on the raw and on the first-differenced series.

    Args:
    - x, y (pd.Series): time series to compare
    - lag_max (int): maximum lag passed through to ccf

    Returns:
    - (comb, trend, detrend): `trend` is ccf of the raw series, `detrend`
      is ccf of the first differences (leading NaN dropped), and `comb` is
      the element-wise average of the two.
    """
    x_diff = x.diff().dropna()
    y_diff = y.diff().dropna()

    trend = ccf(x, y, lag_max)
    detrend = ccf(x_diff, y_diff, lag_max)

    return (trend + detrend)/2, trend, detrend

# Topic Age Corr

In [6]:
# Topic-rate table used by every correlation cell below; the columns named
# "0", "1", ... are read per topic, and "Month Code" is the merge key.
tpc7_norm = pd.read_csv(filepath_or_buffer="./data/topic_norms2.csv")
tpc7_norm

Unnamed: 0,group_id,0,1,10,100,101,102,103,104,105,...,94,95,96,97,98,99,month,month_str,year,Month Code
0,12_2020,0.0,0.001587,0.0,0.0,0.0,0.003819,0.0,0.0,0.000972,...,0.000939,0.0,0.0,0.002057,0.008617,0.0,12,12,2020,2020/12
1,1_2021,0.001447,0.001786,0.001284,0.00177,0.001845,0.001761,0.002165,0.001946,0.003465,...,0.003066,0.001856,0.001837,0.002593,0.002081,0.001056,1,1,2021,2021/01
2,2_2021,0.002392,0.00171,0.001697,0.001766,0.001844,0.001751,0.002028,0.001719,0.002499,...,0.002365,0.001619,0.001724,0.00377,0.002006,0.001492,2,2,2021,2021/02
3,3_2021,0.001575,0.001816,0.001321,0.002326,0.002201,0.002283,0.003623,0.001679,0.007434,...,0.003712,0.001946,0.001295,0.001546,0.00203,0.001485,3,3,2021,2021/03
4,4_2021,0.001103,0.001427,0.001174,0.00173,0.002669,0.00252,0.003153,0.002088,0.002516,...,0.002221,0.001559,0.001073,0.001579,0.002358,0.001332,4,4,2021,2021/04
5,5_2021,0.000923,0.00154,0.001151,0.001752,0.002691,0.003066,0.002517,0.002172,0.00237,...,0.002225,0.001696,0.001145,0.001482,0.002016,0.001429,5,5,2021,2021/05
6,6_2021,0.001821,0.002116,0.001634,0.002161,0.001928,0.001814,0.002804,0.001946,0.003266,...,0.002322,0.001919,0.001413,0.001777,0.001754,0.001465,6,6,2021,2021/06
7,7_2021,0.001596,0.002081,0.000926,0.001691,0.001567,0.002017,0.00175,0.001897,0.002855,...,0.002823,0.002589,0.016589,0.002204,0.001415,0.001546,7,7,2021,2021/07
8,8_2021,0.001055,0.001847,0.001609,0.002242,0.002498,0.001208,0.001896,0.002098,0.004109,...,0.002936,0.002005,0.001406,0.002301,0.002971,0.00114,8,8,2021,2021/08
9,9_2021,0.001264,0.002044,0.001282,0.002044,0.002532,0.002875,0.002612,0.002194,0.003903,...,0.002845,0.001687,0.001419,0.001955,0.002209,0.001529,9,9,2021,2021/09


## Topic Age Corr - Age Range

In [34]:
# age_range_df = pd.read_csv("./other_outcomes_norm.csv")
# age_range_cols = age_range_df.loc[:, 'accident_15_19_norm':'heart_attack_65_69_norm'].columns.tolist()

# print(age_range_cols)
# age_range_df = age_range_df[["Month Code"]+age_range_cols]
# age_range_df

['accident_15_19_norm', 'accident_20_24_norm', 'accident_25_29_norm', 'accident_30_34_norm', 'accident_35_39_norm', 'accident_40_44_norm', 'accident_45_49_norm', 'accident_50_54_norm', 'accident_55_59_norm', 'accident_60_64_norm', 'accident_65_69_norm', 'assault_15_19_norm', 'assault_20_24_norm', 'assault_25_29_norm', 'assault_30_34_norm', 'assault_35_39_norm', 'assault_40_44_norm', 'assault_45_49_norm', 'assault_50_54_norm', 'assault_55_59_norm', 'assault_60_64_norm', 'assault_65_69_norm', 'cancer_15_19_norm', 'cancer_20_24_norm', 'cancer_25_29_norm', 'cancer_30_34_norm', 'cancer_35_39_norm', 'cancer_40_44_norm', 'cancer_45_49_norm', 'cancer_50_54_norm', 'cancer_55_59_norm', 'cancer_60_64_norm', 'cancer_65_69_norm', 'heart_attack_25_29_norm', 'heart_attack_30_34_norm', 'heart_attack_35_39_norm', 'heart_attack_40_44_norm', 'heart_attack_45_49_norm', 'heart_attack_50_54_norm', 'heart_attack_55_59_norm', 'heart_attack_60_64_norm', 'heart_attack_65_69_norm']


Unnamed: 0,Month Code,accident_15_19_norm,accident_20_24_norm,accident_25_29_norm,accident_30_34_norm,accident_35_39_norm,accident_40_44_norm,accident_45_49_norm,accident_50_54_norm,accident_55_59_norm,...,cancer_65_69_norm,heart_attack_25_29_norm,heart_attack_30_34_norm,heart_attack_35_39_norm,heart_attack_40_44_norm,heart_attack_45_49_norm,heart_attack_50_54_norm,heart_attack_55_59_norm,heart_attack_60_64_norm,heart_attack_65_69_norm
0,2021/01,0.018081,0.023806,0.021697,0.017177,0.012958,0.010547,0.008438,0.010547,0.012657,...,2.137142,0.003616,0.005123,0.016875,0.043092,0.065693,0.118429,0.203408,0.302852,0.385722
1,2021/02,0.014769,0.024414,0.015372,0.014467,0.009645,0.010248,0.006631,0.008439,0.012056,...,1.919033,0.003466,0.006932,0.018084,0.033456,0.059376,0.110615,0.188377,0.273373,0.332448
2,2021/03,0.013562,0.019892,0.023508,0.019289,0.014768,0.011754,0.010247,0.009042,0.009644,...,2.088924,0.003315,0.007233,0.012658,0.031646,0.063593,0.100362,0.18023,0.277277,0.315252
3,2021/04,0.01356,0.025613,0.017779,0.019285,0.013259,0.008437,0.008739,0.00904,0.011149,...,1.995114,0.004219,0.008437,0.016272,0.035858,0.050925,0.109986,0.185319,0.265473,0.309768
4,2021/05,0.023499,0.028018,0.025005,0.016871,0.018076,0.015064,0.007532,0.011448,0.00723,...,2.115222,0.003917,0.007833,0.016269,0.033441,0.056639,0.109964,0.185281,0.265118,0.298559
5,2021/06,0.022891,0.023795,0.020181,0.021385,0.012349,0.010542,0.009639,0.009639,0.014458,...,2.063245,0.003715,0.008434,0.015964,0.030723,0.053915,0.099096,0.187951,0.269879,0.284939
6,2021/07,0.017164,0.024691,0.02168,0.015658,0.013249,0.01355,0.010238,0.015056,0.010539,...,2.121351,0.003513,0.011442,0.014755,0.031015,0.062632,0.110509,0.177055,0.257152,0.304427
7,2021/08,0.020162,0.017153,0.019861,0.019861,0.012338,0.012338,0.011435,0.01294,0.01294,...,2.17087,0.00331,0.009931,0.018055,0.036713,0.06169,0.104722,0.182962,0.277753,0.330415
8,2021/09,0.018046,0.020152,0.021656,0.018347,0.015039,0.012332,0.009625,0.011429,0.008722,...,2.053074,0.004512,0.009925,0.019249,0.040604,0.061658,0.117001,0.166929,0.263477,0.294757
9,2021/10,0.020142,0.021946,0.019541,0.015933,0.015633,0.011424,0.007516,0.013829,0.011123,...,2.164248,0.003608,0.007516,0.015032,0.033069,0.0475,0.113338,0.184588,0.26666,0.325584


In [7]:
# Load the age-range outcome table and keep only the month key plus the
# contiguous block of outcome columns between the two boundary labels
# (label slicing with .loc is inclusive at both ends).
age_range_df = pd.read_csv(filepath_or_buffer="./data/wonders_death_census_age_range.csv")
age_range_cols = list(
    age_range_df.loc[:, 'cocaine_20_24_norm':'syn_opioid_65_69_norm'].columns
)

print(age_range_cols)
age_range_df = age_range_df.loc[:, ["Month Code", *age_range_cols]]
age_range_df

['cocaine_20_24_norm', 'cocaine_25_29_norm', 'cocaine_30_34_norm', 'cocaine_35_39_norm', 'cocaine_40_44_norm', 'cocaine_45_49_norm', 'cocaine_50_54_norm', 'cocaine_55_59_norm', 'cocaine_60_64_norm', 'cocaine_65_69_norm', 'heroin_30_34_norm', 'heroin_35_39_norm', 'heroin_40_44_norm', 'heroin_45_49_norm', 'heroin_50_54_norm', 'heroin_55_59_norm', 'heroin_60_64_norm', 'methadone_30_34_norm', 'methadone_35_39_norm', 'methadone_40_44_norm', 'methadone_45_49_norm', 'methadone_50_54_norm', 'methadone_55_59_norm', 'methadone_60_64_norm', 'nat_opioid_25_29_norm', 'nat_opioid_30_34_norm', 'nat_opioid_35_39_norm', 'nat_opioid_40_44_norm', 'nat_opioid_45_49_norm', 'nat_opioid_50_54_norm', 'nat_opioid_55_59_norm', 'nat_opioid_60_64_norm', 'nat_opioid_65_69_norm', 'nat_opioid_70_74_norm', 'syn_opioid_15_19_norm', 'syn_opioid_20_24_norm', 'syn_opioid_25_29_norm', 'syn_opioid_30_34_norm', 'syn_opioid_35_39_norm', 'syn_opioid_40_44_norm', 'syn_opioid_45_49_norm', 'syn_opioid_50_54_norm', 'syn_opioid_55

Unnamed: 0,Month Code,cocaine_20_24_norm,cocaine_25_29_norm,cocaine_30_34_norm,cocaine_35_39_norm,cocaine_40_44_norm,cocaine_45_49_norm,cocaine_50_54_norm,cocaine_55_59_norm,cocaine_60_64_norm,...,syn_opioid_20_24_norm,syn_opioid_25_29_norm,syn_opioid_30_34_norm,syn_opioid_35_39_norm,syn_opioid_40_44_norm,syn_opioid_45_49_norm,syn_opioid_50_54_norm,syn_opioid_55_59_norm,syn_opioid_60_64_norm,syn_opioid_65_69_norm
0,2021/01,0.025313,0.045202,0.062981,0.070816,0.059365,0.059666,0.068405,0.075336,0.050626,...,0.12325,0.206422,0.262773,0.243186,0.179602,0.154892,0.154892,0.144043,0.112101,0.038271
1,2021/02,0.022002,0.043703,0.057568,0.058171,0.057869,0.054855,0.061486,0.058171,0.04521,...,0.106998,0.188377,0.22123,0.222435,0.176019,0.134426,0.138947,0.119958,0.087407,0.037374
2,2021/03,0.03255,0.050332,0.071429,0.084992,0.077457,0.071429,0.072333,0.080772,0.055757,...,0.137433,0.229357,0.273661,0.265222,0.231768,0.184148,0.174805,0.157626,0.115733,0.047017
3,2021/04,0.025914,0.056048,0.072018,0.080154,0.084373,0.07473,0.080455,0.091605,0.056952,...,0.13138,0.213644,0.270294,0.262761,0.234436,0.1811,0.187127,0.183511,0.117218,0.047912
4,2021/05,0.031031,0.048505,0.071702,0.074715,0.084356,0.072606,0.077427,0.083452,0.04971,...,0.124425,0.206069,0.280483,0.266323,0.228664,0.174436,0.183474,0.168712,0.10665,0.050312
5,2021/06,0.024096,0.048494,0.063253,0.075903,0.076205,0.071385,0.078313,0.083735,0.049397,...,0.119578,0.203312,0.258433,0.253011,0.220782,0.176505,0.164457,0.17771,0.106927,0.049096
6,2021/07,0.023788,0.055405,0.078591,0.078892,0.081,0.067751,0.08371,0.080699,0.055104,...,0.115327,0.206564,0.273713,0.263776,0.235471,0.180367,0.174044,0.154472,0.115327,0.052394
7,2021/08,0.02618,0.052963,0.072523,0.081852,0.080347,0.070717,0.088171,0.084861,0.056273,...,0.119768,0.217569,0.26692,0.262707,0.236527,0.179953,0.182661,0.153772,0.117662,0.046643
8,2021/09,0.022257,0.056846,0.069779,0.075193,0.071885,0.067674,0.084818,0.082412,0.057748,...,0.103466,0.201818,0.264981,0.263778,0.224677,0.175651,0.16332,0.181366,0.113091,0.045116
9,2021/10,0.022848,0.051709,0.074256,0.079968,0.073955,0.058022,0.083576,0.091993,0.059525,...,0.104319,0.187895,0.271471,0.268164,0.230885,0.158433,0.176771,0.162642,0.112136,0.050206


In [8]:
years = [2022, 2023, 2024]

# Join outcomes and topic rates on the month key, keep the selected years,
# then cut the frame at row label 41. `.loc[:41]` is label-based and
# inclusive, and the labels are those of the merged frame (not positions in
# the filtered frame).
# NOTE(review): 41 is a hard-coded cut-off — confirm which month it is meant
# to end on.
age_range_tpc7_df = (
    age_range_df
    .merge(tpc7_norm, on='Month Code', how='inner')
    .loc[lambda merged: merged["year"].isin(years)]
    .loc[:41]
)

age_range_tpc7_df

Unnamed: 0,Month Code,cocaine_20_24_norm,cocaine_25_29_norm,cocaine_30_34_norm,cocaine_35_39_norm,cocaine_40_44_norm,cocaine_45_49_norm,cocaine_50_54_norm,cocaine_55_59_norm,cocaine_60_64_norm,...,93,94,95,96,97,98,99,month,month_str,year
12,2022/01,0.022818,0.047437,0.083464,0.084365,0.082564,0.056744,0.070554,0.078961,0.062748,...,0.002038,0.003075,0.001945,0.001093,0.001215,0.001923,0.001337,1,1,2022
13,2022/02,0.024614,0.049828,0.071441,0.076543,0.075943,0.067538,0.093653,0.078044,0.060034,...,0.002282,0.002311,0.001734,0.001652,0.002375,0.001896,0.00211,2,2,2022
14,2022/03,0.025805,0.05341,0.073514,0.088517,0.088517,0.081615,0.081615,0.085816,0.063912,...,0.0028,0.002724,0.001892,0.00156,0.002081,0.001988,0.002052,3,3,2022
15,2022/04,0.020993,0.054281,0.080672,0.090568,0.076173,0.070475,0.08397,0.08427,0.062078,...,0.002665,0.002741,0.001796,0.00206,0.001948,0.001987,0.001506,4,4,2022
16,2022/05,0.024278,0.049455,0.07793,0.081227,0.08812,0.074932,0.091417,0.08842,0.060545,...,0.002109,0.002658,0.001738,0.001352,0.001669,0.001967,0.001788,5,5,2022
17,2022/06,0.023366,0.054521,0.078186,0.081781,0.081781,0.073094,0.073393,0.09616,0.061111,...,0.002032,0.002441,0.001808,0.001472,0.001518,0.001918,0.001868,6,6,2022
18,2022/07,0.022155,0.054189,0.076044,0.089516,0.085924,0.075745,0.087421,0.090714,0.069457,...,0.002231,0.002362,0.001828,0.001457,0.001682,0.001836,0.001787,7,7,2022
19,2022/08,0.020343,0.054447,0.077781,0.089449,0.09603,0.069704,0.081072,0.092739,0.075987,...,0.001902,0.00259,0.001729,0.001179,0.001337,0.002292,0.001674,8,8,2022
20,2022/09,0.02481,0.045137,0.075627,0.085491,0.081007,0.070844,0.093263,0.084893,0.074132,...,0.001772,0.002465,0.001854,0.001245,0.001702,0.00203,0.001793,9,9,2022
21,2022/10,0.025389,0.058245,0.077362,0.089907,0.083933,0.069596,0.090504,0.101556,0.071089,...,0.001824,0.002194,0.001709,0.001274,0.001784,0.001999,0.002103,10,10,2022


In [9]:
# Substring used to pick outcome columns; "norm" matches every column in
# age_range_cols, a narrower string (e.g. a drug prefix) selects a subset.
drug_name = "norm"
drug_cols = [col for col in age_range_cols if drug_name in col]

print(len(drug_cols))
print(drug_cols)

45
['cocaine_20_24_norm', 'cocaine_25_29_norm', 'cocaine_30_34_norm', 'cocaine_35_39_norm', 'cocaine_40_44_norm', 'cocaine_45_49_norm', 'cocaine_50_54_norm', 'cocaine_55_59_norm', 'cocaine_60_64_norm', 'cocaine_65_69_norm', 'heroin_30_34_norm', 'heroin_35_39_norm', 'heroin_40_44_norm', 'heroin_45_49_norm', 'heroin_50_54_norm', 'heroin_55_59_norm', 'heroin_60_64_norm', 'methadone_30_34_norm', 'methadone_35_39_norm', 'methadone_40_44_norm', 'methadone_45_49_norm', 'methadone_50_54_norm', 'methadone_55_59_norm', 'methadone_60_64_norm', 'nat_opioid_25_29_norm', 'nat_opioid_30_34_norm', 'nat_opioid_35_39_norm', 'nat_opioid_40_44_norm', 'nat_opioid_45_49_norm', 'nat_opioid_50_54_norm', 'nat_opioid_55_59_norm', 'nat_opioid_60_64_norm', 'nat_opioid_65_69_norm', 'nat_opioid_70_74_norm', 'syn_opioid_15_19_norm', 'syn_opioid_20_24_norm', 'syn_opioid_25_29_norm', 'syn_opioid_30_34_norm', 'syn_opioid_35_39_norm', 'syn_opioid_40_44_norm', 'syn_opioid_45_49_norm', 'syn_opioid_50_54_norm', 'syn_opioid

In [10]:
# Preview the selected outcome columns alongside the month key.
age_range_tpc7_df[["Month Code", *drug_cols]]

Unnamed: 0,Month Code,cocaine_20_24_norm,cocaine_25_29_norm,cocaine_30_34_norm,cocaine_35_39_norm,cocaine_40_44_norm,cocaine_45_49_norm,cocaine_50_54_norm,cocaine_55_59_norm,cocaine_60_64_norm,...,syn_opioid_20_24_norm,syn_opioid_25_29_norm,syn_opioid_30_34_norm,syn_opioid_35_39_norm,syn_opioid_40_44_norm,syn_opioid_45_49_norm,syn_opioid_50_54_norm,syn_opioid_55_59_norm,syn_opioid_60_64_norm,syn_opioid_65_69_norm
12,2022/01,0.022818,0.047437,0.083464,0.084365,0.082564,0.056744,0.070554,0.078961,0.062748,...,0.104781,0.199654,0.268407,0.266306,0.239885,0.170832,0.151017,0.166929,0.106582,0.052841
13,2022/02,0.024614,0.049828,0.071441,0.076543,0.075943,0.067538,0.093653,0.078044,0.060034,...,0.108962,0.189107,0.249142,0.250642,0.224828,0.164193,0.1756,0.153987,0.115566,0.05193
14,2022/03,0.025805,0.05341,0.073514,0.088517,0.088517,0.081615,0.081615,0.085816,0.063912,...,0.113421,0.215441,0.274552,0.26525,0.244246,0.197137,0.171032,0.169832,0.119122,0.062412
15,2022/04,0.020993,0.054281,0.080672,0.090568,0.076173,0.070475,0.08397,0.08427,0.062078,...,0.099865,0.192532,0.264507,0.253711,0.223721,0.17094,0.172139,0.159244,0.116059,0.051582
16,2022/05,0.024278,0.049455,0.07793,0.081227,0.08812,0.074932,0.091417,0.08842,0.060545,...,0.097112,0.185532,0.246378,0.25507,0.237985,0.18793,0.190328,0.161554,0.118093,0.057848
17,2022/06,0.023366,0.054521,0.078186,0.081781,0.081781,0.073094,0.073393,0.09616,0.061111,...,0.09646,0.180338,0.263317,0.257925,0.235158,0.177342,0.162064,0.176743,0.12402,0.053922
18,2022/07,0.022155,0.054189,0.076044,0.089516,0.085924,0.075745,0.087421,0.090714,0.069457,...,0.108078,0.196696,0.276932,0.272441,0.264358,0.181128,0.182625,0.167656,0.132927,0.057482
19,2022/08,0.020343,0.054447,0.077781,0.089449,0.09603,0.069704,0.081072,0.092739,0.075987,...,0.096629,0.19924,0.268047,0.281808,0.251593,0.175607,0.183983,0.165136,0.132827,0.063123
20,2022/09,0.02481,0.045137,0.075627,0.085491,0.081007,0.070844,0.093263,0.084893,0.074132,...,0.092067,0.170982,0.254082,0.256772,0.239136,0.178754,0.185031,0.181146,0.118671,0.060382
21,2022/10,0.025389,0.058245,0.077362,0.089907,0.083933,0.069596,0.090504,0.101556,0.071089,...,0.102452,0.208488,0.270019,0.281071,0.238955,0.183099,0.189073,0.176229,0.145763,0.063622


In [11]:
from tqdm import tqdm

In [12]:
# Maximum lag (in months) for every cross-correlation in this cell. The
# two-sided ccf output has 2 * LAG_MAX + 1 entries with lag 0 at index LAG_MAX.
LAG_MAX = 6

data = []

# NOTE(review): assumes topic columns "0" .. "199" exist in the merged frame.
for tpc_num in tqdm(range(200)):
    # Topic-level quantities do not depend on the outcome column, so they are
    # computed once per topic rather than once per (topic, outcome) pair.
    tpc_series = age_range_tpc7_df[str(tpc_num)]
    tpc_series_diff = tpc_series.diff().dropna()
    # adfuller(...)[1] is the p-value of the ADF test on the differenced topic series.
    stationary_pvalue = adfuller(tpc_series_diff)[1]

    for drug_age_range in drug_cols:
        outcome_series = age_range_tpc7_df[drug_age_range]
        outcome_diff = outcome_series.diff().dropna()

        # Cross-correlation of the prewhitened, first-differenced series.
        tpc_pw, outcome_pw = prewhiten(tpc_series_diff.values, outcome_diff.values, 2)
        pw_ccf_result = ccf(tpc_pw, outcome_pw, LAG_MAX)

        mean, trend, detrend = cum_r(tpc_series, outcome_series, LAG_MAX)
        # Only the first LAG_MAX + 1 entries (up to and including lag 0) are
        # searched, and the most positive averaged correlation is kept.
        max_id = mean[:LAG_MAX + 1].argmax()

        # Column names look like "<drug>_<age_lo>_<age_hi>_norm".
        ls = drug_age_range.split("_")
        age_range = f"{ls[-3]}_{ls[-2]}"
        d_name = "_".join(ls[:-3])

        data.append({
            "topic": tpc_num,
            "drug_age_range_list": drug_age_range,
            "drug_name": d_name,
            "age": age_range,
            "mean_r": mean[max_id],
            "trend_r": trend[max_id],
            "detrend_r": detrend[max_id],
            # Distance of the selected index from lag 0 (0 .. LAG_MAX).
            "lead": LAG_MAX - max_id,
            "adf_pvalue": stationary_pvalue,
            "pw_r": pw_ccf_result[max_id]
        })

corr_age_range_df = pd.DataFrame(data)
# corr_age_range_df.to_csv("./tpc_age_range_corr_OTHER_OUTCOMES.csv", index=False)
corr_age_range_df.to_csv("./data/tpc_age_range_corr.csv", index=False)
corr_age_range_df

100%|██████████| 200/200 [03:28<00:00,  1.04s/it]


Unnamed: 0,topic,drug_age_range_list,drug_name,age,mean_r,trend_r,detrend_r,lead,adf_pvalue,pw_r
0,0,cocaine_20_24_norm,cocaine,20_24,0.040617,-0.148660,0.229895,0,8.991104e-01,0.211319
1,0,cocaine_25_29_norm,cocaine,25_29,-0.032612,-0.258565,0.193342,4,8.991104e-01,0.124425
2,0,cocaine_30_34_norm,cocaine,30_34,0.141047,-0.001690,0.283784,4,8.991104e-01,0.355866
3,0,cocaine_35_39_norm,cocaine,35_39,0.120500,0.041419,0.199580,1,8.991104e-01,0.228541
4,0,cocaine_40_44_norm,cocaine,40_44,0.119353,0.063493,0.175212,2,8.991104e-01,0.244621
...,...,...,...,...,...,...,...,...,...,...
8995,199,syn_opioid_45_49_norm,syn_opioid,45_49,-0.079536,-0.178419,0.019347,0,4.797703e-10,-0.074922
8996,199,syn_opioid_50_54_norm,syn_opioid,50_54,0.068901,-0.116174,0.253975,1,4.797703e-10,0.425572
8997,199,syn_opioid_55_59_norm,syn_opioid,55_59,-0.061909,-0.183338,0.059520,1,4.797703e-10,0.069090
8998,199,syn_opioid_60_64_norm,syn_opioid,60_64,0.086611,-0.092600,0.265821,1,4.797703e-10,0.136513


## Topic Age Corr - All Ages

In [7]:
# Load the all-ages outcome table and keep the month key plus one
# "<drug>_norm" column per drug listed below.
deaths_df = pd.read_csv(filepath_or_buffer="./data/wonders_death_census_merge.csv")

drug_names = ["heroin", "nat_opioid", "methadone",  "syn_opioid", "cocaine", "unspecified", "cannabis"]
drug_names_cols = [f"{name}_norm" for name in drug_names]

deaths_df = deaths_df.loc[:, ["Month Code", *drug_names_cols]]
deaths_df

Unnamed: 0,Month Code,heroin_norm,nat_opioid_norm,methadone_norm,syn_opioid_norm,cocaine_norm,unspecified_norm,cannabis_norm
0,2021/01,0.284169,0.35619,0.10306,1.670358,0.55086,0.049421,0.032545
1,2021/02,0.226655,0.324912,0.094038,1.473861,0.484958,0.034059,0.031647
2,2021/03,0.294457,0.374927,0.102773,1.879459,0.63171,0.036769,0.03044
3,2021/04,0.273307,0.382992,0.100343,1.889949,0.666545,0.039776,0.033146
4,2021/05,0.245234,0.366044,0.09731,1.855526,0.634777,0.037358,0.031935
5,2021/06,0.217469,0.346384,0.087048,1.788849,0.610841,0.039759,0.026205
6,2021/07,0.241494,0.347486,0.087022,1.829571,0.640771,0.038242,0.0271
7,2021/08,0.225393,0.34486,0.091481,1.850687,0.655414,0.03581,0.027986
8,2021/09,0.206631,0.335662,0.085419,1.798921,0.629517,0.036995,0.026769
9,2021/10,0.198417,0.307847,0.081171,1.778237,0.634634,0.029161,0.030364


In [8]:
years = [2022,2023, 2024]

# Join all-ages outcomes and topic rates on the month key, keep the selected
# years, then cut the frame at row label 41. `.loc[:41]` is label-based and
# inclusive, and the labels are those of the merged frame.
# NOTE(review): 41 is a hard-coded cut-off — confirm which month it is meant
# to end on.
death_tpc7_df = (
    deaths_df
    .merge(tpc7_norm, on='Month Code', how='inner')
    .loc[lambda merged: merged["year"].isin(years)]
    .loc[:41]
)
death_tpc7_df

Unnamed: 0,Month Code,heroin_norm,nat_opioid_norm,methadone_norm,syn_opioid_norm,cocaine_norm,unspecified_norm,cannabis_norm,group_id,0,...,93,94,95,96,97,98,99,month,month_str,year
12,2022/01,0.183742,0.317045,0.093973,1.789981,0.638293,0.031825,0.034226,1_2022,0.001161,...,0.002038,0.003075,0.001945,0.001093,0.001215,0.001923,0.001337,1,1,2022
13,2022/02,0.164193,0.297769,0.071741,1.740089,0.636662,0.028516,0.031218,2_2022,0.001465,...,0.002282,0.002311,0.001734,0.001652,0.002375,0.001896,0.00211,2,2,2022
14,2022/03,0.171032,0.311159,0.076214,1.893357,0.69013,0.032106,0.033906,3_2022,0.001615,...,0.0028,0.002724,0.001892,0.00156,0.002081,0.001988,0.002052,3,3,2022
15,2022/04,0.152346,0.292697,0.074974,1.76218,0.669365,0.031489,0.02789,4_2022,0.001384,...,0.002665,0.002741,0.001796,0.00206,0.001948,0.001987,0.001506,4,4,2022
16,2022/05,0.146568,0.297332,0.084224,1.802572,0.683383,0.029373,0.026376,5_2022,0.00154,...,0.002109,0.002658,0.001738,0.001352,0.001669,0.001967,0.001788,5,5,2022
17,2022/06,0.145289,0.296868,0.082081,1.784505,0.669826,0.02756,0.027859,6_2022,0.001642,...,0.002032,0.002441,0.001808,0.001472,0.001518,0.001918,0.001868,6,6,2022
18,2022/07,0.148495,0.31675,0.088319,1.903494,0.694275,0.025448,0.028741,7_2022,0.001832,...,0.002231,0.002362,0.001828,0.001457,0.001682,0.001836,0.001787,7,7,2022
19,2022/08,0.140605,0.282407,0.090645,1.891885,0.712,0.028121,0.029617,8_2022,0.001907,...,0.001902,0.00259,0.001729,0.001179,0.001337,0.002292,0.001674,8,8,2022
20,2022/09,0.130329,0.295333,0.074132,1.7992,0.691402,0.020625,0.028397,9_2022,0.001923,...,0.001772,0.002465,0.001854,0.001245,0.001702,0.00203,0.001793,9,9,2022
21,2022/10,0.124854,0.295408,0.085128,1.925083,0.721346,0.026882,0.028077,10_2022,0.001798,...,0.001824,0.002194,0.001709,0.001274,0.001784,0.001999,0.002103,10,10,2022


In [9]:
# Maximum lag (in months) for every cross-correlation in this cell. The
# two-sided ccf output has 2 * LAG_MAX + 1 entries with lag 0 at index LAG_MAX.
LAG_MAX = 6

data = []

# NOTE(review): assumes topic columns "0" .. "199" exist in the merged frame.
for tpc_num in range(200):
    # Topic-level quantities do not depend on the outcome column, so they are
    # computed once per topic rather than once per (topic, outcome) pair.
    tpc_series = death_tpc7_df[str(tpc_num)]
    # This cell differences twice before the ADF test and prewhitening.
    tpc_series_diff = tpc_series.diff().diff().dropna()
    # adfuller(...)[1] is the p-value of the ADF test on that series.
    stationary_pvalue = adfuller(tpc_series_diff)[1]

    for drug_col in drug_names_cols:
        outcome_series = death_tpc7_df[drug_col]
        outcome_diff = outcome_series.diff().diff().dropna()

        # Cross-correlation of the prewhitened, twice-differenced series.
        tpc_pw, outcome_pw = prewhiten(tpc_series_diff.values, outcome_diff.values, 2)
        pw_ccf_result = ccf(tpc_pw, outcome_pw, LAG_MAX)

        mean, trend, detrend = cum_r(tpc_series, outcome_series, LAG_MAX)

        # Among the first LAG_MAX + 1 entries (up to and including lag 0),
        # keep the averaged correlation with the largest magnitude, positive
        # or negative; on an exact tie the minimum is chosen.
        max_id = mean[:LAG_MAX + 1].argmax()
        min_id = mean[:LAG_MAX + 1].argmin()
        id_ = max_id if abs(mean[max_id]) > abs(mean[min_id]) else min_id

        # Column names look like "<drug>_norm".
        ls = drug_col.split("_")
        d_name = "_".join(ls[:-1])

        data.append({
            "topic": tpc_num,
            "drug_name": d_name,
            "mean_r": mean[id_],
            "trend_r": trend[id_],
            "detrend_r": detrend[id_],
            # Distance of the selected index from lag 0 (0 .. LAG_MAX).
            "lead": LAG_MAX - id_,
            "adf_pvalue": stationary_pvalue,
            "pw_r": pw_ccf_result[id_]
        })

corr_age_range_df = pd.DataFrame(data)
corr_age_range_df.to_csv("./data/tpc_age_all_corr.csv", index=False)
corr_age_range_df

Unnamed: 0,topic,drug_name,mean_r,trend_r,detrend_r,lead,adf_pvalue,pw_r
0,0,heroin,-0.432021,-0.522721,-0.341321,0,1.378577e-02,-0.235107
1,0,nat_opioid,-0.374593,-0.498043,-0.251143,3,1.378577e-02,-0.190204
2,0,methadone,0.441878,0.459731,0.424024,1,1.378577e-02,0.547114
3,0,syn_opioid,-0.301046,-0.353062,-0.249030,3,1.378577e-02,-0.395366
4,0,cocaine,-0.224596,-0.173549,-0.275642,3,1.378577e-02,-0.397459
...,...,...,...,...,...,...,...,...
1395,199,methadone,-0.281222,-0.378563,-0.183881,6,1.180586e-07,-0.255044
1396,199,syn_opioid,-0.243248,-0.369877,-0.116620,3,1.180586e-07,-0.150846
1397,199,cocaine,-0.201226,-0.434900,0.032449,6,1.180586e-07,-0.147857
1398,199,unspecified,-0.438384,-0.460865,-0.415902,3,1.180586e-07,-0.508237


In [12]:
# Synthetic-opioid rows ranked by raw-series correlation (ties broken by the
# differenced-series correlation), then restricted to topics where both are
# clearly positive.
a = (
    corr_age_range_df
    .loc[lambda df: df["drug_name"]=="syn_opioid"]
    .sort_values(by=["trend_r", "detrend_r"], ascending=False)
)
a.loc[lambda df: (df["trend_r"]>0.4) & (df["detrend_r"]>0.2)]

Unnamed: 0,topic,drug_name,mean_r,trend_r,detrend_r,lead,adf_pvalue,pw_r
1389,198,syn_opioid,0.411449,0.5892,0.233698,3,0.00926211,0.32144
1165,166,syn_opioid,0.496727,0.560737,0.432717,3,1.569191e-07,0.525543
199,28,syn_opioid,0.382307,0.532758,0.231856,3,0.3186993,0.166407
528,75,syn_opioid,0.373854,0.527203,0.220504,0,0.03170688,-0.048112
724,103,syn_opioid,0.410471,0.523016,0.297925,3,0.0006168281,0.300553
864,123,syn_opioid,0.44099,0.512893,0.369086,3,0.02615769,0.287306
1326,189,syn_opioid,0.44143,0.496049,0.38681,3,1.183977e-07,0.318154
59,8,syn_opioid,0.483587,0.490385,0.476789,3,0.009805029,0.593926
990,141,syn_opioid,0.355198,0.485108,0.225287,3,0.0006610749,0.213383
1277,182,syn_opioid,0.497773,0.470928,0.524618,3,3.532622e-05,0.463523
