In [1]:
import polars as pl
import hvplot.polars

In [2]:
df_news = pl.read_csv("financial_sentiment_analysis.csv")
df_news = df_news.unique()

In [3]:
df = pl.read_csv('data/processed/final_news_data.csv')
df = df.unique()

In [4]:
df

ITI(13D),ITI(impatient),ITI(patient),ITI(insider),ITI(short),date,permco,ret,prc,vol,on_rdq,vol_missing_flag,comnam,Article_title
f64,f64,f64,f64,f64,str,i64,f64,f64,f64,i64,i64,str,str
0.321933,0.484775,0.138845,0.356138,0.3709567,"""2018-04-02""",20330,-0.022887,20.92,229361.0,0,0,"""BRIGGS & STRATTON CORP""",
0.176877,0.2555752,0.040945,0.3436,0.2961639,"""2018-04-12""",21326,-0.011884,34.09,3.010754e6,0,0,"""FIRSTENERGY CORP""",
0.357584,0.471489,0.289649,0.67533,0.444972,"""2018-07-17""",20990,-0.013543,143.49001,5.096741e6,0,0,"""INTERNATIONAL BUSINESS MACHS C…","""Earnings Preview: Legacy Tech …"
0.1766,0.290188,0.483589,0.488859,0.371787,"""2018-08-03""",54818,-0.022222,7.48,61045.0,0,0,"""CORIUM INTERNATIONAL INC""",
0.4779767,0.554258,0.508357,0.7199349,0.4519313,"""2018-09-05""",7195,0.014545,27.9,2.14038e6,0,0,"""SCIENTIFIC GAMES CORP""","""Stocks Which Set New 52-Week L…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.340844,0.632984,0.2138182,0.478544,0.462988,"""2010-04-27""",20327,-0.007111,11.17,822200.0,0,0,"""BOWNE & CO INC""",
0.463524,0.5911203,0.277646,0.3481879,0.430012,"""2014-10-07""",54170,-0.093717,8.51,735422.0,0,0,"""TILE SHOP HOLDINGS INC""",
0.26367,0.530391,0.3389039,0.298766,0.398141,"""2010-12-03""",47216,-0.001603,37.38,76350.0,0,0,"""R B C BEARINGS INC""",
0.199722,0.2551815,0.054845,0.404778,0.314042,"""2012-09-10""",13008,-0.002908,61.72,226300.0,0,0,"""GARDNER DENVER INC""",


In [5]:
df_articles = df.filter(pl.col('Article_title').is_not_null()).select('date', 'comnam', 'Article_title')

In [6]:
news_df = df_articles.join(df_news, left_on=['date', 'comnam', 'Article_title'], right_on=['date', 'comnam', 'Headline'], how='inner').unique()

In [7]:
final_df = df.join(news_df, on=['date', 'comnam', 'Article_title'], how='left').unique(subset=['date', 'comnam', 'Article_title'])

In [8]:
filter_df = final_df.with_columns(
    pl.col("ret").shift(-1).alias("next_day_ret")
).select(['ITI(13D)', 'next_day_ret', 'permco', 'Positive', 'Negative', 'Neutral', 'date']).sort(['permco', 'date']).drop_nulls(subset=['ITI(13D)', 'next_day_ret'])

In [9]:
filter_df = filter_df.with_columns(pl.col('Positive').fill_null(0), pl.col('Negative').fill_null(0), pl.col('Neutral').fill_null(0))

In [10]:
filter_df = filter_df.with_columns(
    (
        (pl.col('Positive') - pl.col('Negative')) 
    ).alias('sentiment_score')
)


In [11]:
filter_df = filter_df.with_columns(
    pl.col("date").str.strptime(pl.Date, "%Y-%m-%d").alias("date")
)

In [12]:
filter_df

ITI(13D),next_day_ret,permco,Positive,Negative,Neutral,date,sentiment_score
f64,f64,i64,f64,f64,f64,date,f64
0.152583,0.018913,7,0.0,0.0,0.0,2009-05-27,0.0
0.261909,-0.009471,7,0.0,0.0,0.0,2009-05-28,0.0
0.096691,0.002885,7,0.0,0.0,0.0,2009-05-29,0.0
0.089552,-0.038889,7,0.0,0.0,0.0,2009-06-01,0.0
0.018411,0.051546,7,0.0,0.0,0.0,2009-06-02,0.0
…,…,…,…,…,…,…,…
0.319228,-0.015441,58620,0.0,0.0,0.0,2009-08-25,0.0
0.114453,-0.022811,58620,0.0,0.0,0.0,2009-08-26,0.0
0.215733,-0.016275,58620,0.0,0.0,0.0,2009-08-27,0.0
0.225597,0.0,58620,0.0,0.0,0.0,2009-08-28,0.0


In [13]:
n = 10 # EMA window size

filter_df = (
    filter_df
    .sort(["permco", "date"])
    .with_columns([
        pl.col("sentiment_score")
        .ewm_mean(alpha=2/(n+1), adjust=False)
        .over("permco")
        .alias(f"ema_sentiment_{n}")
    ])
)

In [14]:
filter_df

ITI(13D),next_day_ret,permco,Positive,Negative,Neutral,date,sentiment_score,ema_sentiment_10
f64,f64,i64,f64,f64,f64,date,f64,f64
0.152583,0.018913,7,0.0,0.0,0.0,2009-05-27,0.0,0.0
0.261909,-0.009471,7,0.0,0.0,0.0,2009-05-28,0.0,0.0
0.096691,0.002885,7,0.0,0.0,0.0,2009-05-29,0.0,0.0
0.089552,-0.038889,7,0.0,0.0,0.0,2009-06-01,0.0,0.0
0.018411,0.051546,7,0.0,0.0,0.0,2009-06-02,0.0,0.0
…,…,…,…,…,…,…,…,…
0.319228,-0.015441,58620,0.0,0.0,0.0,2009-08-25,0.0,0.0
0.114453,-0.022811,58620,0.0,0.0,0.0,2009-08-26,0.0,0.0
0.215733,-0.016275,58620,0.0,0.0,0.0,2009-08-27,0.0,0.0
0.225597,0.0,58620,0.0,0.0,0.0,2009-08-28,0.0,0.0


In [15]:
filter_df = filter_df.filter(
    pl.col("sentiment_score").is_not_null() & (pl.col("sentiment_score") != 0)
)

In [16]:
filter_df.filter(
    pl.col("ema_sentiment_10").is_not_null() & (pl.col("ema_sentiment_10") != 0)
)


ITI(13D),next_day_ret,permco,Positive,Negative,Neutral,date,sentiment_score,ema_sentiment_10
f64,f64,i64,f64,f64,f64,date,f64,f64
0.030262,-0.006728,37,0.148186,0.012093,0.839721,2010-08-04,0.136093,0.024744
0.1824342,0.038868,37,0.023458,0.032603,0.943939,2010-09-10,-0.009145,-0.001529
0.1824342,-0.001397,37,0.032024,0.028116,0.9398598,2010-09-10,0.003908,-0.00054
0.481816,0.002006,37,0.068105,0.288555,0.6433402,2010-09-13,-0.220451,-0.040524
0.509262,0.050674,37,0.7095045,0.007535,0.282961,2010-09-28,0.70197,0.123174
…,…,…,…,…,…,…,…,…
0.082986,0.019886,56668,0.142438,0.012201,0.845361,2019-07-31,0.130237,-0.027577
0.082986,-0.018719,56668,0.035469,0.150262,0.8142691,2019-07-31,-0.114793,-0.043434
0.082986,0.003821,56668,0.069916,0.012613,0.9174706,2019-07-31,0.057303,-0.025118
0.082986,1.72233,56668,0.15948,0.751438,0.089082,2019-07-31,-0.591958,-0.12818


In [17]:
valid_permcos = (
    filter_df.group_by("permco")
    .agg(pl.len().alias("n_obs"))
    .filter(pl.col("n_obs") >= 2)
    .select("permco")
)

filter_df = filter_df.join(valid_permcos, on="permco", how="inner")

In [18]:
filter_df.group_by("permco").agg([
    pl.corr("ema_sentiment_10", "next_day_ret").alias("corr_sentiment_ret"),
    pl.corr("ITI(13D)", "next_day_ret").alias("corr_ITI_ret"),
])

permco,corr_sentiment_ret,corr_ITI_ret
i64,f64,f64
37,-0.013092,0.0212
90,-0.000332,-0.017775
92,0.024549,-0.020852
116,-0.093467,0.086886
137,0.002462,-0.009349
…,…,…
56518,-0.360313,0.045058
56550,-0.0487,-0.035118
56616,0.146708,0.214033
56662,0.029496,0.225311


In [19]:
filter_df.describe()

statistic,ITI(13D),next_day_ret,permco,Positive,Negative,Neutral,date,sentiment_score,ema_sentiment_10
str,f64,f64,f64,f64,f64,f64,str,f64,f64
"""count""",1046012.0,1046012.0,1046012.0,1046012.0,1046012.0,1046012.0,"""1046012""",1046012.0,1046012.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,"""0""",0.0,0.0
"""mean""",0.350079,0.000751,26929.537592,0.276516,0.199291,0.524194,"""2015-09-13 02:24:34.014447""",0.077225,0.047101
"""std""",0.195987,0.034584,18347.572172,0.329548,0.301611,0.369576,,0.512402,0.173318
"""min""",0.000663,-0.928571,37.0,0.00607,0.005645,0.008015,"""2009-05-27""",-0.96954,-0.924943
"""25%""",0.2004462,-0.010582,12305.0,0.034653,0.018544,0.103952,"""2013-09-10""",-0.083208,-0.048086
"""50%""",0.3158928,0.000423,21068.0,0.088214,0.040139,0.628479,"""2015-10-21""",0.033937,0.041859
"""75%""",0.4669854,0.011632,47220.0,0.4989567,0.21436,0.8982955,"""2018-01-22""",0.3848,0.155328
"""max""",0.9997168,3.429348,56668.0,0.961015,0.9773637,0.9589839,"""2019-07-31""",0.946633,0.926176


In [20]:
corr_global = filter_df.select(
    pl.corr("ITI(13D)", "next_day_ret").alias("corr_ITI_ret_global")
)
print(corr_global)

shape: (1, 1)
┌─────────────────────┐
│ corr_ITI_ret_global │
│ ---                 │
│ f64                 │
╞═════════════════════╡
│ 0.000015            │
└─────────────────────┘


In [21]:
corr_by_firm = (
    filter_df
    .group_by("permco")
    .agg(pl.corr("ITI(13D)", "next_day_ret").alias("corr_ITI_ret"))
    .drop_nulls()  # supprime les corrélations NaN
)

# Compter combien d'entreprises ont une corrélation proche de -1, 0, ou 1
corr_distribution = corr_by_firm.select([
    pl.col("corr_ITI_ret").mean().alias("mean_corr"),
    pl.col("corr_ITI_ret").std().alias("std_corr"),
    pl.col("corr_ITI_ret").min().alias("min_corr"),
    pl.col("corr_ITI_ret").max().alias("max_corr"),
    ((pl.col("corr_ITI_ret").abs() < 0.1).sum()).alias("near_zero"),   # |corr| < 0.1
    ((pl.col("corr_ITI_ret") > 0.7).sum()).alias("near_pos1"),         # corr > 0.7
    ((pl.col("corr_ITI_ret") < -0.7).sum()).alias("near_neg1"),        # corr < -0.7
    pl.len().alias("total_firms")
])

print(corr_distribution)



shape: (1, 8)
┌───────────┬──────────┬──────────┬──────────┬───────────┬───────────┬───────────┬─────────────┐
│ mean_corr ┆ std_corr ┆ min_corr ┆ max_corr ┆ near_zero ┆ near_pos1 ┆ near_neg1 ┆ total_firms │
│ ---       ┆ ---      ┆ ---      ┆ ---      ┆ ---       ┆ ---       ┆ ---       ┆ ---         │
│ f64       ┆ f64      ┆ f64      ┆ f64      ┆ u32       ┆ u32       ┆ u32       ┆ u32         │
╞═══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╪═══════════╪═════════════╡
│ NaN       ┆ NaN      ┆ -1.0     ┆ 1.0      ┆ 1693      ┆ 14        ┆ 7         ┆ 2037        │
└───────────┴──────────┴──────────┴──────────┴───────────┴───────────┴───────────┴─────────────┘


In [22]:
filter_df = filter_df.with_columns(pl.col("ITI(13D)").qcut(10).alias("ITI_decile"))

iti_decile_stats = (
    filter_df.group_by("ITI_decile")
    .agg(pl.col("next_day_ret").mean().alias("avg_next_day_ret"))
    .sort("ITI_decile")
)
print(iti_decile_stats)


shape: (10, 2)
┌─────────────────────────────────┬──────────────────┐
│ ITI_decile                      ┆ avg_next_day_ret │
│ ---                             ┆ ---              │
│ cat                             ┆ f64              │
╞═════════════════════════════════╪══════════════════╡
│ (-inf, 0.12406062]              ┆ 0.000836         │
│ (0.12406062, 0.1770104]         ┆ 0.000607         │
│ (0.1770104, 0.22323705]         ┆ 0.000866         │
│ (0.22323705, 0.26850203]        ┆ 0.000651         │
│ (0.26850203, 0.3158928]         ┆ 0.000663         │
│ (0.3158928, 0.36798325]         ┆ 0.000791         │
│ (0.36798325, 0.430091823999999… ┆ 0.000694         │
│ (0.43009182399999996, 0.510508… ┆ 0.000971         │
│ (0.5105084, 0.6314507940000001… ┆ 0.000743         │
│ (0.6314507940000001, inf]       ┆ 0.000688         │
└─────────────────────────────────┴──────────────────┘


In [24]:
filter_df = filter_df.with_columns([
    # Colonne indiquant si le signe du retour a changé par rapport à la veille
    (pl.col("next_day_ret").sign() != pl.col("next_day_ret").shift(1).sign())
        .cast(pl.Int8)
        .alias("ret_sign_change")
])

In [25]:
filter_df.select([
    pl.col("next_day_ret"),
    pl.col("ret_sign_change"),
    pl.col("ITI(13D)")
]).head(10)

next_day_ret,ret_sign_change,ITI(13D)
f64,i8,f64
-0.006728,,0.030262
0.038868,1.0,0.1824342
-0.001397,1.0,0.1824342
0.002006,1.0,0.481816
0.050674,0.0,0.509262
0.059165,0.0,0.509262
0.016298,0.0,0.185055
0.008351,0.0,0.174374
0.022901,0.0,0.2680761
-0.017632,1.0,0.311135


In [None]:
corr_sign_iti = filter_df.select(pl.corr("ret_sign_change", "ITI(13D)")).item()
print(f"Corrélation entre changement de signe et ITI(13D): {corr_sign_iti:.4f}")

Corrélation entre changement de signe et ITI(13D): 0.0008


In [27]:
# Corrélation ITI - changement de signe du return, par firme
corr_by_firm_sign = (
    filter_df
    .group_by("permco")
    .agg(pl.corr("ITI(13D)", "ret_sign_change").alias("corr_ITI_sign"))
    .drop_nulls()
)

# Statistiques de distribution des corrélations
corr_sign_distribution = corr_by_firm_sign.select([
    pl.col("corr_ITI_sign").mean().alias("mean_corr"),
    pl.col("corr_ITI_sign").std().alias("std_corr"),
    pl.col("corr_ITI_sign").min().alias("min_corr"),
    pl.col("corr_ITI_sign").max().alias("max_corr"),
    ((pl.col("corr_ITI_sign").abs() < 0.1).sum()).alias("near_zero"),   # |corr| < 0.1
    ((pl.col("corr_ITI_sign") > 0.7).sum()).alias("near_pos1"),         # corr > 0.7
    ((pl.col("corr_ITI_sign") < -0.7).sum()).alias("near_neg1"),        # corr < -0.7
    pl.len().alias("total_firms")
])

print(corr_sign_distribution)


shape: (1, 8)
┌───────────┬──────────┬──────────┬──────────┬───────────┬───────────┬───────────┬─────────────┐
│ mean_corr ┆ std_corr ┆ min_corr ┆ max_corr ┆ near_zero ┆ near_pos1 ┆ near_neg1 ┆ total_firms │
│ ---       ┆ ---      ┆ ---      ┆ ---      ┆ ---       ┆ ---       ┆ ---       ┆ ---         │
│ f64       ┆ f64      ┆ f64      ┆ f64      ┆ u32       ┆ u32       ┆ u32       ┆ u32         │
╞═══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╪═══════════╪═════════════╡
│ NaN       ┆ NaN      ┆ -1.0     ┆ 1.0      ┆ 1647      ┆ 21        ┆ 7         ┆ 2037        │
└───────────┴──────────┴──────────┴──────────┴───────────┴───────────┴───────────┴─────────────┘


In [28]:
# Corrélation entre ema_sentiment_10 et next_day_ret, par entreprise
corr_by_firm_sent = (
    filter_df
    .group_by("permco")
    .agg(pl.corr("ema_sentiment_10", "next_day_ret").alias("corr_sentiment_ret"))
    .drop_nulls()  # supprime les corrélations NaN
)

# Statistiques globales sur la distribution des corrélations
corr_sent_distribution = corr_by_firm_sent.select([
    pl.col("corr_sentiment_ret").mean().alias("mean_corr"),
    pl.col("corr_sentiment_ret").std().alias("std_corr"),
    pl.col("corr_sentiment_ret").min().alias("min_corr"),
    pl.col("corr_sentiment_ret").max().alias("max_corr"),
    ((pl.col("corr_sentiment_ret").abs() < 0.1).sum()).alias("near_zero"),   # |corr| < 0.1
    ((pl.col("corr_sentiment_ret") > 0.7).sum()).alias("near_pos1"),         # corr > 0.7
    ((pl.col("corr_sentiment_ret") < -0.7).sum()).alias("near_neg1"),        # corr < -0.7
    pl.len().alias("total_firms")
])

print(corr_sent_distribution)


shape: (1, 8)
┌───────────┬──────────┬──────────┬──────────┬───────────┬───────────┬───────────┬─────────────┐
│ mean_corr ┆ std_corr ┆ min_corr ┆ max_corr ┆ near_zero ┆ near_pos1 ┆ near_neg1 ┆ total_firms │
│ ---       ┆ ---      ┆ ---      ┆ ---      ┆ ---       ┆ ---       ┆ ---       ┆ ---         │
│ f64       ┆ f64      ┆ f64      ┆ f64      ┆ u32       ┆ u32       ┆ u32       ┆ u32         │
╞═══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╪═══════════╪═════════════╡
│ -0.000591 ┆ 0.142995 ┆ -1.0     ┆ 1.0      ┆ 1673      ┆ 13        ┆ 8         ┆ 2037        │
└───────────┴──────────┴──────────┴──────────┴───────────┴───────────┴───────────┴─────────────┘


In [31]:
import polars as pl

# 1. Détecter le changement de signe du sentiment (EMA)
filter_df = filter_df.sort(["permco", "date"]).with_columns([
    pl.col("ema_sentiment_10").shift(1).over("permco").alias("prev_sentiment"),
])

filter_df = filter_df.with_columns([
    (
        (pl.col("prev_sentiment") * pl.col("ema_sentiment_10") < 0)
    ).cast(pl.Int8).alias("sentiment_regime_change")
])

# 2. Définir si l’ITI est "haut"
high_threshold = 0.8  # ajustable selon la distribution de l'ITI
filter_df = filter_df.with_columns([
    (pl.col("ITI(13D)") > high_threshold).cast(pl.Int8).alias("high_ITI")
])

# 3. Créer une variable d’interaction
filter_df = filter_df.with_columns([
    (pl.col("high_ITI") * pl.col("sentiment_regime_change")).alias("signal_change_high_ITI")
])

# 4. Corrélation par firme
corr_by_firm = (
    filter_df
    .group_by("permco")
    .agg(pl.corr("signal_change_high_ITI", "next_day_ret").alias("corr_signal_ret"))
    .drop_nulls()
    .sort("corr_signal_ret", descending=True)
)

# 5. Statistiques descriptives
corr_stats = corr_by_firm.select([
    pl.col("corr_signal_ret").mean().alias("mean_corr"),
    pl.col("corr_signal_ret").std().alias("std_corr"),
    pl.col("corr_signal_ret").min().alias("min_corr"),
    pl.col("corr_signal_ret").max().alias("max_corr"),
    ((pl.col("corr_signal_ret").abs() < 0.1).sum()).alias("near_zero"),
    ((pl.col("corr_signal_ret") > 0.7).sum()).alias("near_pos1"),
    ((pl.col("corr_signal_ret") < -0.7).sum()).alias("near_neg1"),
    pl.len().alias("total_firms")
])

print(corr_stats)


shape: (1, 8)
┌───────────┬──────────┬───────────┬──────────┬───────────┬───────────┬───────────┬─────────────┐
│ mean_corr ┆ std_corr ┆ min_corr  ┆ max_corr ┆ near_zero ┆ near_pos1 ┆ near_neg1 ┆ total_firms │
│ ---       ┆ ---      ┆ ---       ┆ ---      ┆ ---       ┆ ---       ┆ ---       ┆ ---         │
│ f64       ┆ f64      ┆ f64       ┆ f64      ┆ u32       ┆ u32       ┆ u32       ┆ u32         │
╞═══════════╪══════════╪═══════════╪══════════╪═══════════╪═══════════╪═══════════╪═════════════╡
│ NaN       ┆ NaN      ┆ -0.697056 ┆ 0.654582 ┆ 1205      ┆ 731       ┆ 0         ┆ 2037        │
└───────────┴──────────┴───────────┴──────────┴───────────┴───────────┴───────────┴─────────────┘


In [33]:
filter_df.describe()

statistic,ITI(13D),next_day_ret,permco,Positive,Negative,Neutral,date,sentiment_score,ema_sentiment_10,ITI_decile,ret_sign_change,prev_sentiment,sentiment_regime_change,high_ITI,signal_change_high_ITI
str,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,f64,f64,f64
"""count""",1046012.0,1046012.0,1046012.0,1046012.0,1046012.0,1046012.0,"""1046012""",1046012.0,1046012.0,"""1046012""",1046011.0,1043975.0,1043975.0,1046012.0,1043975.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,"""0""",0.0,0.0,"""0""",1.0,2037.0,2037.0,0.0,2037.0
"""mean""",0.350079,0.000751,26929.537592,0.276516,0.199291,0.524194,"""2015-09-13 02:24:34.014447""",0.077225,0.047101,,0.514572,0.047127,0.171459,0.029197,0.004708
"""std""",0.195987,0.034584,18347.572172,0.329548,0.301611,0.369576,,0.512402,0.173318,,0.499788,0.173327,0.37691,0.168357,0.068453
"""min""",0.000663,-0.928571,37.0,0.00607,0.005645,0.008015,"""2009-05-27""",-0.96954,-0.924943,,0.0,-0.924943,0.0,0.0,0.0
"""25%""",0.2004462,-0.010582,12305.0,0.034653,0.018544,0.103952,"""2013-09-10""",-0.083208,-0.048086,,0.0,-0.048043,0.0,0.0,0.0
"""50%""",0.3158928,0.000423,21068.0,0.088214,0.040139,0.628479,"""2015-10-21""",0.033937,0.041859,,1.0,0.041888,0.0,0.0,0.0
"""75%""",0.4669854,0.011632,47220.0,0.4989567,0.21436,0.8982955,"""2018-01-22""",0.3848,0.155328,,1.0,0.155351,0.0,0.0,0.0
"""max""",0.9997168,3.429348,56668.0,0.961015,0.9773637,0.9589839,"""2019-07-31""",0.946633,0.926176,,1.0,0.926176,1.0,1.0,1.0
