# Step 7 : Alert Generation & Evaluation
Carta et al. (2021) — Étape 7 du pipeline

Détecter les hot events en mesurant la proportion de tweets assignés :$$R(d) = \frac{\text{nb tweets assignés le jour } d}{\text{nb total tweets le jour } d}$$Si $R(d) > \theta$, une alerte est générée.

### Ground Truth

- $\delta_d = |\text{closed}_{d+7} - \text{closed}_d| / \text{closed}_d$
- Event day si $\delta_d > 2\%$
- Métriques : Precision, Recall, F-score

### Libraries

In [1]:
import pandas as pd
import plotly.graph_objects as go

### Data loading

In [13]:
tweets_ready = pd.read_csv('../data/for_models/tweets_features.csv')
tweets_ready['date'] = pd.to_datetime(tweets_ready['date'])
final_tweets_assigned = pd.read_csv('../data/for_models/tweets_assigned.csv')
final_tweets_assigned['date'] = pd.to_datetime(final_tweets_assigned['date'])
sp500_prices = pd.read_csv('../data/processed/sp500_2023.csv')

### Alert generation

In [18]:
import pandas as pd
import plotly.graph_objects as go

# 1. Chargement et nettoyage du fichier sp500_2023.csv
# On ignore les lignes d'en-tête superflues (Ticker, Date row)
df = pd.read_csv('../data/processed/sp500_2023.csv', skiprows=[1, 2])
df.rename(columns={'Price': 'date', 'Close': 'close'}, inplace=True)
df['date'] = pd.to_datetime(df['date'])

# Filtrage pour l'année 2023 uniquement
df = df[df['date'].dt.year == 2023].reset_index(drop=True)

# 2. Calcul de la variation hebdomadaire (retour sur 5 jours glissants)
df['weekly_return'] = df['close'].pct_change(periods=5)

# 3. Identification des segments d'événements (variation > 2% en valeur absolue)
event_mask = df['weekly_return'].abs() > 0.02
events = []
in_event = False
start_date = None

for i in range(len(df)):
    if event_mask.iloc[i]:
        if not in_event:
            start_date = df['date'].iloc[i]
            in_event = True
    else:
        if in_event:
            end_date = df['date'].iloc[i-1]
            events.append((start_date, end_date))
            in_event = False
if in_event:
    events.append((start_date, df['date'].iloc[-1]))

# 4. Création de la figure Plotly
fig = go.Figure()

# Courbe du S&P 500
fig.add_trace(go.Scatter(
    x=df['date'], 
    y=df['close'], 
    mode='lines',
    name='S&P 500',
    line=dict(color='#3498db', width=2),
    hovertemplate="<b>%{x|%d %b %Y}</b><br>Prix: %{y:.2f}$<extra></extra>"
))

# Ajout des zones d'événements en rouge
for start, end in events:
    fig.add_vrect(
        x0=start, x1=end,
        fillcolor="red", opacity=0.15, layer="below", line_width=0
    )

# Ligne verticale pour la SVB (10 mars)
fig.add_vline(
    x='2023-03-10', 
    line_width=2, 
    line_dash="dot", 
    line_color="orange"
)

# Annotation SVB
fig.add_annotation(
    x='2023-03-10',
    y=1.05,
    yref='paper',
    text="Faillite SVB (10 mars)",
    showarrow=False,
    font=dict(color="orange", size=12),
    bgcolor="white"
)

# Mise en page
fig.update_layout(
    title='<b>S&P 500 (2023) — Zones de Volatilité Hebdomadaire > 2%</b>',
    xaxis_title='Date',
    yaxis_title='Prix de clôture ($)',
    template='plotly_white',
    hovermode='x unified',
    xaxis=dict(tickformat='%b %Y', dtick="M1", tickangle=-45),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)

fig.show()

In [None]:
def generate_market_alerts(all_tweets_df, assigned_tweets_df, alert_threshold=0.20):
    """
    Calcule le ratio d'assignation quotidien et génère des alertes.
    all_tweets_df : Tweets après dédoublonnage (le dénominateur)
    assigned_tweets_df : Tweets ayant passé le seuil Delta (le numérateur)
    """
    # 1. Volume total par jour
    total_daily = all_tweets_df.groupby(all_tweets_df['date'].dt.date).size().reset_index(name='total_count')
    
    # 2. Volume assigné par jour
    assigned_daily = assigned_tweets_df.groupby(assigned_tweets_df['date'].dt.date).size().reset_index(name='assigned_count')
    
    # 3. Fusion et calcul du ratio
    alert_metrics = pd.merge(total_daily, assigned_daily, on='date', how='left').fillna(0)
    alert_metrics['ratio'] = alert_metrics['assigned_count'] / alert_metrics['total_count']
    
    # 4. Identification des alertes
    alert_metrics['is_alert'] = alert_metrics['ratio'] >= alert_threshold
    
    return alert_metrics

# --- EXECUTION ---
alert_data = generate_market_alerts(tweets_ready, final_tweets_assigned, alert_threshold=0.25)

print("Jours avec Alertes Générées :")
print(alert_data[alert_data['is_alert'] == True][['date', 'ratio']])

Jours avec Alertes Générées :
          date     ratio
52  2023-03-08  1.000000
53  2023-03-09  0.846154
54  2023-03-10  0.882353
55  2023-03-11  0.333333
56  2023-03-12  0.571429
57  2023-03-13  0.902439
58  2023-03-14  0.833333
59  2023-03-15  0.941176
60  2023-03-16  0.954545
61  2023-03-17  0.750000


In [11]:
def plot_alert_generation(alert_data, alert_threshold=0.25):
    fig = go.Figure()

    # Courbe du Ratio
    fig.add_trace(go.Scatter(
        x=alert_data['date'], 
        y=alert_data['ratio'],
        mode='lines+markers',
        name='Social Heat Ratio',
        line=dict(color='#1DA1F2', width=2),
        fill='tozeroy' # Remplissage sous la courbe pour le "volume"
    ))

    # Ligne du Seuil d'Alerte
    fig.add_hline(
        y=alert_threshold, 
        line_dash="dash", 
        line_color="red", 
        annotation_text="ALERT THRESHOLD", 
        annotation_position="top left"
    )

    # Marquage des points d'alerte
    alerts = alert_data[alert_data['is_alert']]
    fig.add_trace(go.Scatter(
        x=alerts['date'], 
        y=alerts['ratio'],
        mode='markers',
        marker=dict(color='red', size=12, symbol='triangle-up'),
        name='Market Alert Generated'
    ))

    fig.update_layout(
        title="<b>Pipeline Final : Génération d'Alertes de Marché</b><br><sup>Basé sur le ratio d'assignation Social Media</sup>",
        xaxis_title="Date",
        yaxis_title="% de Tweets Assignés aux News",
        template="plotly_white",
        yaxis=dict(tickformat=".0%") # Format pourcentage
    )
    
    fig.show()

plot_alert_generation(alert_data, alert_threshold=0.25)

In [22]:
import pandas as pd

# 1. Chargement et nettoyage
df_sp500 = pd.read_csv('../data/processed/sp500_2023.csv', skiprows=[1, 2])
df_sp500.rename(columns={'Price': 'date', 'Close': 'close'}, inplace=True)
df_sp500['date'] = pd.to_datetime(df_sp500['date'])

# On garde uniquement 2023
df_sp500 = df_sp500[df_sp500['date'].dt.year == 2023].reset_index(drop=True)

# 2. Calcul de la variation hebdomadaire (5 jours)
df_sp500['weekly_return'] = df_sp500['close'].pct_change(periods=5)

# 3. Extraction des périodes d'événements (variation > 2%)
event_mask = df_sp500['weekly_return'].abs() > 0.02
gt_events = []
in_event = False
start_date = None

for i in range(len(df_sp500)):
    if event_mask.iloc[i]:
        if not in_event:
            start_date = df_sp500['date'].iloc[i]
            in_event = True
    else:
        if in_event:
            end_date = df_sp500['date'].iloc[i-1]
            gt_events.append({'start': start_date, 'end': end_date})
            in_event = False
if in_event:
    gt_events.append({'start': start_date, 'end': df_sp500['date'].iloc[-1]})

print(f"✅ {len(gt_events)} événements Ground Truth (S&P 500) ont été définis.")

✅ 26 événements Ground Truth (S&P 500) ont été définis.


In [23]:
def calculate_pipeline_metrics(alert_data, ground_truth_events):
    """
    Calcule Precision, Recall et F-score selon la méthodologie Carta et al.
    """
    # 1. Identification des alertes générées par le système (Signal)
    system_alerts = alert_data[alert_data['is_alert'] == True]['date'].tolist()
    total_alerts = len(system_alerts)
    
    if total_alerts == 0:
        return {"Precision": 0, "Recall": 0, "F1": 0}

    # 2. Comptage des HITS (Alertes tombant dans un événement GT)
    hits = 0
    detected_events_count = 0
    
    # On vérifie chaque événement de marché (GT)
    for event in ground_truth_events:
        event_detected = False
        
        # Un événement est "détecté" si au moins une alerte tombe dedans
        for alert_date in system_alerts:
            # On convertit en timestamp pour la comparaison
            ts_alert = pd.Timestamp(alert_date)
            
            if event['start'] <= ts_alert <= event['end']:
                if not event_detected:
                    detected_events_count += 1
                    event_detected = True
                
                # Pour la précision, on compte chaque alerte correcte
                hits += 1
    
    # 3. Calcul des Métriques
    # Precision : Combien d'alertes étaient correctes ?
    precision = hits / total_alerts
    
    # Recall : Combien d'événements réels ont été capturés ?
    recall = detected_events_count / len(ground_truth_events)
    
    # F-score : Équilibre entre les deux
    f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        "Total Ground Truth Events": len(ground_truth_events),
        "Total Alerts Generated": total_alerts,
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F-Score": round(f_score, 4)
    }

# --- APPLICATION ---
# Note : 'gt_events' est la liste des 26 périodes calculées sur le S&P 500
metrics = calculate_pipeline_metrics(alert_data, gt_events)

print("--- RÉSULTATS DE L'EXPÉRIENCE (Carta et al. Methodology) ---")
for k, v in metrics.items():
    print(f"{k}: {v}")

--- RÉSULTATS DE L'EXPÉRIENCE (Carta et al. Methodology) ---
Total Ground Truth Events: 26
Total Alerts Generated: 10
Precision: 0.5
Recall: 0.0769
F-Score: 0.1333
