In [3]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score


# Adjust these paths to where the input files live.
sen_path = "merged_senator_transactions_2022_2025.csv"
stock_path = "stock_data.parquet"

# Senator transactions are stored as CSV.
df_sen = pd.read_csv(sen_path)
# The stock data is a Parquet file, so it must be read with read_parquet;
# read_csv cannot parse the binary Parquet format.
# Requires a Parquet engine (pyarrow or fastparquet) in the environment.
df_stock = pd.read_parquet(stock_path)

AttributeError: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import)

In [None]:
# 3. Parse timestamps
#
# Convert both time columns to datetimes, then order the stock bars by time
# so that later "first bar at or after t" lookups see them chronologically.
filed_times = pd.to_datetime(df_sen['TimeOfFiled'])
bar_times = pd.to_datetime(df_stock['timestamp'])

df_sen['TimeOfFiled'] = filed_times
df_stock['timestamp'] = bar_times

df_stock = df_stock.sort_values(by='timestamp')

In [None]:
# 4. Senator feature engineering

# 4.1 Signed amount: start from the mean transaction amount and flip the sign
# for rows whose BUY/SELL column is exactly 'SELL'.
sell_rows = df_sen['BUY/SELL'] == 'SELL'
df_sen['signed_amount'] = df_sen['Mean tx_Amount']
df_sen.loc[sell_rows, 'signed_amount'] = -df_sen.loc[sell_rows, 'Mean tx_Amount']

# 4.2 Time features: filing time as a fractional hour of day, and the weekday
# number as reported by the pandas datetime accessor.
filed_at = df_sen['TimeOfFiled'].dt
df_sen['tx_hour'] = filed_at.hour + filed_at.minute / 60
df_sen['tx_weekday'] = filed_at.weekday


In [None]:
# 5. Merge the stock price at the minute of publication

def get_price_at_or_after(t, stock=None):
    """Return the first stock bar whose timestamp is at or after ``t``.

    The lookup is a binary search, so the frame must be sorted ascending by
    its 'timestamp' column (the notebook sorts ``df_stock`` right after
    parsing the timestamps).

    Parameters
    ----------
    t : datetime-like
        Point in time to look up (e.g. a filing's ``TimeOfFiled``).
    stock : pandas.DataFrame, optional
        Frame with a sorted 'timestamp' column. Defaults to the notebook's
        global ``df_stock``.

    Returns
    -------
    pandas.Series or None
        The matching row, or None when ``t`` is missing, the frame is empty,
        or no bar exists at or after ``t``.
    """
    bars = df_stock if stock is None else stock
    timestamps = bars['timestamp']

    # A missing lookup time can never match a bar.
    if pd.isna(t) or len(timestamps) == 0:
        return None

    # side='left' gives the position of the first timestamp >= t.
    pos = int(timestamps.searchsorted(t, side='left'))
    if pos >= len(bars):
        return None

    row = bars.iloc[pos]
    # Missing timestamps sort to the end; they are not a real match.
    if pd.isna(row['timestamp']):
        return None
    return row

# Build the modelling table: one row per filing, with the stock bar at (or
# just after) the filing time as features and the percentage price change to
# the bar at (or just after) filing time + 30 minutes as target.
HORIZON = timedelta(minutes=30)

# Explicit column order so the frame has the expected columns even when no
# filing could be matched (otherwise the later df[features] raises KeyError).
RESULT_COLUMNS = [
    'TimeOfFiled',
    'signed_amount',
    'tx_hour',
    'tx_weekday',
    'price_before',
    'vol_before',
    'vwap_before',
    'price_change_pct',
]

results = []

for _, row in df_sen.iterrows():
    t = row['TimeOfFiled']

    stock_row = get_price_at_or_after(t)
    if stock_row is None:
        continue  # no price bar at or after the filing time

    # Target bar: reuse the same helper so both lookups follow one rule.
    # NOTE(review): when no bar lies between t and t + 30 min, both lookups
    # resolve to the same bar and the target is exactly 0 -- confirm that
    # this is the intended treatment for such filings.
    future_row = get_price_at_or_after(t + HORIZON)
    if future_row is None:
        continue  # no price bar at or after the horizon

    price_now = stock_row['close']
    price_future = future_row['close']

    # A missing or zero price makes the percentage change undefined (NaN or
    # inf), which would break model fitting later on; skip such filings.
    if pd.isna(price_now) or pd.isna(price_future) or price_now == 0:
        continue

    results.append({
        'TimeOfFiled': t,
        'signed_amount': row['signed_amount'],
        'tx_hour': row['tx_hour'],
        'tx_weekday': row['tx_weekday'],
        'price_before': price_now,
        'vol_before': stock_row['volume'],
        'vwap_before': stock_row['vwap'],
        'price_change_pct': (price_future - price_now) / price_now * 100,
    })

df = pd.DataFrame(results, columns=RESULT_COLUMNS)


In [None]:
# 6. Preview the first rows of the feature/target table for presentation

print("Sample of features + target:\n")
sample_rows = df.head(5)
display(sample_rows)

In [None]:
# 7. Train/test split

# Column holding the value to predict.
target = 'price_change_pct'

# Model inputs, in the order they are passed to the estimator.
features = [
    'signed_amount',
    'tx_hour',
    'tx_weekday',
    'price_before',
    'vol_before',
    'vwap_before',
]

X = df.loc[:, features]
y = df.loc[:, target]

# Hold out 25% of the rows; the fixed seed makes the shuffle reproducible.
# NOTE(review): this is a random split over time-ordered filings, so test rows
# can predate training rows -- consider a chronological split.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# 8. Modeling: random forest regressor

# Hyperparameters collected in one place; the seed keeps the fit reproducible.
rf_params = {
    'n_estimators': 300,
    'max_depth': 6,
    'min_samples_leaf': 3,
    'random_state': 42,
}

model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

# Score the held-out rows.
preds = model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

In [None]:
# 9. Results

print("\n===== Model Evaluation =====")
print(f"MAE: {mae:.4f} %")
print(f"R² : {r2:.4f}")

# Reference point: the error of a forecast that always predicts no change.
print("\nBaseline MAE (always predict 0% change):")
zero_forecast = np.zeros_like(y_test)
baseline_mae = mean_absolute_error(y_test, zero_forecast)
print(f"Baseline MAE: {baseline_mae:.4f} %")
