In [35]:
import pandas as pd
import numpy as np
from datetime import timedelta

sen_path = r"C:\Users\User\PycharmProjects\Project-Trading\data\merged_senator_transactions_2022_2025.csv"
stock_path = r"C:\Users\User\PycharmProjects\Project-Trading\data\stock_data.parquet"

df_sen = pd.read_csv(sen_path)
df_stock = pd.read_parquet(stock_path)

print("Senator rows:", len(df_sen))
print("Stock rows:", len(df_stock))


Senator rows: 3468
Stock rows: 2037471


In [36]:
# 3. Parse timestamps

df_sen['time_of_filed'] = pd.to_datetime(df_sen['time_of_filed'])
df_stock['timestamp'] = pd.to_datetime(df_stock['timestamp'])


df_stock = df_stock.sort_values('timestamp')

In [37]:
df_sen.columns.tolist()


['tx_date',
 'file_date',
 'last_name',
 'first_name',
 'order_type',
 'ticker',
 'asset_name',
 'tx_amount',
 'Link',
 'filed_timestamp',
 'time_of_filed',
 'Mean Amount',
 'mean_amount',
 'Unnamed: 13',
 'tx_date.1',
 'file_date.1',
 'tx_amount.1',
 'TimeOfFiled',
 'Mean tx_Amount',
 'BUY/SELL']

In [38]:
# 4. Senatoren-Feature Engineering

# 4.1 Amount mit Vorzeichen
df_sen['signed_amount'] = df_sen['Mean Amount']

# SELL → negative Werte
df_sen.loc[df_sen['BUY/SELL'] == 'SELL', 'signed_amount'] *= -1


# 4.2 Zeitfeatures
df_sen['TimeOfFiled'] = pd.to_datetime(df_sen['TimeOfFiled'], errors='coerce')

df_sen['tx_hour'] = df_sen['TimeOfFiled'].dt.hour + df_sen['TimeOfFiled'].dt.minute / 60
df_sen['tx_weekday'] = df_sen['TimeOfFiled'].dt.weekday


In [39]:
# 5. Aktienkurs zur Veröffentlichungsminute mergen
#Zeiten anpassen und auf den selben nenner bringen
df_sen['TimeOfFiled'] = pd.to_datetime(df_sen['TimeOfFiled'], errors='coerce')
df_sen['TimeOfFiled'] = df_sen['TimeOfFiled'].dt.tz_localize('UTC')

df_stock['timestamp'] = pd.to_datetime(df_stock['timestamp'], errors='coerce')
if df_stock['timestamp'].dt.tz is None:
    df_stock['timestamp'] = df_stock['timestamp'].dt.tz_localize('UTC')


def get_price_at_or_after(t):
    """ Erste Aktienkursminute >= TimeOfFiled """
    row = df_stock[df_stock['timestamp'] >= t].head(1)
    if len(row) == 0:
        return None
    return row.iloc[0]

results = []

for idx, row in df_sen.iterrows():
    t = row['TimeOfFiled']

    stock_row = get_price_at_or_after(t)
    if stock_row is None:
        continue  # kein Kurs vorhanden

    merged = {
        'TimeOfFiled': t,
        'signed_amount': row['signed_amount'],
        'tx_hour': row['tx_hour'],
        'tx_weekday': row['tx_weekday'],
        'price_before': stock_row['close'],
        'vol_before': stock_row['volume'],
        'vwap_before': stock_row['vwap'],
    }

    # Für Target: Preis 30 Minuten später
    t_future = t + timedelta(minutes=30)
    future_row = df_stock[df_stock['timestamp'] >= t_future].head(1)

    if len(future_row) == 0:
        continue

    price_future = future_row.iloc[0]['close']
    price_now = merged['price_before']

    merged['price_change_pct'] = (price_future - price_now) / price_now * 100

    results.append(merged)

df = pd.DataFrame(results)


In [40]:
# 6. Show sample rows for presentation

print("Sample of features + target:\n")
display(df.head())

Sample of features + target:



Unnamed: 0,TimeOfFiled,signed_amount,tx_hour,tx_weekday,price_before,vol_before,vwap_before,price_change_pct
0,2022-10-04 11:06:00+00:00,32501,11.1,1.0,120.118,2256186.0,119.974289,0.0
1,2022-10-04 11:06:00+00:00,8001,11.1,1.0,120.118,2256186.0,119.974289,0.0


In [41]:
from sklearn.model_selection import train_test_split

# 7. Train/Test Split

features = [
    'signed_amount',
    'tx_hour',
    'tx_weekday',
    'price_before',
    'vol_before',
    'vwap_before'
]

target = 'price_change_pct'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [42]:
from sklearn.metrics import mean_absolute_error, r2_score

# 8. Modeling: Random Forest

model = RandomForestRegressor(
    n_estimators=300,
    max_depth=6,
    min_samples_leaf=3,
    random_state=42
)

model.fit(X_train, y_train)

preds = model.predict(X_test)

mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)



In [46]:
print("Spalten in df_sen:")
print(df_sen.columns.tolist())

print("\nSpalten in df_stock:")
print(df_stock.columns.tolist())

Spalten in df_sen:
['tx_date', 'file_date', 'last_name', 'first_name', 'order_type', 'ticker', 'asset_name', 'tx_amount', 'Link', 'filed_timestamp', 'time_of_filed', 'Mean Amount', 'mean_amount', 'Unnamed: 13', 'tx_date.1', 'file_date.1', 'tx_amount.1', 'TimeOfFiled', 'Mean tx_Amount', 'BUY/SELL', 'signed_amount', 'tx_hour', 'tx_weekday']

Spalten in df_stock:
['symbol', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'trade_count', 'vwap']


In [44]:
# 9. Ergebnisse

print("\n===== Model Evaluation =====")
print(f"MAE: {mae:.4f} %")
print(f"R² : {r2:.4f}")

print("\nBaseline MAE (always predict 0% change):")
baseline_mae = mean_absolute_error(y_test, np.zeros_like(y_test))
print(f"Baseline MAE: {baseline_mae:.4f} %")



===== Model Evaluation =====
MAE: 0.0000 %
R² : nan

Baseline MAE (always predict 0% change):
Baseline MAE: 0.0000 %
