# In [None]:  (notebook cell prompt kept as a comment so the file parses as Python)
# Stock Return Prediction — Completed Version with Price Trend Plot
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import os

# -------------------------------------------------
# 1-2. READ RAW FILES (comma-separated) AND NAME THE COLUMNS
# -------------------------------------------------
# Column labels, in file order, for the two headerless inputs.
TRADE_COLUMNS = [
    'Date', 'Time', 'Price', 'Quantity', 'Buy_Sell', 'Trade_Type',
    'Trade_ID', 'Buyer_ID', 'Buyer_Type', 'Buyer_Role',
    'Seller_ID', 'Seller_Type', 'Seller_Role',
]
QUOTE_COLUMNS = [
    'Date', 'Time', 'bid_price', 'bid_size', 'ask_price', 'ask_size',
]

# header=None makes pandas treat the first line of each file as data.
trade_df = pd.read_csv('../data/trade.csv', sep=',', header=None)
quote_df = pd.read_csv('../data/quote.csv', sep=',', header=None)

# Assigning .columns raises if the file's column count does not match the list.
trade_df.columns = TRADE_COLUMNS
quote_df.columns = QUOTE_COLUMNS

# -------------------------------------------------
# 3. PARSE DATETIME AND CLEAN DATA
# -------------------------------------------------
DATETIME_FORMAT = '%d/%m/%Y %H:%M:%S.%f'

# Build one timestamp per row from the Date and Time text columns. Rows that
# do not match DATETIME_FORMAT become NaT (errors='coerce') instead of
# aborting the load.
for frame in (trade_df, quote_df):
    frame['Datetime'] = pd.to_datetime(frame['Date'] + ' ' + frame['Time'],
                                       format=DATETIME_FORMAT, errors='coerce')

trade_df = trade_df.rename(columns={'Price': 'price', 'Quantity': 'volume', 'Buy_Sell': 'aggressor'})
trade_df['company'] = 'ABC'
# Both capacity columns are copies of the traded volume, stored as floats.
trade_df['buy_order_capacity'] = trade_df['volume'].astype(float)
trade_df['sell_order_capacity'] = trade_df['volume'].astype(float)

# Coerce every quote price/size column, not just the ask side, so bid and ask
# are handled the same way; unparseable entries become NaN.
for quote_col in ('bid_price', 'bid_size', 'ask_price', 'ask_size'):
    quote_df[quote_col] = pd.to_numeric(quote_df[quote_col], errors='coerce')
quote_df['company'] = 'ABC'

# Map trade period:
#   'O' for 09:00:00 <= t <  09:15:00
#   'T' for 15:30:00 <= t <= 16:00:00
#   '-' otherwise
# The comparison is done on the elapsed time since midnight so it is
# vectorised, and NaT timestamps (all comparisons False) fall through to '-'
# instead of breaking the comparison.
time_of_day = trade_df['Datetime'] - trade_df['Datetime'].dt.normalize()
is_opening = (
    (time_of_day >= pd.Timedelta(hours=9))
    & (time_of_day < pd.Timedelta(hours=9, minutes=15))
).to_numpy()
is_closing = (
    (time_of_day >= pd.Timedelta(hours=15, minutes=30))
    & (time_of_day <= pd.Timedelta(hours=16))
).to_numpy()
trade_df['trade_period'] = np.select([is_opening, is_closing], ['O', 'T'], default='-')

# 'B'/'S' become 'buy'/'sell'; any other code maps to NaN.
trade_df['aggressor'] = trade_df['aggressor'].map({'B': 'buy', 'S': 'sell'})

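# -------------------------------------------------
# 4. FEATURE AGGREGATION FUNCTIONS
# -------------------------------------------------
# NOTE: the aggregation code was omitted from this cell for brevity. Section 5
# calls merge_aggregated_data(trade_df, quote_df) and create_target(...), so
# both functions must be defined here (use the aggregation code from the
# previous update) before the cell can run.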

# -------------------------------------------------
# 5. AGGREGATE + TARGET
# -------------------------------------------------
# merge_aggregated_data and create_target come from section 4.
aggregated_df = merge_aggregated_data(trade_df, quote_df)
merged_df = create_target(aggregated_df)

# Positional 80/20 split with no shuffling: the first 80% of rows train the
# model and the remainder is held out for testing.
train_size = int(len(merged_df) * 0.8)
train_df = merged_df.head(train_size).copy()
test_df = merged_df.iloc[train_size:].copy()

# -------------------------------------------------
# 6. PREPROCESSING
# -------------------------------------------------
# Feature columns used for imputation, scaling and model fitting.
numeric_cols = [
    'num_trades', 'o', 'h', 'l', 'c',
    'total_volume', 'total_buy_cap', 'total_sell_cap',
    'weighted_price',
    'trade_imbalance_ratio', 'trade_volume_imbalance_ratio',
    'order_cap_imbalance_ratio',
    'avg_spread', 'max_spread', 'min_spread',
    'total_bid_size', 'total_ask_size',
    'weighted_avg_bid_price', 'weighted_avg_ask_price',
]

# Fill gaps in the training features with the training column means.
train_features = train_df[numeric_cols]
train_df[numeric_cols] = train_features.fillna(train_features.mean())

# The test set is filled with means taken from the (already imputed) training
# frame, so no test-set statistics are used.
imputed_train_means = train_df[numeric_cols].mean()
test_df[numeric_cols] = test_df[numeric_cols].fillna(imputed_train_means)

# Fit the scaler on training data only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(train_df[numeric_cols])
for split_df in (train_df, test_df):
    split_df[numeric_cols] = scaler.transform(split_df[numeric_cols])

# -------------------------------------------------
# 7. LINEAR REGRESSION MODEL
# -------------------------------------------------
target_col = 'stock_return'

# Feature matrices and targets for each split.
X_train, y_train = train_df[numeric_cols], train_df[target_col]
X_test, y_test = test_df[numeric_cols], test_df[target_col]

# Fit on the training split and store test-set predictions next to the actuals.
model = LinearRegression()
model.fit(X_train, y_train)
test_df['predicted_stock_return'] = model.predict(X_test)

# -------------------------------------------------
# 8. CORRELATION PLOT
# -------------------------------------------------
RETURN_COLS = ['stock_return', 'predicted_stock_return']


def _minute_correlation(group):
    """Return Pearson r between actual and predicted returns for one group.

    Rows with a missing value in either column are dropped before the
    correlation is computed, so the rows that are correlated are exactly the
    rows counted by the size check. Returns NaN when fewer than two complete
    rows remain.
    """
    complete = group[RETURN_COLS].dropna()
    if len(complete) < 2:
        return np.nan
    return pearsonr(complete['stock_return'], complete['predicted_stock_return'])[0]


# One correlation value per distinct 'minute' in the test set. Only the two
# return columns are handed to the helper.
correlations = (
    test_df.groupby('minute')[RETURN_COLS]
    .apply(_minute_correlation)
    .reset_index(name='correlation')
)

plt.figure(figsize=(12, 6))
plt.plot(correlations['minute'], correlations['correlation'], marker='o', linestyle='-')
plt.title('Per-Minute Correlation Between Actual and Predicted Stock Returns')
plt.xlabel('Time')
plt.ylabel('Correlation')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
# Create the output directory on first use; exist_ok makes re-runs harmless.
os.makedirs('../plots', exist_ok=True)
plt.savefig('../plots/correlation_plot.png')

# -------------------------------------------------
# 9. TRADE/QUOTE PRICE TREND PLOT
# -------------------------------------------------
# Trade prices as a solid line, with bid and ask quotes overlaid as dashed lines.
trend_fig, trend_ax = plt.subplots(figsize=(14, 6))
trend_ax.plot(trade_df['Datetime'], trade_df['price'].astype(float),
              label='Trade Price', color='blue')
trend_ax.plot(quote_df['Datetime'], quote_df['bid_price'],
              label='Bid Price', linestyle='--', color='green')
trend_ax.plot(quote_df['Datetime'], quote_df['ask_price'],
              label='Ask Price', linestyle='--', color='red')

trend_ax.set_title('Trade and Quote Prices Over Time (April 1, 2024)')
trend_ax.set_xlabel('Time')
trend_ax.set_ylabel('Price')
trend_ax.legend()
trend_ax.grid(True)
trend_fig.tight_layout()

# Make sure the output directory exists before writing the image.
os.makedirs('../plots', exist_ok=True)
trend_fig.savefig('../plots/price_trend_with_quotes.png')

# -------------------------------------------------
# 10. OUTPUT
# -------------------------------------------------
# Show a preview of the aggregated frame: heading line, then the first five rows.
print("\nFirst 5 rows of the merged data:", merged_df.head(), sep="\n")


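# Recorded cell output (NOTE(review): this cell never sets a Matplotlib style,
# so the error presumably comes from a different version of the cell — confirm):
# OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)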