In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load datasets
files = [
    "../data_1/prices_round_1_day_-2.csv",
    "../data_1/prices_round_1_day_-1.csv",
    "../data_1/prices_round_1_day_0.csv"
]
df = pd.concat([pd.read_csv(f, sep=";") for f in files]).sort_values(by="timestamp").reset_index(drop=True)

# Filter for KELP product
kelp = df[df["product"] == "KELP"].copy()
kelp["mid_price"] = (kelp["bid_price_1"] + kelp["ask_price_1"]) / 2

# Compute log returns
kelp["log_price"] = np.log(kelp["mid_price"])
kelp["log_return"] = kelp["log_price"].diff()

# Compute order flow features
kelp["bid_ask_spread"] = kelp["ask_price_1"] - kelp["bid_price_1"]
kelp["order_flow_imbalance"] = (kelp["bid_volume_1"] - kelp["ask_volume_1"]) / (
    kelp["bid_volume_1"] + kelp["ask_volume_1"]
)

# Drop rows with any NaNs
kelp.dropna(subset=["log_return", "bid_ask_spread", "order_flow_imbalance"], inplace=True)

# Construct lagged features
lags = 5
X, y = [], []

for i in range(lags, len(kelp)):
    lagged_returns = kelp["log_return"].iloc[i-lags:i].values
    spread = kelp["bid_ask_spread"].iloc[i]
    imbalance = kelp["order_flow_imbalance"].iloc[i]
    if np.any(np.isnan(lagged_returns)) or np.isnan(spread) or np.isnan(imbalance):
        continue
    features = np.concatenate([lagged_returns, [spread, imbalance]])
    X.append(features)
    y.append(kelp["log_return"].iloc[i])

X = np.array(X)
y = np.array(y)

# Ensure we have data
if X.size == 0 or y.size == 0:
    raise ValueError("Insufficient data to train model.")

# Normalize and fit
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = LinearRegression()
model.fit(X_scaled, y)

y_pred = model.predict(X_scaled)
mse = mean_squared_error(y, y_pred)

model.coef_, model.intercept_, mse


(array([-0.002273  , -0.00482843, -0.0077017 , -0.00944253, -0.00959978,
         0.00013705, -0.00024133]),
 np.float64(5.371537895427946e-07),
 2.5172536931463943e-05)

In [5]:
np.sqrt(mse)

np.float64(0.005017224026437722)

#NON STANDARDIZED FEATURES NORMAL PRICE (NOT LOG RETURNS)

In [56]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the three datasets
file1 = pd.read_csv("../data_1/prices_round_1_day_-2.csv", delimiter=";")
file2 = pd.read_csv("../data_1/prices_round_1_day_-1.csv", delimiter=";")
file3 = pd.read_csv("../data_1/prices_round_1_day_0.csv", delimiter=";")

# Combine all datasets into one
prices = pd.concat([file1, file2, file3], ignore_index=True)

# Filter for KELP product
kelp_prices = prices[prices["product"] == "KELP"].copy()

# Compute mid-price
kelp_prices["mid_price"] = (kelp_prices["bid_price_1"] + kelp_prices["ask_price_1"]) / 2

# Create order flow signals
kelp_prices["bid_ask_spread"] = kelp_prices["ask_price_1"] - kelp_prices["bid_price_1"]
kelp_prices["order_flow_imbalance"] = (kelp_prices["bid_volume_1"] - kelp_prices["ask_volume_1"]) / (
    kelp_prices["bid_volume_1"] + kelp_prices["ask_volume_1"]
)

# Drop rows with NaNs
kelp_prices.dropna(subset=["mid_price", "bid_ask_spread", "order_flow_imbalance"], inplace=True)

# Function to create lagged dataset with target P(t+1)
def create_lagged_features_target_next(series, spread_imbalance, lags=5):
    X, y = [], []
    for i in range(lags, len(series) - 1):
        features = list(series[i - lags:i]) + list(spread_imbalance[i])  # P(t-4) to P(t) + features at t
        X.append(features)
        y.append(series[i + 1])  # target = P(t+1)
    return np.array(X), np.array(y)

# Extract mid-prices and signals
mid_prices = kelp_prices["mid_price"].values
spread_imbalance = kelp_prices[["bid_ask_spread", "order_flow_imbalance"]].values

# Create feature-target dataset using last 5 prices (t-4 to t) to predict t+1
X, y = create_lagged_features_target_next(mid_prices, spread_imbalance, lags=5)

# Fit model and evaluate
model = LinearRegression().fit(X, y)
predictions = model.predict(X)
rmse = np.sqrt(mean_squared_error(y, predictions))

# Extract coefficients and intercept
coefficients = model.coef_
intercept = model.intercept_

(coefficients, intercept, rmse)


(array([ 0.15251124,  0.15576791,  0.17976303,  0.21362462,  0.29735446,
        -0.00146693, -0.01186836]),
 np.float64(1.9887312933824433),
 np.float64(0.6581312888039098))

In [58]:
import numpy as np

# --- Use your trained model values here ---
coeffs = np.array([ 0.15251124,  0.15576791,  0.17976303,  0.21362462,  0.29735446,
        -0.00146693, -0.01186836])
intercept = 1.9887312933824433

# --- Select a timestamp ---
t = 700  # Feel free to change this

# --- Pull raw features from kelp_prices ---
mid_prices = kelp_prices["mid_price"].values
spread = kelp_prices["bid_ask_spread"].values
imbalance = kelp_prices["order_flow_imbalance"].values

# --- Prepare feature vector to predict price at t+1 ---
input_features = list(mid_prices[t-4:t+1]) + [spread[t], imbalance[t]]
predicted = np.dot(coeffs, input_features) + intercept
actual = mid_prices[t+1]

print(f"Predicted: {predicted:.2f}")
print(f"Actual   : {actual:.2f}")
print(f"Error    : {predicted - actual:.2f}")


Predicted: 2004.21
Actual   : 2004.00
Error    : 0.21


#TESTING COEFFS

In [61]:
files = [
    "../data_1/testing/testing_1.csv",
    "../data_1/testing/testing_2.csv"
]
df = pd.concat([pd.read_csv(f, sep=";") for f in files]).sort_values(by="timestamp").reset_index(drop=True)

kelp = df[df["product"] == "KELP"].copy()

kelp.head()

Unnamed: 0,day,timestamp,product,bid_price_1,bid_volume_1,bid_price_2,bid_volume_2,bid_price_3,bid_volume_3,ask_price_1,ask_volume_1,ask_price_2,ask_volume_2,ask_price_3,ask_volume_3,mid_price,profit_and_loss
2,1,0,KELP,2032,23,,,,,2035,23,,,,,2033.5,0.0
4,0,0,KELP,2028,23,,,,,2032,23,,,,,2030.0,0.0
6,0,100,KELP,2031,1,2030.0,2.0,2028.0,22.0,2032,22,,,,,2031.5,0.0
9,1,100,KELP,2034,1,2033.0,2.0,2032.0,22.0,2035,22,,,,,2034.5,0.0
13,0,200,KELP,2029,1,2028.0,21.0,,,2032,22,,,,,2030.5,0.0


In [62]:
# Step 1: Feature Engineering on testing data
kelp["mid_price"] = (kelp["bid_price_1"] + kelp["ask_price_1"]) / 2
kelp["bid_ask_spread"] = kelp["ask_price_1"] - kelp["bid_price_1"]
kelp["order_flow_imbalance"] = (kelp["bid_volume_1"] - kelp["ask_volume_1"]) / (
    kelp["bid_volume_1"] + kelp["ask_volume_1"]
)

# Drop any rows with NaNs
kelp.dropna(subset=["mid_price", "bid_ask_spread", "order_flow_imbalance"], inplace=True)

# Step 2: Extract features
mid_prices_test = kelp["mid_price"].values
spread_test = kelp["bid_ask_spread"].values
imbalance_test = kelp["order_flow_imbalance"].values

# Step 3: Generate predictions
preds = []
actuals = []
errors = []
timestamps = []

for t in range(4, len(mid_prices_test) - 1):
    input_features = list(mid_prices_test[t - 4:t + 1]) + [spread_test[t], imbalance_test[t]]
    predicted = np.dot(coeffs, input_features) + intercept
    actual = mid_prices_test[t + 1]

    preds.append(predicted)
    actuals.append(actual)
    errors.append(predicted - actual)
    timestamps.append(kelp.iloc[t + 1]["timestamp"])  # timestamp of the predicted point

# Step 4: Create a result DataFrame
results_df = pd.DataFrame({
    "timestamp": timestamps,
    "predicted": preds,
    "actual": actuals,
    "error": errors
})

# Optional: Show sample predictions
print(results_df.head())

# Optional: Evaluate RMSE on test data
from sklearn.metrics import mean_squared_error
rmse_test = np.sqrt(mean_squared_error(results_df["actual"], results_df["predicted"]))
print(f"Test RMSE: {rmse_test:.4f}")


   timestamp    predicted  actual     error
0        200  2031.921747  2033.5 -1.578253
1        300  2032.187627  2031.5  0.687627
2        300  2032.224733  2034.5 -2.275267
3        400  2033.063299  2030.5  2.563299
4        400  2032.008850  2033.5 -1.491150
Test RMSE: 2.9909


#RETRAIN ON NEW DATA

In [63]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the three datasets
file1 = pd.read_csv("../data_1/prices_round_1_day_-2.csv", delimiter=";")
file2 = pd.read_csv("../data_1/prices_round_1_day_-1.csv", delimiter=";")
file3 = pd.read_csv("../data_1/prices_round_1_day_0.csv", delimiter=";")
    
file4 = pd.read_csv("../data_1/testing/testing_1.csv", delimiter=";")
file5 = pd.read_csv( "../data_1/testing/testing_2.csv", delimiter=";")

# Combine all datasets into one
prices = pd.concat([file1, file2, file3, file4, file5], ignore_index=True)

# Filter for KELP product
kelp_prices = prices[prices["product"] == "KELP"].copy()

# Compute mid-price
kelp_prices["mid_price"] = (kelp_prices["bid_price_1"] + kelp_prices["ask_price_1"]) / 2

# Create order flow signals
kelp_prices["bid_ask_spread"] = kelp_prices["ask_price_1"] - kelp_prices["bid_price_1"]
kelp_prices["order_flow_imbalance"] = (kelp_prices["bid_volume_1"] - kelp_prices["ask_volume_1"]) / (
    kelp_prices["bid_volume_1"] + kelp_prices["ask_volume_1"]
)

# Drop rows with NaNs
kelp_prices.dropna(subset=["mid_price", "bid_ask_spread", "order_flow_imbalance"], inplace=True)

# Function to create lagged dataset with target P(t+1)
def create_lagged_features_target_next(series, spread_imbalance, lags=5):
    X, y = [], []
    for i in range(lags, len(series) - 1):
        features = list(series[i - lags:i]) + list(spread_imbalance[i])  # P(t-4) to P(t) + features at t
        X.append(features)
        y.append(series[i + 1])  # target = P(t+1)
    return np.array(X), np.array(y)

# Extract mid-prices and signals
mid_prices = kelp_prices["mid_price"].values
spread_imbalance = kelp_prices[["bid_ask_spread", "order_flow_imbalance"]].values

# Create feature-target dataset using last 5 prices (t-4 to t) to predict t+1
X, y = create_lagged_features_target_next(mid_prices, spread_imbalance, lags=5)

# Fit model and evaluate
model = LinearRegression().fit(X, y)
predictions = model.predict(X)
rmse = np.sqrt(mean_squared_error(y, predictions))

# Extract coefficients and intercept
coefficients = model.coef_
intercept = model.intercept_

(coefficients, intercept, rmse)


(array([ 0.15035997,  0.15469056,  0.17986631,  0.21411357,  0.29993668,
        -0.00362124, -0.01229463]),
 np.float64(2.104067415856889),
 np.float64(0.6583568158658357))