# Preamble
___

In [None]:
import pandas as pd

import data_explor.rui.data_preambule as data_preambule
import strategies
from tool_kit import*

import sys
import runpy
from joblib import load, Parallel, delayed
from tqdm import tqdm


In [None]:
# parameters
# All tunable settings of the notebook live in this cell.

# Input parquet file handed to the preamble step, and the file that step is
# asked to write (it is read back further down with pd.read_parquet).
DATA_PATH = "../../data/high_10m.parquet"
OUTPUT_PATH = "../../data/processed_high_10m_subset.parquet"

# Options forwarded to data_preambule as --num_lags, --num_random_stocks
# and --seed respectively.
NUM_LAGS = 10
NUM_STOCKS = 10
SEED = 42

# Passed as `train_window` to the per-symbol regression helpers.
TRAIN_WINDOW = 50

# Ridge penalties, passed as `shrinkage_list` to the ridge helper and as
# --shrinkage_list to the strategy script.
SHRINKAGE_LIST = [
    0.0001,
    0.001,
    0.01,
    0.1,
    1,
    10,
    100,
]


In [None]:
# Emulate a command-line invocation of data_preambule.py: the option/value
# pairs below are flattened into sys.argv right after the script name.
# Every value is passed as a string, as it would be from a shell.
preambule_options = {
    '--data_path': DATA_PATH,
    '--num_lags': str(NUM_LAGS),
    '--num_random_stocks': str(NUM_STOCKS),
    '--output_path': OUTPUT_PATH,
    '--seed': str(SEED),
}
sys.argv = ['data_preambule.py'] + [
    token for option in preambule_options.items() for token in option
]

In [None]:
# Run the preamble entry point with no explicit arguments.
# NOTE(review): presumably main() parses the sys.argv prepared in the previous
# cell and writes the file at OUTPUT_PATH — confirm in data_preambule.
data_preambule.main()

In [None]:
# Define the vector of features: one lagged-return column per lag.
# The number of lags follows NUM_LAGS (the value passed as --num_lags to the
# preamble step) instead of a hard-coded 10, so the two cannot drift apart.
# NOTE(review): assumes the preamble names its columns return_lag_<i> — confirm.
features = [f"return_lag_{lag}" for lag in range(1, NUM_LAGS + 1)]

# Linear Regression
___

In [None]:
# Load dataset
# Read the parquet file located at OUTPUT_PATH (the same path that was passed
# as --output_path to the preamble step).
table2 = pd.read_parquet(OUTPUT_PATH)
# Last expression of the cell: the notebook displays the first two rows.
table2.head(2)

In [None]:
# One joblib task per symbol: each task receives the symbol, its rows and the
# shared feature list / training window. tqdm wraps the groupby iterator, so
# the bar advances as tasks are handed to joblib.
ols_tasks = (
    delayed(run_regression_for_symbol)(
        symbol,
        group,
        features=features,
        train_window=TRAIN_WINDOW,
    )
    for symbol, group in tqdm(table2.groupby("symbol"))
)
results_nested = Parallel(n_jobs=-1)(ols_tasks)

# Each task returns a collection of records; merge them into one flat list.
OLS_regression_results = []
for sublist in results_nested:
    OLS_regression_results.extend(sublist)

# Persist the flattened records as a parquet table (no index column).
ols_frame = pd.DataFrame(OLS_regression_results)
ols_frame.to_parquet('../../data/OLS_regression_results.parquet', index=False)

In [None]:
# Emulate a command-line call of the strategy script for the OLS model by
# rebuilding sys.argv from (option, value) pairs, then run its entry point.
# NOTE(review): '--plot' receives the string 'False'; if the option is parsed
# with type=bool, any non-empty string is truthy — confirm in strategies.
ols_cli_options = [
    ('--data_path', str(OUTPUT_PATH)),
    ('--model_path', '../../data/'),
    ('--model', 'ols'),
    ('--plot', str(False)),
]
sys.argv = ['regressions.py']
for option in ols_cli_options:
    sys.argv.extend(option)

strategies.main()

# Ridge regression (non-random)
___

In [None]:
# Fit the ridge helper once per symbol in parallel; every task gets the same
# feature list, training window and list of shrinkage values.
ridge_tasks = (
    delayed(run_ridge_for_symbol)(
        symbol,
        group,
        features=features,
        train_window=TRAIN_WINDOW,
        shrinkage_list=SHRINKAGE_LIST,
    )
    for symbol, group in tqdm(table2.groupby("symbol"))
)
results = Parallel(n_jobs=-1)(ridge_tasks)

# Merge the per-symbol collections of prediction records into one flat list.
ridge_regression_results = []
for sublist in results:
    ridge_regression_results.extend(sublist)

# Persist the flattened records as a parquet table (no index column).
ridge_frame = pd.DataFrame(ridge_regression_results)
ridge_frame.to_parquet('../../data/ridge_regression_results.parquet', index=False)

In [None]:
# Emulate a command-line call of the strategy script for the ridge model.
# The shrinkage values are passed as separate string tokens after
# --shrinkage_list, followed by the plot switch.
shrinkage_tokens = [str(value) for value in SHRINKAGE_LIST]

sys.argv = ['regressions.py']
sys.argv += ['--data_path', str(OUTPUT_PATH)]
sys.argv += ['--model_path', '../../data/']
sys.argv += ['--model', 'ridge']
sys.argv += ['--shrinkage_list'] + shrinkage_tokens
sys.argv += ['--plot', str(True)]

strategies.main()

# Ridge regression with random features
___

## Creating random features

In [None]:
# Pool of element-wise transforms from which the random non-linear features
# are drawn. The NumPy ufuncs are referenced directly; only the square needs
# a small wrapper.
functions = [
    np.sin,
    np.cos,
    np.tan,
    lambda values: values**2,
    np.abs,
    np.sign,
]

In [None]:
# Add row-wise standard deviation, skewness and kurtosis of the lagged returns.
# Work on a copy so table2 is left untouched until the augmented frame is ready.
df_copy = table2.copy()

# `lag_columns` was not defined anywhere in the visible notebook; build it here
# from NUM_LAGS so the cell does not depend on a name leaking from a star import.
# NOTE(review): assumes the lag columns are named return_lag_<i> — confirm.
lag_columns = [f"return_lag_{lag}" for lag in range(1, NUM_LAGS + 1)]
lagged_returns = df_copy[lag_columns]

# axis=1: each statistic is computed across the lag columns of a single row.
df_copy['lag_std'] = lagged_returns.std(axis=1)
df_copy['lag_skew'] = lagged_returns.skew(axis=1)
df_copy['lag_kurt'] = lagged_returns.kurt(axis=1)

# Features for regression: the lags followed by the three moment columns.
# Rebuilt from lag_columns (rather than appended to `features`) so re-running
# this cell does not add the moment columns a second time.
features = lag_columns + ['lag_std', 'lag_skew', 'lag_kurt']

In [None]:
# Build F synthetic columns: the first half are non-linear transforms of
# return_lag_1, the second half are Gaussian noise scaled by each row's lag_std.
F = 5000
split = F // 2

random_features = [f'feature_{feature}' for feature in range(F)]

# Seed a dedicated generator with the notebook-level SEED so the synthetic
# features (and everything fitted on them) are reproducible across runs,
# without touching NumPy's global random state.
rng = np.random.default_rng(SEED)

# Non-linear features: each one applies a randomly drawn entry of `functions`
# to the first lagged return. The column lookup is hoisted out of the loop.
base_returns = df_copy['return_lag_1']
function_picks = rng.integers(len(functions), size=split)
feature_dict = {
    name: functions[pick](base_returns)
    for name, pick in zip(random_features[:split], function_picks)
}

# Noise features: standard normal draws, one column per remaining feature,
# multiplied row by row by that row's lag_std.
noise_names = random_features[split:]
row_scale = df_copy['lag_std'].to_numpy()[:, None]
noise_matrix = rng.standard_normal((len(df_copy), len(noise_names))) * row_scale
noise_features = dict(zip(noise_names, noise_matrix.T))

all_features = {**feature_dict, **noise_features}

# Attach all new columns in a single concat. The explicit index keeps the raw
# noise arrays aligned with df_copy's rows whatever its index looks like.
df_copy = pd.concat(
    [df_copy, pd.DataFrame(all_features, index=df_copy.index)],
    axis=1,
)

features = features + random_features


In [None]:
# Make the augmented frame the working table used by the cells below, then
# remove the extra name so the frame is only referenced as table2.
table2 = df_copy
del df_copy

## Running the ridge regression

In [None]:
# Run the ridge helper in parallel over all symbols, now on the extended
# feature list. Fixes relative to the previous version of this cell:
#   * `tqdm` is imported as the callable itself (from tqdm import tqdm), so it
#     is called directly rather than as tqdm.tqdm;
#   * the grouping column is "symbol", as in the other regression cells and in
#     the set_index call below;
#   * the window and shrinkage values come from the TRAIN_WINDOW and
#     SHRINKAGE_LIST constants defined in the parameters cell.
results = Parallel(n_jobs=-1)(
    delayed(run_ridge_for_symbol)(
        symbol,
        group,
        features=features,
        train_window=TRAIN_WINDOW,
        shrinkage_list=SHRINKAGE_LIST,
    )
    for symbol, group in tqdm(table2.groupby("symbol"))
)

# Flatten the nested list of predictions
results = [item for sublist in results for item in sublist]

# Convert to DataFrame indexed by (symbol, timestamp)
df_predictions = pd.DataFrame(results).set_index(['symbol', 'timestamp'])

### Scores

In [None]:
# Goodness-of-fit metrics computed over every row of df_predictions.
# NOTE(review): df_predictions carries an 'alpha' column (it is pivoted on in
# the managed-returns cell), so these figures appear to pool all shrinkage
# values together — confirm this is intended.
y_true = df_predictions['y_true']
y_pred = df_predictions['y_pred']

r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"Overall R²: {r2:.4f}")
print(f"Overall MAE: {mae:.6f}")

### Managed returns

In [None]:
# Market benchmark: mean realised value (y_true) per timestamp.
# The index is reset first so that 'symbol' and 'timestamp' are plain columns.
df_predictions = df_predictions.reset_index()
df_market_returns = df_predictions.groupby('timestamp')['y_true'].mean()

# Strategy returns: one managed_return column per alpha for each
# (symbol, timestamp) pair, then averaged over symbols at each timestamp
# (level 1 of the row index is 'timestamp').
timing_columns = ['symbol', 'timestamp', 'alpha', 'y_pred', 'managed_return']
df_market_timing = df_predictions[timing_columns].pivot_table(
    values=['managed_return'],
    index=['symbol', 'timestamp'],
    columns=['alpha'],
    aggfunc="mean",
)
df_market_timing = df_market_timing.groupby(level=1).mean()

In [None]:
# Put the benchmark and the per-alpha strategy returns side by side, aligned
# on timestamp, with the benchmark as the first column.
return_tables = [df_market_returns, df_market_timing]
tmp = pd.concat(return_tables, axis=1)

# Divide every column by its own standard deviation so all series share the
# same scale before they are compared.
tmp = tmp.div(tmp.std())
sr = sharpe_ratio(tmp)

In [None]:
# Split the standardised table back into its two parts: every column after
# the first holds a managed-return series, the 'y_true' column is the benchmark.
managed_returns = tmp.iloc[:, 1:]
market_returns = tmp.loc[:, 'y_true']

In [None]:
# Plot cumulative sums of the standardised benchmark and managed returns.
# The inputs were divided by their standard deviation earlier, so the curves
# are on a common unit-variance scale.
plt.figure(figsize=(12, 6))
plt.plot(market_returns.cumsum(), label="Market returns", linestyle="--")

# Cumulate once instead of recomputing the whole table on every iteration.
cumulative_managed = managed_returns.cumsum()

# Iterate over SHRINKAGE_LIST (the constant defined in the parameters cell);
# the lowercase `shrinkage_list` used previously is not defined in this notebook.
for alpha in SHRINKAGE_LIST:
    plt.plot(
        cumulative_managed[('managed_return', alpha)],
        label=f"Managed returns (α={alpha})",
    )

plt.xlabel("Time")
plt.ylabel("Cumulative Return")
plt.title("Managed Strategy vs. Market Benchmark")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Show the value returned by sharpe_ratio(tmp), computed on the standardised
# benchmark and managed-return columns.
print(sr)