In [31]:
import numpy as np
import pandas as pd


from xgboost import XGBRegressor
from sklearn.linear_model import LassoCV
from sklearn.model_selection import TimeSeriesSplit

In [32]:
sent = pd.read_csv("sentiment.csv", index_col=0, parse_dates=True)
rec  = pd.read_csv("recommendation.csv", index_col=0, parse_dates=True)
price = pd.read_csv("price.csv", index_col=0, parse_dates=True)

sent, rec = sent.align(rec, join="inner", axis=0)
sent, price = sent.align(price, join="inner", axis=0)
rec = rec.loc[sent.index]

In [33]:
f_sent1 = sent.add_suffix("_s1")
f_sent3 = sent.rolling(3).mean().add_suffix("_s3")
f_sent7 = sent.rolling(7).mean().add_suffix("_s7")

f_rec1 = rec.add_suffix("_r1")
f_rec3 = rec.rolling(3).mean().add_suffix("_r3")

mom5  = price.pct_change(5).add_suffix("_mom5")
mom21 = price.pct_change(21).add_suffix("_mom21")
vol10 = price.pct_change().rolling(10).std().add_suffix("_vol10")

  mom5  = price.pct_change(5).add_suffix("_mom5")
  mom21 = price.pct_change(21).add_suffix("_mom21")
  vol10 = price.pct_change().rolling(10).std().add_suffix("_vol10")


In [34]:
features = pd.concat([
    f_sent1, f_sent3, f_sent7,
    f_rec1, f_rec3,
    mom5, mom21, vol10
], axis=1)

features = features.shift(1)
returns = price.pct_change().shift(-1)


  returns = price.pct_change().shift(-1)


In [35]:
X = features.stack().reset_index()
X.columns = ["date", "col", "value"]

X["ric"] = X["col"].str.split("_").str[0]
X["feature_name"] = X["col"].str.split("_").str[1]

X = X.drop(columns=["col"])

X = X.pivot_table(index=["date","ric"], 
                  columns="feature_name", 
                  values="value")

y = returns.stack().reset_index()
y.columns = ["date", "ric", "target"]
y = y.set_index(["date","ric"])

df = X.join(y).dropna()

X_mat = df.drop(columns=["target"])
y_vec = df["target"]

## LASSO REGRESSION

In [36]:
model = LassoCV(cv=5, n_alphas=100)
model.fit(X_mat, y_vec)

pred = model.predict(X_mat)
df["predicted_return"] = pred

market_ret = returns.mean(axis=1)
df["market"] = df.index.get_level_values("date").map(market_ret)
df["alpha"] = df["predicted_return"] - df["market"]

In [37]:
ic = np.corrcoef(df["predicted_return"], df["target"])[0,1]
print("IC:", ic)

IC: 0.007257679483169667


In [38]:
accuracy = ((df["predicted_return"] > 0) 
            == (df["target"] > 0)).mean()

print("Directional accuracy:", accuracy)


Directional accuracy: 0.5092920829304973


## XGBOOST

In [39]:
tscv = TimeSeriesSplit(n_splits=5)

preds = []
actuals = []

for train_idx, test_idx in tscv.split(X_mat):
    X_train, X_test = X_mat.iloc[train_idx], X_mat.iloc[test_idx]
    y_train, y_test = y_vec.iloc[train_idx], y_vec.iloc[test_idx]

    xgb = XGBRegressor(
        n_estimators=400,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror"
    )
    xgb.fit(X_train, y_train)
    preds.extend(xgb.predict(X_test))
    actuals.extend(y_test)

ic_oos = np.corrcoef(preds, actuals)[0,1]
print("IC out-of-sample:", ic_oos)

IC out-of-sample: -0.0055802279875674385
