In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv("../data/processed/all_features.csv", index_col=0)

In [3]:
df = pd.read_pickle("../data/processed/finbert_probs_only_qa.pkl")

In [4]:
merged_df = pd.merge(df, data, how='left', left_index=True, right_index=True)

In [5]:
merged_df.columns

Index(['positive', 'negative', 'neutral', 'date', 'exchange', 'q', 'ticker',
       'transcript', 'year', 'time', 'change_sp500_1week',
       'change_sp500_1month', 'change_sp500_3month', 'interest_rate',
       'change_interest_rate_3month', 'change_during_news', 'change_day_after',
       'change_week_after', 'change_month_after', 'past_change_1week',
       'past_change_1month', 'past_change_3month', 'processed_transcript'],
      dtype='object')

In [6]:
features_with_sentiment = ['positive','negative','neutral','change_sp500_1week',
       'change_sp500_1month', 'change_sp500_3month', 'interest_rate',
       'change_interest_rate_3month', 'past_change_1week',
       'past_change_1month', 'past_change_3month']

features_without_sentiment = ['change_sp500_1week',
       'change_sp500_1month', 'change_sp500_3month', 'interest_rate',
       'change_interest_rate_3month', 'past_change_1week',
       'past_change_1month', 'past_change_3month']

In [7]:
merged_df.index.name = 'Index'
merged_df = merged_df.sort_values(['date', 'Index'])

In [8]:
param_grid = {
    'max_depth': [2, 3, 4],
    'learning_rate': [0.001, 0.01, 0.05],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.9]
}

In [9]:
def correct_direction(y_test, predictions):
    same_sign_count = sum((y_test >= 0) == (predictions >= 0))
    total_samples = len(y_test)
    percentage_same_sign = (same_sign_count / total_samples) * 100
    return percentage_same_sign


def sum_returns_strat(y_test, predictions, buy_threshold=0.00, sell_threshold=-0.00):
    money_for_each_bet = 1
    money = 0
    nb_bets = ((predictions > buy_threshold) | (predictions < sell_threshold)).sum()
    for nb, p in enumerate(predictions):
        if p > buy_threshold:
            money += y_test.iloc[nb] * money_for_each_bet
        if p < sell_threshold:
            money -= min(y_test.iloc[nb] * money_for_each_bet, 1)
    return money / nb_bets if nb_bets > 0 else 0

def prop_sum_returns_strat(y_test, predictions, buy_threshold=0.00, sell_threshold=-0.00):
    
    total_money = len(predictions)
    sum_pred = np.sum(np.abs(predictions))
    nb_bets = ((predictions > buy_threshold) | (predictions < sell_threshold)).sum()
    money = 0
    for nb, p in enumerate(predictions):
        money_for_the_bet = (np.abs(predictions[nb]) / sum_pred) * total_money
        if p > buy_threshold:
            money += y_test.iloc[nb] * money_for_the_bet
        if p < sell_threshold:
            money -= min(y_test.iloc[nb] * money_for_the_bet, money_for_the_bet)
    return  money / nb_bets if nb_bets > 0 else 0

In [10]:
def best_sent_strat(y_test, X_test, predictions, percentile=5):
    # Calculate the score difference
    score_diff = X_test['positive'] - X_test['negative']
    
    # Determine the 5th and 95th percentiles
    lower_bound = np.percentile(score_diff, percentile)
    upper_bound = np.percentile(score_diff, 100-percentile)
    
    # Initialize the variables
    money_for_each_bet = 1
    money = 0
    nb_bets = 0
    
    for nb, p in enumerate(predictions):
        # Check if the current score difference is within the desired percentiles
        if score_diff.iloc[nb] > lower_bound and score_diff.iloc[nb] < upper_bound:
            # Continue if not within the 5th or 95th percentile bounds
            continue
        
        # Trade if the prediction is above the buy threshold
        if score_diff.iloc[nb] > upper_bound:
            money += y_test.iloc[nb] * money_for_each_bet
            nb_bets += 1
        
        # Trade if the prediction is below the sell threshold
        if score_diff.iloc[nb] < lower_bound:
            money -= min(y_test.iloc[nb] * money_for_each_bet, 1)
            nb_bets += 1
    
    # Return the average money per bet or 0 if no bets were made
    return money / nb_bets if nb_bets > 0 else 0

### During news with sentiments

In [12]:
X = merged_df[features_with_sentiment]
y = merged_df['change_during_news']

X_train, X_test_d_s, y_train, y_test_d_s = train_test_split(X, y, test_size=0.4, shuffle=False)


xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (MSE):", -best_score)

# Use the best parameters to train a new model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
best_predictions_d_s = best_model.predict(X_test_d_s)

# Evaluate the best model
best_mse = mean_squared_error(y_test_d_s, best_predictions_d_s)
print("Best Model Mean Squared Error:", best_mse)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.1s
[C

In [13]:
correct_direction(y_test_d_s, best_predictions_d_s)

54.6620621608288

In [14]:
sum_returns_strat(y_test_d_s, best_predictions_d_s)

0.007435692527310543

In [15]:
prop_sum_returns_strat(y_test_d_s, best_predictions_d_s)

0.008310513635956248

In [16]:
best_sent_strat(y_test_d_s, X_test_d_s, best_predictions_d_s)

-0.008528989456452684

In [17]:
# Calculate the MSE
mse = mean_squared_error(y_test_d_s, best_predictions_d_s)
np.sqrt(mse)

0.07882619474858593

In [18]:
basline = (y_test_d_s * 1).sum() / len(y_test_d_s)
basline

-0.0028414708954421788

## During news without sentiments

In [19]:
X = merged_df[features_without_sentiment]
y = merged_df['change_during_news']

X_train, X_test_d_w, y_train, y_test_d_w = train_test_split(X, y, test_size=0.4, shuffle=False)


xgb_model = xgb.XGBRegressor(objective='reg:squarederror')


grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (MSE):", -best_score)

# Use the best parameters to train a new model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
best_predictions_d_w = best_model.predict(X_test_d_w)

# Evaluate the best model
best_mse = mean_squared_error(y_test_d_w, best_predictions_d_w)
print("Best Model Mean Squared Error:", best_mse)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   1.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.1s
[C

In [20]:
correct_direction(y_test_d_w, best_predictions_d_w)

51.57868771583621

In [21]:
sum_returns_strat(y_test_d_w, best_predictions_d_w)

-0.002995722736827787

In [22]:
prop_sum_returns_strat(y_test_d_w, best_predictions_d_w)

-0.002171790556735618

In [24]:
# Calculate the MSE
mse = mean_squared_error(y_test_d_w, best_predictions_d_w)
np.sqrt(mse)

0.07909855631056645

In [25]:
basline = (y_test_d_w * 1).sum() / len(y_test_d_w)
basline

-0.0028414708954421788

## Day after with sentiments

In [26]:
X = merged_df[features_with_sentiment]
y = merged_df['change_day_after']

X_train, X_test_dds, y_train, y_test_dds = train_test_split(X, y, test_size=0.4, shuffle=False)

xgb_model = xgb.XGBRegressor(objective='reg:squarederror')


grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (MSE):", -best_score)

# Use the best parameters to train a new model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
best_predictions_dds = best_model.predict(X_test_dds)

# Evaluate the best model
best_mse = mean_squared_error(y_test_dds, best_predictions_dds)
print("Best Model Mean Squared Error:", best_mse)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[C

In [27]:
best_params

{'colsample_bytree': 0.9,
 'learning_rate': 0.01,
 'max_depth': 4,
 'n_estimators': 100,
 'subsample': 0.8}

In [28]:
correct_direction(y_test_dds, best_predictions_dds)

50.962012826837686

In [29]:
sum_returns_strat(y_test_dds, best_predictions_dds)

0.00020567313095137574

In [30]:
prop_sum_returns_strat(y_test_dds, best_predictions_dds)

0.004620336189894433

In [31]:
best_sent_strat(y_test_dds, X_test_dds, best_predictions_dds)

0.0011419368166128738

In [32]:
# Calculate the MSE
mse = mean_squared_error(y_test_dds, best_predictions_dds)
np.sqrt(mse)

0.07155967141134784

In [33]:
basline = (y_test_dds * 1).sum() / len(y_test_dds)
basline

0.001982538564973389

## Day after without sentiments

In [34]:
X = merged_df[features_without_sentiment]
y = merged_df['change_day_after']

X_train, X_test_ddw, y_train, y_test_ddw = train_test_split(X, y, test_size=0.4, shuffle=False)

xgb_model = xgb.XGBRegressor(objective='reg:squarederror')


grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (MSE):", -best_score)

# Use the best parameters to train a new model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
best_predictions_ddw = best_model.predict(X_test_ddw)

# Evaluate the best model
best_mse = mean_squared_error(y_test_ddw, best_predictions_ddw)
print("Best Model Mean Squared Error:", best_mse)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.1s
[C

In [35]:
correct_direction(y_test_ddw, best_predictions_ddw)

50.888011840157866

In [36]:
sum_returns_strat(y_test_ddw, best_predictions_ddw)

0.0010272805152618

In [37]:
prop_sum_returns_strat(y_test_ddw, best_predictions_ddw)

0.0029421991211713767

In [38]:
# Calculate the MSE
mse = mean_squared_error(y_test_ddw, best_predictions_ddw)
np.sqrt(mse)

0.07168551412271854

In [39]:
basline = (y_test_ddw * 1).sum() / len(y_test_ddw)
basline

0.001982538564973389

## Week after with sentiments

In [40]:
X = merged_df[features_with_sentiment]
y = merged_df['change_week_after']

X_train, X_test_ws, y_train, y_test_ws = train_test_split(X, y, test_size=0.4, shuffle=False)

xgb_model = xgb.XGBRegressor(objective='reg:squarederror')


grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (MSE):", -best_score)

# Use the best parameters to train a new model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
best_predictions_ws = best_model.predict(X_test_ws)

# Evaluate the best model
best_mse = mean_squared_error(y_test_ws, best_predictions_ws)
print("Best Model Mean Squared Error:", best_mse)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[C

In [41]:
correct_direction(y_test_ws, best_predictions_ws)

49.82733103108041

In [42]:
sum_returns_strat(y_test_ws, best_predictions_ws)

0.002793435034659102

In [43]:
prop_sum_returns_strat(y_test_ws, best_predictions_ws)

0.017580517007359735

In [44]:
best_sent_strat(y_test_ws, X_test_ws, best_predictions_ws)

0.0026420544040490573

In [45]:
# Calculate the MSE
mse = mean_squared_error(y_test_ws, best_predictions_ws)
np.sqrt(mse)

0.11215543943798936

In [46]:
basline = (y_test_ws * 1).sum() / len(y_test_ws)
basline

0.004382150000065283

## Week after without sentiments

In [47]:
X = merged_df[features_without_sentiment]
y = merged_df['change_week_after']

X_train, X_test_ww, y_train, y_test_ww = train_test_split(X, y, test_size=0.4, shuffle=False)


xgb_model = xgb.XGBRegressor(objective='reg:squarederror')


grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (MSE):", -best_score)

# Use the best parameters to train a new model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
best_predictions_ww = best_model.predict(X_test_ww)

# Evaluate the best model
best_mse = mean_squared_error(y_test_ww, best_predictions_ww)
print("Best Model Mean Squared Error:", best_mse)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[C

In [48]:
correct_direction(y_test_ww, best_predictions_ww)

52.71336951159349

In [49]:
sum_returns_strat(y_test_ww, best_predictions_ww)

0.007991287601031046

In [50]:
prop_sum_returns_strat(y_test_ww, best_predictions_ww)

0.017461006689534115

In [51]:
# Calculate the MSE
mse = mean_squared_error(y_test_ww, best_predictions_ww)
np.sqrt(mse)

0.11215370336613269

In [52]:
basline = (y_test_ww * 1).sum() / len(y_test_ww)
basline

0.004382150000065283

## Month after with sentiments

In [53]:
X = merged_df[features_with_sentiment]
y = merged_df['change_month_after']

X_train, X_test_ms, y_train, y_test_ms = train_test_split(X, y, test_size=0.4, shuffle=False)

xgb_model = xgb.XGBRegressor(objective='reg:squarederror')


grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (MSE):", -best_score)

# Use the best parameters to train a new model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
best_predictions_ms = best_model.predict(X_test_ms)

# Evaluate the best model
best_mse = mean_squared_error(y_test_ms, best_predictions_ms)
print("Best Model Mean Squared Error:", best_mse)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[C

In [54]:
correct_direction(y_test_ms, best_predictions_ms)

41.267883571780956

In [55]:
sum_returns_strat(y_test_ms, best_predictions_ms)

-0.02515117033299294

In [56]:
prop_sum_returns_strat(y_test_ms, best_predictions_ms)

-0.02490535588426038

In [57]:
best_sent_strat(y_test_ms, X_test_ms, best_predictions_ms)

0.008701094084890998

In [58]:
# Calculate the MSE
mse = mean_squared_error(y_test_ms, best_predictions_ms)
np.sqrt(mse)

0.16588311580341317

In [59]:
basline = (y_test_ms * 1).sum() / len(y_test_ms)
basline

-0.025151170332992905

## Month after without sentiments

In [60]:
X = merged_df[features_without_sentiment]
y = merged_df['change_month_after']

X_train, X_test_mw, y_train, y_test_mw = train_test_split(X, y, test_size=0.4, shuffle=False)

xgb_model = xgb.XGBRegressor(objective='reg:squarederror')


grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (MSE):", -best_score)

# Use the best parameters to train a new model
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
best_predictions_mw = best_model.predict(X_test_mw)

# Evaluate the best model
best_mse = mean_squared_error(y_test_mw, best_predictions_mw)
print("Best Model Mean Squared Error:", best_mse)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.9, learning_rate=0.001, max_depth=2, n_estimators=100, subsample=1.0; total time=   0.0s
[C

In [61]:
correct_direction(y_test_mw, best_predictions_mw)

41.267883571780956

In [62]:
sum_returns_strat(y_test_mw, best_predictions_mw)

-0.02515117033299294

In [63]:
prop_sum_returns_strat(y_test_mw, best_predictions_mw)

-0.024911900006503097

In [64]:
# Calculate the MSE
mse = mean_squared_error(y_test_mw, best_predictions_mw)
np.sqrt(mse)

0.1658660867935694

In [65]:
basline = (y_test_mw * 1).sum() / len(y_test_mw)
basline

-0.025151170332992905