In [None]:
from stock_features import prepare_data_for_ml, create_target_variable
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.feature_selection import SelectFromModel

import warnings
# Installs a blanket 'ignore' filter: warnings raised by later cells are not shown.
# NOTE(review): this also hides deprecation/convergence warnings from the ML
# libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [29]:
# 1. Pipeline configuration: tickers, date range and the CSV the engineered
#    frame is written to.
tickers_list = ['PG', 'KO', 'PEP', 'WMT', 'COST', '^GSPC']
start_date_str = '1986-01-01'
end_date_str = '2023-01-01'
output_filename = "consumer_stocks_final_engineered.csv"

pipeline_kwargs = dict(
    tickers=tickers_list,
    start_date=start_date_str,
    end_date=end_date_str,
    output_engineered_csv=output_filename,
)

# One call runs the whole fetch + feature-engineering pipeline.
final_engineered_df = prepare_data_for_ml(**pipeline_kwargs)

# 2. Build the target variable before modeling.
#    create_target_variable arguments:
#      ticker (str):      the stock ticker for the target.
#      window (int):      number of days for the return period
#                         (e.g., 1 for next day, 5 for a week).
#      threshold (float): the minimum return to be considered "up".
target_ticker = 'WMT'

# Work on a copy so the engineered frame itself is passed along unmodified.
engineered_copy = final_engineered_df.copy()
data_target = create_target_variable(engineered_copy, target_ticker, window=5, threshold=0.01)



--- Starting Data Preparation Pipeline ---
Fetching data for 6 tickers from 1986-01-01 to 2023-01-01...

Discovered stock prefixes: ['COST', 'KO', 'PEP', 'PG', 'WMT', '^GSPC']

Processing features for stock prefix: COST
  - Calculating ATR for COST (window=14)...
  - Calculating RSI for COST (window=14)...
  - Calculating MACD for COST (fast=12, slow=26, signal=9)...

Processing features for stock prefix: KO
  - Calculating ATR for KO (window=14)...
  - Calculating RSI for KO (window=14)...
  - Calculating MACD for KO (fast=12, slow=26, signal=9)...

Processing features for stock prefix: PEP
  - Calculating ATR for PEP (window=14)...
  - Calculating RSI for PEP (window=14)...
  - Calculating MACD for PEP (fast=12, slow=26, signal=9)...

Processing features for stock prefix: PG
  - Calculating ATR for PG (window=14)...
  - Calculating RSI for PG (window=14)...
  - Calculating MACD for PG (fast=12, slow=26, signal=9)...

Processing features for stock prefix: WMT
  - Calculating ATR for 

In [25]:
# 3. HANDLE MISSING DATA
# Audit the engineered feature frame for missing values before modeling.
# The audit reads `final_engineered_df` (the frame produced by the pipeline
# cell) rather than an undefined `df`, so the cell works on a fresh kernel.
nan_per_column = final_engineered_df.isnull().sum()
print("Number of NaN values per column:")
print(nan_per_column)

# Count total NaN values in the entire DataFrame (reuse the per-column counts
# instead of scanning the frame a second time).
total_nan = nan_per_column.sum()
print("\nTotal number of NaN values in the DataFrame:", total_nan)

# Nothing is dropped here: this cell only reports NaN counts. If rows ever need
# to be removed, assign the result to a new name instead of mutating in place.

Number of NaN values per column:
Close_COST                    0
Close_KO                      0
Close_PEP                     0
Close_PG                      0
Close_WMT                     0
                             ..
^GSPC_RSI14_lag3              0
^GSPC_RSI14_lag5              0
^GSPC_Volume_MA_Ratio_lag1    0
^GSPC_Volume_MA_Ratio_lag3    0
^GSPC_Volume_MA_Ratio_lag5    0
Length: 290, dtype: int64

Total number of NaN values in the DataFrame: 0


In [None]:
# 4. SEPARATE FEATURES (X) AND TARGET (y)
# Rebuild the target from the engineered frame so this cell is idempotent.
target_ticker = 'WMT'
data_target = create_target_variable(final_engineered_df.copy(), target_ticker, window=5, threshold=0.01)

# Columns removed from the feature matrix: the label itself, the return column
# the label is derived from, and the target ticker's same-day OHLC prices.
target_col_name = f'{target_ticker}_Target'
target_return_col_name = [col for col in data_target.columns if col.startswith(f'{target_ticker}_target_return_')][0]
columns_to_drop = [target_col_name, target_return_col_name, f'Open_{target_ticker}', f'High_{target_ticker}', f'Low_{target_ticker}', f'Close_{target_ticker}']
X = data_target.drop(columns=columns_to_drop)
y = data_target[target_col_name]

# Chronological train/test split.
# Label slices such as .loc[:'2021-01-01'] and .loc['2021-01-01':] include both
# endpoints, so a row dated exactly on the split day would end up in BOTH sets.
# A single boolean mask guarantees the two sets are disjoint.
split_date = '2021-01-01'
is_train = X.index < split_date

X_train = X.loc[is_train].copy()
y_train = y.loc[is_train].copy()
X_test = X.loc[~is_train].copy()
y_test = y.loc[~is_train].copy()

# NOTE(review): with window=5 the labels of the last few training rows
# presumably depend on prices after the split date — confirm against
# create_target_variable and consider purging those rows.
assert len(X_train) > 0 and len(X_test) > 0, "train/test split produced an empty set"

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

Training data shape: (8644, 286), (8644,)
Testing data shape: (503, 286), (503,)


In [27]:
# Baseline model: a random forest with a fixed seed.
# RandomForestClassifier and classification_report are already imported in the
# notebook's import cell, so they are not re-imported here.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out period and report per-class metrics.
y_pred = model.predict(X_test)

print("--- Model Evaluation on Test Data ---")
print(classification_report(y_test, y_pred))

# Optional: rank features by the forest's importance scores, largest first.
print("\n--- Top 10 Feature Importances ---")
feature_importances = pd.Series(
    data=model.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)
print(feature_importances.iloc[:10].to_string())

--- Model Evaluation on Test Data ---
              precision    recall  f1-score   support

           0       0.67      0.67      0.67       317
           1       0.44      0.44      0.44       186

    accuracy                           0.58       503
   macro avg       0.55      0.55      0.55       503
weighted avg       0.58      0.58      0.58       503


--- Top 10 Feature Importances ---
COST_MACD_Signal          0.006768
^GSPC_MACD_Line           0.006529
WMT_BB_Bandwidth20        0.006481
WMT_vs_^GSPC_ATR_Ratio    0.006298
PG_MACD_Signal            0.005963
WMT_vs_PEP_CloseRatio     0.005767
WMT_ADX_14                0.005538
WMT_RSI14                 0.005367
^GSPC_PlusDI_14           0.005356
PEP_ADX_14                0.005336


In [None]:
# Keyword arguments shared by the selector model and the final model.
# NOTE(review): `neg_to_pos_ratio` and `best_params` are not defined in any cell
# visible above — confirm the cells that create them run before this one.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent xgboost
# releases — verify against the installed version.
xgb_kwargs = dict(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=neg_to_pos_ratio,
)

# --- 1. Fit a base model on every feature and use it to rank them ---
# best_params is unpacked separately so a key that collides with xgb_kwargs
# still raises, exactly as it would with the arguments written out inline.
base_xgboost = xgb.XGBClassifier(**xgb_kwargs, **best_params)
base_xgboost.fit(X_train, y_train)

# Keep only the features whose importance is above the median.
selector = SelectFromModel(base_xgboost, prefit=True, threshold='median')
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

n_original = X_train.shape[1]
n_selected = X_train_selected.shape[1]
print(f"Original number of features: {n_original}")
print(f"Number of features after selection: {n_selected}")

print("\n--- 2. Training the final model on the selected features ---")

# The final model uses the same configuration, trained on the reduced feature set.
final_xgboost_model = xgb.XGBClassifier(**xgb_kwargs, **best_params)
final_xgboost_model.fit(X_train_selected, y_train)

# Evaluate the final model on the test data.
y_pred_final = final_xgboost_model.predict(X_test_selected)
print("\n--- Final Model Evaluation on Test Data with Pruned Features ---")
print(classification_report(y_test, y_pred_final))

# Optional: map the selector's boolean mask back to column names so the
# importances of the final model can be labelled.
selected_features_mask = selector.get_support()
selected_feature_names = X_train.columns[selected_features_mask]

print("\n--- Top 10 Feature Importances (Final Model) ---")
final_feature_importances = pd.Series(
    final_xgboost_model.feature_importances_,
    index=selected_feature_names,
).sort_values(ascending=False)
print(final_feature_importances.iloc[:10].to_string())

Starting GridSearchCV fitting. This may take some time...
Fitting 5 folds for each of 81 candidates, totalling 405 fits

Best parameters found:  {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.9}
Best cross-validation accuracy: 0.47

--- Tuned XGBoost Model Evaluation on Test Data ---
              precision    recall  f1-score   support

           0       0.65      0.74      0.69       317
           1       0.41      0.31      0.35       186

    accuracy                           0.58       503
   macro avg       0.53      0.52      0.52       503
weighted avg       0.56      0.58      0.56       503


--- Top 10 Feature Importances (Tuned XGBoost) ---
Close_WMT_lag1            0.018809
^GSPC_OpenClose_Range     0.014019
WMT_BB_Lower20            0.012717
^GSPC_Stoch_K_14          0.011662
^GSPC_Stoch_D_14_3        0.011606
^GSPC_RSI14               0.010915
PG_ATR14                  0.010513
^GSPC_BB_PctB20           0.010307
COST_ATR14                0