In [10]:
import pandas as pd

In [None]:
# 1. LOAD CSV AND PREPARE DATAFRAME
consumer_stocks = pd.read_csv("consumer_supply_chain_stocks.csv")
consumer_stocks.head(3)

Unnamed: 0,Date,Close_COST,Close_KO,Close_PEP,Close_PG,Close_WMT,High_COST,High_KO,High_PEP,High_PG,...,Close_WMT_lag5,Close_WMT_daily_return_lag1,Close_WMT_daily_return_lag3,Close_WMT_daily_return_lag5,WMT_RSI14_lag1,WMT_RSI14_lag3,WMT_RSI14_lag5,WMT_Volume_MA_Ratio_lag1,WMT_Volume_MA_Ratio_lag3,WMT_Volume_MA_Ratio_lag5
0,2010-03-16,43.420586,16.792847,42.058556,40.499794,13.515332,43.456228,16.846008,42.20497,40.735667,...,12.976312,0.028201,0.00634,-0.001662,81.730175,62.542809,57.376452,1.734993,0.777034,0.926712
1,2010-03-17,43.719963,16.836634,42.376858,40.805786,13.498437,43.812633,16.883542,42.383225,40.818534,...,12.945661,0.010285,-0.001297,-0.002362,85.23603,59.505615,52.295963,1.301111,0.765837,0.928588
2,2010-03-18,43.677212,16.871029,42.345016,40.627296,13.503267,43.79127,16.899173,42.478702,40.882291,...,13.02773,-0.00125,0.028201,0.00634,82.979855,81.730175,62.542809,0.890571,1.734993,0.777034


In [None]:
# Set the 'Date' column as the index and convert it to datetime objects
consumer_stocks['Date'] = pd.to_datetime(consumer_stocks['Date'])
consumer_stocks.set_index('Date', inplace=True)

In [12]:
column_names = consumer_stocks.columns
wmt_columns = [col for col in column_names if 'WMT' in col]

print("Columns containing 'WMT':")
print(wmt_columns)

Columns containing 'WMT':
['Close_WMT', 'High_WMT', 'Low_WMT', 'Open_WMT', 'Volume_WMT', 'WMT_HighLow_Range', 'WMT_OpenClose_Range', 'WMT_Close_to_Range_Ratio', 'WMT_True_Range', 'WMT_ATR14', 'WMT_Volume_Daily_Change', 'WMT_Volume_MA_20D', 'WMT_Volume_MA_Ratio', 'WMT_OBV', 'WMT_RSI14', 'WMT_MACD_Line', 'WMT_MACD_Signal', 'WMT_MACD_Hist', 'WMT_SMA_10', 'WMT_SMA_20', 'WMT_SMA_50', 'WMT_EMA_12', 'WMT_EMA_26', 'WMT_BB_Middle20', 'WMT_BB_Upper20', 'WMT_BB_Lower20', 'WMT_BB_Bandwidth20', 'WMT_BB_PctB20', 'WMT_Stoch_K_14', 'WMT_Stoch_D_14_3', 'WMT_PlusDI_14', 'WMT_MinusDI_14', 'WMT_DX_14', 'WMT_ADX_14', 'Close_WMT_daily_return', 'Close_WMT_Next_Day_Return', 'Close_WMT_lag1', 'Close_WMT_lag3', 'Close_WMT_lag5', 'Close_WMT_daily_return_lag1', 'Close_WMT_daily_return_lag3', 'Close_WMT_daily_return_lag5', 'WMT_RSI14_lag1', 'WMT_RSI14_lag3', 'WMT_RSI14_lag5', 'WMT_Volume_MA_Ratio_lag1', 'WMT_Volume_MA_Ratio_lag3', 'WMT_Volume_MA_Ratio_lag5']


In [21]:
# 2. DEFINE TARGET VARIABLE
target_ticker = 'WMT'
target_col = f'Close_{target_ticker}_Next_Day_Return'
consumer_stocks[f'{target_ticker}_Target'] = (consumer_stocks[target_col] > 0).astype(int)


print(consumer_stocks[f'{target_ticker}_Target'].value_counts())

WMT_Target
1    1699
0    1523
Name: count, dtype: int64


In [None]:
# 3. HANDLE MISSING DATA
nan_per_column = consumer_stocks.isnull().sum()
print("Number of NaN values per column:")
print(nan_per_column)

# Count total NaN values in the entire DataFrame
total_nan = consumer_stocks.isnull().sum().sum()
print("\nTotal number of NaN values in the DataFrame:", total_nan)

#consumer_stocks.dropna(inplace=True)

Number of NaN values per column:
Date                        0
Close_COST                  0
Close_KO                    0
Close_PEP                   0
Close_PG                    0
                           ..
WMT_RSI14_lag5              0
WMT_Volume_MA_Ratio_lag1    0
WMT_Volume_MA_Ratio_lag3    0
WMT_Volume_MA_Ratio_lag5    0
WMT_Target                  0
Length: 242, dtype: int64

Total number of NaN values in the DataFrame: 0


In [None]:
# 4. SEPARATE FEATURES (X) AND TARGET (y)
columns_to_drop = [
    f'{target_ticker}_Target',
    f'Close_{target_ticker}_Next_Day_Return',
    f'Open_{target_ticker}',
    f'High_{target_ticker}',
    f'Low_{target_ticker}',
    f'Close_{target_ticker}'
]

# Ensure we only try to drop columns that actually exist
existing_columns_to_drop = [col for col in columns_to_drop if col in consumer_stocks.columns]

# Separate features (X) and target (y)
X = consumer_stocks.drop(columns=existing_columns_to_drop)
y = consumer_stocks[f'{target_ticker}_Target']

# Define the date for the split
split_date = '2021-01-01'

# Split the data chronologically
X_train = X.loc[:split_date]
y_train = y.loc[:split_date]

X_test = X.loc[split_date:]
y_test = y.loc[split_date:]

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

Training data shape: (2022, 236), (2022,)
Testing data shape: (1200, 236), (1200,)


In [18]:
from sklearn.ensemble import RandomForestClassifier

# Create an instance of the model
# The random_state ensures reproducibility
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Now, let's see how it performs on the unseen test data
# We'll use the model to make predictions on the test set
y_pred = model.predict(X_test)

# You can now evaluate the model's performance
from sklearn.metrics import classification_report

print("--- Model Evaluation on Test Data ---")
print(classification_report(y_test, y_pred))

ValueError: could not convert string to float: '2010-03-16'