# Combined Stock Training Notebook
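This notebook loads multiple stock CSVs, combines them into one frame tagged by ticker, computes Inside Day labels per ticker (a day whose High is below the previous day's High and whose Low is above the previous day's Low), trains a random-forest classifier on the engineered features, and evaluates it on a held-out split and on a separately loaded ticker.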

In [173]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
import os

# Path to your folder containing stock CSVs
DATA_PATH = "stocks/*.csv"

# Only the first 1/SUBSET_DIVISOR of the files is loaded to keep the run small.
SUBSET_DIVISOR = 50

# glob returns files in arbitrary (filesystem) order; sort so the selected
# subset is the same on every machine and every run.
files = sorted(glob.glob(DATA_PATH))
print("Found files:", files)

if not files:
    raise FileNotFoundError(f"No CSV files match {DATA_PATH!r}")

# Always keep at least one file: with fewer than SUBSET_DIVISOR files the
# integer division is 0, which would leave nothing to concatenate.
subset_size = max(1, len(files) // SUBSET_DIVISOR)
files = files[:subset_size]

# Load and combine; the ticker is the file name without its extension.
df_list = [
    pd.read_csv(path).assign(Ticker=os.path.splitext(os.path.basename(path))[0])
    for path in files
]

data = pd.concat(df_list, ignore_index=True)
print("Combined shape:", data.shape)
data.head()


Found files: ['stocks\\A.csv', 'stocks\\AA.csv', 'stocks\\AACG.csv', 'stocks\\AAL.csv', 'stocks\\AAMC.csv', 'stocks\\AAME.csv', 'stocks\\AAN.csv', 'stocks\\AAOI.csv', 'stocks\\AAON.csv', 'stocks\\AAP.csv', 'stocks\\AAPL.csv', 'stocks\\AAT.csv', 'stocks\\AAU.csv', 'stocks\\AAWW.csv', 'stocks\\AAXN.csv', 'stocks\\AB.csv', 'stocks\\ABB.csv', 'stocks\\ABBV.csv', 'stocks\\ABC.csv', 'stocks\\ABCB.csv', 'stocks\\ABEO.csv', 'stocks\\ABEV.csv', 'stocks\\ABG.csv', 'stocks\\ABIO.csv', 'stocks\\ABM.csv', 'stocks\\ABMD.csv', 'stocks\\ABR.csv', 'stocks\\ABT.csv', 'stocks\\ABTX.csv', 'stocks\\ABUS.csv', 'stocks\\AC.csv', 'stocks\\ACA.csv', 'stocks\\ACAD.csv', 'stocks\\ACAM.csv', 'stocks\\ACAMU.csv', 'stocks\\ACB.csv', 'stocks\\ACBI.csv', 'stocks\\ACC.csv', 'stocks\\ACCO.csv', 'stocks\\ACEL.csv', 'stocks\\ACER.csv', 'stocks\\ACGL.csv', 'stocks\\ACGLO.csv', 'stocks\\ACGLP.csv', 'stocks\\ACH.csv', 'stocks\\ACHC.csv', 'stocks\\ACHV.csv', 'stocks\\ACIA.csv', 'stocks\\ACIU.csv', 'stocks\\ACIW.csv', 'stocks

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker
0,1999-11-18,32.546494,35.765381,28.612303,31.473534,27.068665,62546300.0,A
1,1999-11-19,30.71352,30.758226,28.478184,28.880543,24.838577,15234100.0,A
2,1999-11-22,29.551144,31.473534,28.657009,31.473534,27.068665,6577800.0,A
3,1999-11-23,30.400572,31.205294,28.612303,28.612303,24.60788,5975600.0,A
4,1999-11-24,28.701717,29.998211,28.612303,29.372318,25.261524,4843200.0,A


In [174]:
# Sort by ticker and date so that "previous row within a ticker" means
# "previous trading day of that ticker".
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values(['Ticker', 'Date']).reset_index(drop=True)

ticker_key = data['Ticker']


def _shift_within_ticker(series, periods=1):
    """Shift `series` by `periods` rows inside each ticker, never across tickers."""
    return series.groupby(ticker_key, sort=False).shift(periods)


# Row-wise quantities need no grouping; only the lagged/lead versions do.
day_range = data['High'] - data['Low']
day_volatility = day_range / data['Close']
prev_range = _shift_within_ticker(day_range)
prev_volatility = _shift_within_ticker(day_volatility)

# Inside Day: today's High is below and today's Low is above the previous
# day's. Comparisons against the missing first-day lag evaluate to False.
data['Inside Day'] = (
    (data['High'] < _shift_within_ticker(data['High']))
    & (data['Low'] > _shift_within_ticker(data['Low']))
)

data['range'] = day_range
data['prev_range'] = prev_range
data['compression'] = day_range / prev_range

# NOTE: range_shrink looks one day AHEAD (shift(-1)); it must not be used as a
# model input for same-day prediction.
data['range_shrink'] = (_shift_within_ticker(day_range, -1) < day_range).astype(int)

# Same result as Series.pct_change() with its historical default of
# forward-filling missing closes first, but spelled out explicitly so it does
# not depend on (or warn about) the deprecated implicit fill.
close_filled = data['Close'].groupby(ticker_key, sort=False).ffill()
data['pct_change'] = close_filled / _shift_within_ticker(close_filled) - 1

data['volatility'] = day_volatility
data['prev_volatility'] = prev_volatility
data['volatility_ratio'] = day_volatility / prev_volatility

# Drops the first row of every ticker (no previous day) and any row with
# missing inputs.
data = data.dropna().reset_index(drop=True)

# Boolean label -> 0/1 integer target.
data['Inside Day Final'] = data['Inside Day'].astype('int64')
data['Inside Day Final'].value_counts()


  data['Inside Day'] = data.groupby('Ticker').apply(
  data['range'] = data.groupby('Ticker').apply(
  data['prev_range'] = data.groupby('Ticker').apply(
  data['compression'] = data.groupby('Ticker').apply(
  data['range_shrink'] = data.groupby('Ticker').apply(
  lambda df: df['Close'].pct_change()
  data['pct_change'] = data.groupby('Ticker').apply(
  data['volatility'] = data.groupby('Ticker').apply(
  data['prev_volatility'] = data.groupby('Ticker').apply(
  data['volatility_ratio'] = data.groupby('Ticker').apply(
  data['Inside Day Final'] = data['Inside Day'].replace({True:1, False:0})


Inside Day Final
0    455559
1     59254
Name: count, dtype: int64

In [175]:
# Build the model inputs: raw OHLCV columns plus the engineered range /
# volatility features. Any NaN or +/-inf entry is replaced by 0.
feature_columns = [
    'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
    'range', 'prev_range', 'compression', 'pct_change',
    'volatility', 'volatility_ratio',
]
features = np.nan_to_num(
    data[feature_columns].to_numpy(),
    nan=0.0,
    posinf=0.0,
    neginf=0.0,
)

X = features
y = data['Inside Day Final'].to_numpy()


In [176]:
# Hold out a stratified 20% of the rows for testing (single split).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(sss.split(X, y))
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The oversampler is configured but currently NOT applied: the resampling call
# is disabled, so the model is fit on the original class balance.
ros = RandomOverSampler(sampling_strategy=0.44, random_state=42)
# X_train_resampled, y_train_resampled = ros.fit_resample(X_train_scaled, y_train)
X_train_resampled = X_train_scaled
y_train_resampled = y_train

# Fit the random forest.
rf = RandomForestClassifier(
    n_estimators=250,
    max_depth=11,
    random_state=42,
)
rf.fit(X_train_resampled, y_train_resampled)

# Report performance on the held-out rows.
y_pred = rf.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[89781  1331]
 [ 9907  1944]]
Accuracy: 0.8908539960956848
              precision    recall  f1-score   support

           0       0.90      0.99      0.94     91112
           1       0.59      0.16      0.26     11851

    accuracy                           0.89    102963
   macro avg       0.75      0.57      0.60    102963
weighted avg       0.87      0.89      0.86    102963



In [177]:
# Load a separate single-ticker dataset and build the same label and features
# for it, so it can be scored with the already-fitted scaler and model.
df = pd.read_csv("stocks/SAFM.csv")
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

# Inside Day: today's High is below and today's Low is above the previous
# day's. Comparisons against the missing first-day lag evaluate to False.
df['Inside Day'] = (df['High'] < df['High'].shift(1)) & (df['Low'] > df['Low'].shift(1))
# Boolean label -> 0/1 integer target (astype avoids the downcasting
# FutureWarning that .replace({True: 1, False: 0}) raises).
df['Inside Day Final'] = df['Inside Day'].astype('int64')

df['range'] = df['High'] - df['Low']
df['prev_range'] = df['range'].shift(1)
df['compression'] = df['range'] / df['prev_range']
# NOTE: range_shrink looks one day AHEAD (shift(-1)); it is not a model input.
df['range_shrink'] = (df['range'].shift(-1) < df['range']).astype(int)

# Same result as Series.pct_change() with its historical default of
# forward-filling missing closes first, written explicitly so it does not
# depend on the deprecated implicit fill.
close_filled = df['Close'].ffill()
df['pct_change'] = close_filled / close_filled.shift(1) - 1

df['volatility'] = df['range'] / df['Close']
df['prev_volatility'] = df['volatility'].shift(1)
df['volatility_ratio'] = df['volatility'] / df['prev_volatility']

# Drops the first row (no previous day) and any row with missing inputs.
df = df.dropna().reset_index(drop=True)

# Column order must match the matrix the scaler and model were fitted on.
feature_columns_new = [
    'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
    'range', 'prev_range', 'compression', 'pct_change',
    'volatility', 'volatility_ratio',
]
# Replace any NaN or +/-inf (e.g. division by a zero previous range) with 0.
features_new = np.nan_to_num(
    df[feature_columns_new].to_numpy(dtype=float),
    nan=0.0,
    posinf=0.0,
    neginf=0.0,
)
target_new = df["Inside Day Final"].values

# Reuse the scaler fitted on the training split; do not refit on new data.
X_test_scaled_new = scaler.transform(features_new)

  df['Inside Day Final'] = df['Inside Day'].replace({True:1, False:0})


In [178]:
# Predict Inside Day labels for the separately loaded dataset using the
# already-fitted random forest `rf` and the scaled features from the cell above.
random_forest_test_predictions_new = rf.predict(X_test_scaled_new)

In [179]:
# Compare the model's predictions on the separately loaded dataset with its
# true labels: confusion matrix, accuracy, and per-class report.
truth_and_predictions = (target_new, random_forest_test_predictions_new)

conf_matrix = confusion_matrix(*truth_and_predictions)
print(conf_matrix)
print("Accuracy:", accuracy_score(*truth_and_predictions))
print(classification_report(*truth_and_predictions))

[[7178  122]
 [ 729  173]]
Accuracy: 0.896244818336991
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7300
           1       0.59      0.19      0.29       902

    accuracy                           0.90      8202
   macro avg       0.75      0.59      0.62      8202
weighted avg       0.87      0.90      0.87      8202

