In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from xgboost import XGBClassifier

: 

In [None]:
# Load the two source tables under distinct names. The cleaning and merge
# cells below refer to them as df1 and df2; binding both reads to `df`
# discarded the first file and left df1/df2 undefined.
df1 = pd.read_csv("refined_data.csv")
df2 = pd.read_csv("stock_data.csv")  # table whose Open/High/Low/Close/Volume columns are cleaned below

In [None]:
# Replace '_' with NaN in numeric columns, then convert to float
# NOTE(review): "Adj Closed" may be intended as "Adj Close" — confirm against the CSV header.
numeric_cols = ["Open", "High", "Low", "Close", "Adj Closed", "Volume"]
for col in numeric_cols:
    df2[col] = (
        df2[col]
        .replace('_', np.nan)            # '_' placeholder -> missing value
        .replace(',', '', regex=True)    # remove commas
        .astype(float)                   # convert to float
    )

# Handle missing values (forward fill).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill'); rebinding
# the name instead of inplace=True keeps the step explicit.
df2 = df2.ffill()

In [None]:
# Convert Date columns to datetime (handles mixed formats automatically)
df1['Date'] = pd.to_datetime(df1['Date'], format='mixed')
df2['Date'] = pd.to_datetime(df2['Date'], format='mixed')

# Merge datasets on Date, then put the rows in chronological order: the
# next-row target (shift(-1)) and the unshuffled train/test split further
# down are only meaningful when rows run oldest -> newest.
df = (
    pd.merge(df1, df2, on='Date', how='inner')
    .sort_values('Date', kind='stable')
    .reset_index(drop=True)
)

print(df.shape)
df.head()

In [None]:
# Handle missing values: forward fill.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is deprecated.
df = df.ffill()

In [None]:
# Create target variable (Uptrend = 1, Downtrend = 0)
next_close = df['Close_y'].shift(-1)
df['Target'] = (next_close > df['Close_y']).astype(int)

# Rows with no following close (the final row) cannot be labelled: comparing
# against NaN is False, which would silently mark them as a downtrend.
# Note: this trims df, so re-run from the merge cell rather than repeating
# this cell on its own.
df = df[next_close.notna()].reset_index(drop=True)

In [None]:
# Select feature columns: everything except the date and the label.
# The label has to be excluded by its exact name ('Target'); a misspelled
# entry leaves it in the feature matrix and the models simply read the answer.
features = [col for col in df.columns if col not in ['Date', 'Target']]
X = df[features].values  # upper-case name is what the scaling cell consumes
x = X  # lower-case alias kept for any cell that still uses the old name
y = df['Target'].values

In [None]:
# Scale features.
# Fit the scaler on the training rows only, so the min/max of the held-out
# rows do not leak into training. The row count mirrors what
# train_test_split(test_size=0.2, shuffle=False) does: the last
# ceil(0.2 * n) rows are the test set. Keep this fraction in sync with the
# split cell. Scaled test values may therefore fall slightly outside [0, 1].
test_fraction = 0.2
n_train_rows = len(X) - int(np.ceil(test_fraction * len(X)))

scaler = MinMaxScaler()
scaler.fit(X[:n_train_rows])
X_scaled = scaler.transform(X)

In [None]:
# Hold out the final 20% of rows as the test set; shuffle=False keeps row order intact
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# LSTM expects 3D input (samples, timesteps, features)
timesteps = 10


def create_sequences(X, y, timesteps):
    """Cut X into overlapping windows and pair each with the label that follows it.

    Window k is ``X[k:k + timesteps]`` and its label is ``y[k + timesteps]``;
    one pair is produced for every k in ``range(len(X) - timesteps)``.

    Args:
        X: Sequence of feature rows (indexable and sliceable).
        y: Sequence of labels, indexed in step with X.
        timesteps: Number of consecutive rows in each window.

    Returns:
        Tuple ``(windows, labels)`` of NumPy arrays.
    """
    n_windows = len(X) - timesteps
    windows = [X[start:start + timesteps] for start in range(n_windows)]
    labels = [y[start + timesteps] for start in range(n_windows)]
    return np.array(windows), np.array(labels)

In [None]:
# Window the train and test splits separately, using the shared window length
X_train_seq, y_train_seq = create_sequences(X=X_train, y=y_train, timesteps=timesteps)
X_test_seq, y_test_seq = create_sequences(X=X_test, y=y_test, timesteps=timesteps)

In [None]:
# Build the LSTM Model
# Per-sample input shape is taken from the windowed training data: (timesteps, features)
sequence_shape = (X_train_seq.shape[1], X_train_seq.shape[2])

lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=sequence_shape, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the binary target

In [None]:
# Binary cross-entropy loss, Adam with learning rate 0.001, accuracy as the tracked metric
lstm_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 40 epochs in batches of 32; the test sequences are passed as validation data
history = lstm_model.fit(
    x=X_train_seq,
    y=y_train_seq,
    batch_size=32,
    epochs=40,
    verbose=1,
    validation_data=(X_test_seq, y_test_seq),
)

In [None]:
# Predict with LSTM: threshold the sigmoid outputs at 0.5 and flatten to 1-D labels
lstm_probabilities = lstm_model.predict(X_test_seq)
y_pred_lstm = (lstm_probabilities > 0.5).astype(int).flatten()

In [None]:
# Train Random Forest (fit returns the estimator, so construction and fitting chain)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:
# Train Gradient Boosting
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
).fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

In [None]:
# Train XGBoost
# The class imported from xgboost is XGBClassifier, and the held-out features
# produced by the split are named X_test (upper-case X).
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    eval_metric='logloss',
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

In [None]:
# Ensemble prediction (majority voting)
# The LSTM only predicts for test rows that have a full window of history, so
# y_pred_lstm is shorter than y_test and its k-th entry belongs to test row
# k + lstm_offset. Rows before that offset are decided by the three
# tree-based models alone.
lstm_offset = len(y_test) - len(y_pred_lstm)

final_preds = []
for i in range(len(y_test)):
    votes = [y_pred_rf[i], y_pred_gb[i], y_pred_xgb[i]]
    if i >= lstm_offset:
        votes.append(y_pred_lstm[i - lstm_offset])
    # Predict 1 when at least half of the voters say 1 (2 of 3, or 2 of 4).
    final_preds.append(int(2 * sum(votes) >= len(votes)))
    

In [None]:
# Evaluate Models
def evaluate_model(y_true, y_pred, name):
    """Compute evaluation metrics for one model's predictions and print a summary line.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, compared element-wise with y_true.
        name: Text placed at the start of the printed line.
    """
    # RMSE and MAE treat the labels as plain numbers; accuracy and F1 are the
    # classification metrics.
    rms = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # NOTE(review): only RMSE appears in this message; mae, acc and f1 are not
    # reported in the lines visible here — confirm whether more output follows.
    print(f"{name} -> RMSE: {rms:.4f}, ")