In [1]:
# Stock Movement Prediction: Best Practices Implementation (Classification)

import pandas as pd
import numpy as np

# === 1. Load and Prepare Data ===
df = pd.read_csv("MSFT.csv")
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

# Binary Target: Predict if Close_t+1 > Close_t
df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)

# === 2. Feature Engineering ===
df['DailyReturn'] = (df['Close'] - df['Open']) / df['Open']
df['Volatility'] = (df['High'] - df['Low']) / df['Open']
df['Close_Open_Diff'] = df['Close'] - df['Open']

# Lag features
df['Close_t-1'] = df['Close'].shift(1)
df['Volume_t-1'] = df['Volume'].shift(1)
df['DailyReturn_t-1'] = df['DailyReturn'].shift(1)

# Rolling features
df['SMA_3'] = df['Close'].rolling(3).mean()
df['SMA_7'] = df['Close'].rolling(7).mean()
df['EMA_10'] = df['Close'].ewm(span=10, adjust=False).mean()
df['Momentum_3'] = df['Close'] - df['Close'].shift(3)

# RSI
change = df['Close'].diff()
gain = change.clip(lower=0).rolling(14).mean()
loss = -change.clip(upper=0).rolling(14).mean()
rs = gain / loss
df['RSI_14'] = 100 - (100 / (1 + rs))

# Drop NA
features = [
    'DailyReturn', 'Volatility', 'Close_Open_Diff', 'Close_t-1',
    'Volume_t-1', 'DailyReturn_t-1', 'SMA_3', 'SMA_7', 'EMA_10',
    'Momentum_3', 'RSI_14']
df = df.dropna(subset=features + ['Target']).reset_index(drop=True)

# === 3. Train/Test Split ===
from sklearn.model_selection import train_test_split
X = df[features]
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# === 4. Scaling ===
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === 5. Modeling with Best Practices ===
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import classification_report, roc_auc_score

base_models = [
    ('svm', SVC(kernel='rbf', C=1, gamma='scale', probability=True)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(50,), max_iter=500)),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5))
]

meta_model = LogisticRegression()
stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5, passthrough=True)
stack.fit(X_train_scaled, y_train)

# === 6. Evaluation ===
y_pred = stack.predict(X_test_scaled)
y_proba = stack.predict_proba(X_test_scaled)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


  df['Date'] = pd.to_datetime(df['Date'])


              precision    recall  f1-score   support

           0       0.49      0.83      0.62       859
           1       0.63      0.25      0.36       991

    accuracy                           0.52      1850
   macro avg       0.56      0.54      0.49      1850
weighted avg       0.56      0.52      0.48      1850

ROC AUC: 0.5457499333348214
