In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ta.momentum import RSIIndicator
from ta.trend import MACD, SMAIndicator
from ta.volatility import BollingerBands
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import datetime

In [None]:
# Analysis window (ISO-formatted date strings)
start_date = "2020-01-01"
end_date = "2025-05-01"

# Ticker of the index every stock is compared against
benchmark_symbol = "^GSPC"

# Sector name -> list of tickers. Only uncommented entries are active; the
# commented lists are alternative universes kept for reference.
sectors = {
    "Technology": ["AAPL"],
    # "Technology": ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META', 'TSLA', 'NFLX', 'NVDA', 'INTC', 'AMD'],
    # "Communication": ["GOOGL", "META", "NFLX", "TMUS", "DIS", "VZ", "T", "CHTR", "CMCSA", "WBD"],
    # "Health": ["JNJ", "PFE", "ABBV", "LLY", "MRK", "TMO", "BMY", "UNH", "ABT", "CVS"],
    # "Financials": ["JPM", "BAC", "WFC", "C", "GS", "MS", "AXP", "USB", "BK", "SCHW"],
    # "Defensive": ["PG", "KO", "PEP", "WMT", "COST", "CL", "MO", "KMB", "MDLZ", "KR"],
    # "Cyclical": ["AMZN", "HD", "LOW", "MCD", "NKE", "SBUX", "BKNG", "TGT", "EBAY", "GM"],
    # "Property": ["PLD", "AMT", "CCI", "EQIX", "DLR", "SPG", "O", "PSA", "VTR", "EXR"],
    # "Benchmark": [benchmark_symbol]
}

# One flat list with every active ticker, in sector order
all_symbols = [ticker for tickers in sectors.values() for ticker in tickers]

### Load from CSV

In [None]:
# Load dataset from CSV
import os

# Fail immediately when the data file is missing: continuing would only hit a
# NameError on `stock_df` a few lines further down.
if not os.path.exists("stock_sector_data.csv"):
    raise FileNotFoundError("CSV file not found. Please run the download step.")

stock_df = pd.read_csv("stock_sector_data.csv", parse_dates=["Date"])
print("Loaded dataset from CSV.")

# Keep only rows whose symbol is listed in `sectors`, and never the benchmark
# itself (it is loaded separately when the labels are built).
is_selected = stock_df["Symbol"].isin(all_symbols)
is_benchmark = stock_df["Symbol"] == benchmark_symbol
stock_df = stock_df[is_selected & ~is_benchmark]


### Feature Engineering — Add Technical Indicators


In [None]:
def add_technical_indicators(df):
    """Return a copy of ``df`` with technical-indicator columns added per symbol.

    Each symbol is processed separately, sorted by ``Date``, so rolling windows
    never span two tickers. Windows are counted in rows, using 7 rows per
    "week".
    NOTE(review): if the data holds trading days only, 7 rows is longer than
    one calendar week -- confirm the intended window lengths.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Symbol``, ``Date``, ``Close`` and ``Volume``.

    Returns
    -------
    pandas.DataFrame
        The per-symbol frames concatenated in order of first appearance, with a
        fresh 0..n-1 index and these extra columns: ``Return_1w``,
        ``Return_4w``, ``Volatility_4w``, ``MA5``, ``MA20``, ``MA50``,
        ``Volume_Change``, ``Volume_MA5``, ``MACD``, ``MACD_Signal``, ``RSI``,
        ``BB_Upper`` and ``BB_Lower``.
    """
    result = []

    for symbol in df["Symbol"].unique():
        sub = df[df["Symbol"] == symbol].sort_values("Date").copy()
        close = sub["Close"]
        volume = sub["Volume"]

        # Momentum: returns over the past 7 and 28 rows
        sub["Return_1w"] = close.pct_change(1 * 7)
        sub["Return_4w"] = close.pct_change(4 * 7)

        # Volatility: rolling standard deviation of the close price
        sub["Volatility_4w"] = close.rolling(window=4 * 7).std()

        # Moving averages of the close price (window = N * 7 rows)
        sub["MA5"] = close.rolling(window=5 * 7).mean()
        sub["MA20"] = close.rolling(window=20 * 7).mean()
        sub["MA50"] = close.rolling(window=50 * 7).mean()

        # Volume indicators
        sub["Volume_Change"] = volume.pct_change(periods=7)
        sub["Volume_MA5"] = volume.rolling(window=5 * 7).mean()

        # MACD: build the indicator once and read both lines from it
        macd = MACD(close=close)
        sub["MACD"] = macd.macd()
        sub["MACD_Signal"] = macd.macd_signal()

        # RSI
        sub["RSI"] = RSIIndicator(close=close, window=14).rsi()

        # Bollinger Bands
        bb = BollingerBands(close=close, window=20)
        sub["BB_Upper"] = bb.bollinger_hband()
        sub["BB_Lower"] = bb.bollinger_lband()

        result.append(sub)

    features_df = pd.concat(result).reset_index(drop=True)
    return features_df

# Compute the indicators, then discard every row that still has a missing value
features_df = add_technical_indicators(stock_df).dropna()
features_df.head()


### Create Outperformance Label

In [None]:
def create_outperformance_labels(features_df, benchmark_df):
    """Attach benchmark/stock returns and a binary outperformance label.

    Neither input frame is modified; all work happens on copies.

    Parameters
    ----------
    features_df : pandas.DataFrame
        One row per (``Symbol``, ``Date``) with a ``Close`` column and
        optionally ``Adj Close`` (preferred for returns when present). Rows of
        one symbol are expected in chronological order.
    benchmark_df : pandas.DataFrame
        Benchmark rows with ``Date`` and ``Close`` columns.

    Returns
    -------
    pandas.DataFrame
        ``features_df`` plus these columns, appended in this order:
        ``Benchmark_Return_Next``, ``Benchmark_Return``, ``Stock_Return_Next``,
        ``Stock_Return``, ``Pct_Difference`` and ``Label``. ``Label`` is 1 when
        the stock's next-period return is greater than the benchmark's, else 0.
        Rows with a missing value in any column (before ``Pct_Difference`` is
        computed) are dropped; the surviving rows keep their merge index.
    """
    # Ensure 'Date' columns are of the same type. `assign` returns new frames,
    # so the caller's objects stay untouched.
    features = features_df.assign(Date=pd.to_datetime(features_df["Date"]))
    benchmark = benchmark_df.assign(Date=pd.to_datetime(benchmark_df["Date"]))

    # pct_change/shift only mean "next period" on chronologically ordered rows.
    benchmark = benchmark.sort_values("Date", kind="stable")
    benchmark["Benchmark_Return_Next"] = benchmark["Close"].pct_change().shift(-1)

    # Merge benchmark return into stock data (left join keeps stock row order)
    df = features.merge(
        benchmark[["Date", "Benchmark_Return_Next"]], on="Date", how="left"
    )

    # Benchmark return for the current period. Shifting within each symbol
    # keeps one ticker's values from leaking into the next ticker's first row.
    df["Benchmark_Return"] = df.groupby("Symbol")["Benchmark_Return_Next"].shift(1)

    # Stock return for the next and the current period, also per symbol
    price_col = "Adj Close" if "Adj Close" in df.columns else "Close"
    period_return = df.groupby("Symbol")[price_col].pct_change()
    df["Stock_Return_Next"] = period_return.groupby(df["Symbol"]).shift(-1)
    df["Stock_Return"] = df.groupby("Symbol")["Stock_Return_Next"].shift(1)

    # `df` is the local merge result, so dropping in place cannot affect callers
    df.dropna(inplace=True)

    # Relative excess return over the benchmark. A zero benchmark return would
    # divide by zero and yield +/-inf, so those rows get NaN instead.
    nonzero_benchmark = df["Benchmark_Return"].where(df["Benchmark_Return"] != 0)
    df["Pct_Difference"] = (df["Stock_Return"] - df["Benchmark_Return"]) / nonzero_benchmark

    # Label = 1 if stock outperforms benchmark next period, else 0
    df["Label"] = (df["Stock_Return_Next"] > df["Benchmark_Return_Next"]).astype(int)

    return df

# Read only the columns needed for the benchmark and keep just its rows
benchmark_df = (
    pd.read_csv("stock_sector_data.csv", usecols=["Date", "Symbol", "Close"])
    .loc[lambda frame: frame["Symbol"] == benchmark_symbol]
    .reset_index(drop=True)
)

# Attach returns and the outperformance label to the feature table
labeled_df = create_outperformance_labels(features_df, benchmark_df)
labeled_df.tail()


### Train/Test Split & Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(df):
    # Drop rows where the label is missing (NaN due to shifting)
    df = df.dropna(subset=["Label"])
    
    # Fill remaining missing values with median (safe for numeric features)
    df = df.fillna(df.median(numeric_only=True))

    # Keep only numeric feature columns
    # exclude_cols = ['Date', 'Symbol', 'Sector', 'Label', 'Stock_Return_Next', 'Benchmark_Return_Next']
    exclude_cols = ['Date', 'Sector', 'Label', 'Stock_Return_Next', 'Benchmark_Return_Next']
    feature_columns = [col for col in df.columns if col not in exclude_cols and df[col].dtype != 'object']
    
    X = df[feature_columns]
    y = df["Label"].astype(int)  # Ensure label is int

    # Optional: Scaling (not strictly necessary for Random Forest)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y

In [None]:
# Split the data: train up to 2023, test after
train_data = labeled_df[labeled_df["Date"] < "2024-11-01"]
test_data = labeled_df[labeled_df["Date"] >= "2024-11-01"]

X_train, y_train = preprocess_data(train_data)
X_test, y_test = preprocess_data(test_data)

In [None]:
# Report the dimensions of each split
train_shape_msg = f"Training data shape: {X_train.shape}, Labels shape: {y_train.shape}"
test_shape_msg = f"Testing data shape: {X_test.shape}, Labels shape: {y_test.shape}"
print(train_shape_msg, test_shape_msg, sep="\n")

### Train Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Baseline model: 100 trees, fixed seed; `fit` returns the estimator itself
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split
y_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred))

### Feature Importance Plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Same exclusion list as in preprocess_data: identifiers, target and look-ahead columns
exclude_cols = ['Date', 'Sector', 'Label', 'Stock_Return_Next', 'Benchmark_Return_Next']

# Feature names (same selection rule as preprocess_data) paired with the model's importances
feature_columns = [
    name
    for name in train_data.columns
    if name not in exclude_cols and train_data[name].dtype != 'object'
]
importances = rf_model.feature_importances_

# Table of importances, most important first
feat_imp_df = pd.DataFrame(
    {'Feature': feature_columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranking
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feat_imp_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Top Feature Importances')
fig.tight_layout()
plt.show()

In [None]:
# Names of the six features with the highest importance
top_6_rows = feat_imp_df.nlargest(6, 'Importance')
top_6_features = list(top_6_rows['Feature'])
top_6_features

### Correlation Analysis

In [None]:
# Pairwise correlations between the numeric training features
exclude_cols = ['Date', 'Symbol', 'Sector', 'Label', 'Stock_Return_Next', 'Benchmark_Return_Next']

numeric_cols = [
    name
    for name in train_data.columns
    if name not in exclude_cols and train_data[name].dtype != 'object'
]
corr_matrix = train_data[numeric_cols].corr()

# Heatmap centred on zero so positive and negative correlations get opposite colours
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

### Reduce Multicollinearity

In [None]:
# Candidate features for the subset search: the six most important features.
# No correlation-based filtering is applied in this cell; the name is an alias
# for the same list object as `top_6_features`.
selected_features = top_6_features

### Train/Test Split & Preprocessing after Reducing Multicollinearity

In [None]:
from itertools import combinations
from sklearn.metrics import f1_score
from joblib import Parallel, delayed
import time

import matplotlib.pyplot as plt

# Placeholders; they are overwritten once `results` has been ranked
best_combination = None
best_score = 0
scores = []


def evaluate_combination(combo):
    """Train a random forest on one feature subset and score it on the test split.

    Reads the notebook-level ``train_data``, ``test_data``, ``y_train`` and
    ``y_test``.
    NOTE(review): subsets are scored on the test split, so the winning subset
    is tuned to that split -- consider a separate validation split.

    Parameters
    ----------
    combo : tuple of str
        Column names to use as features.

    Returns
    -------
    tuple
        ``(combo, f1)`` where ``f1`` is the F1 score on the test split.
    """
    start_time = time.time()
    columns = list(combo)
    X_train_combo = train_data[columns]
    X_test_combo = test_data[columns]

    # Fixed seed, matching the other seeded models in this notebook, so the
    # ranking is reproducible between runs.
    model = RandomForestClassifier(
        max_depth=None,
        max_features=None,
        min_samples_leaf=2,
        min_samples_split=2,
        n_estimators=500,
        bootstrap=False,
        random_state=42,
    )
    model.fit(X_train_combo, y_train)

    # Predict and evaluate
    y_pred_combo = model.predict(X_test_combo)
    score = f1_score(y_test, y_pred_combo)
    # The report has no "1" entry when class 1 occurs in neither y_test nor the
    # predictions; fall back to 0.0 instead of raising KeyError.
    report = classification_report(y_test, y_pred_combo, output_dict=True)
    precision = report.get("1", {}).get("precision", 0.0)

    print(f"Combination: {combo}, F1 Score: {score}, Precision: {precision}")
    print(f"Time taken: {time.time() - start_time:.2f} seconds")

    return combo, score


# Evaluate every non-empty subset of the selected features in parallel
results = Parallel(n_jobs=-1)(
    delayed(evaluate_combination)(combo)
    for r in range(1, len(selected_features) + 1)
    for combo in combinations(selected_features, r)
)

In [None]:
# Rank the (combination, F1) pairs, highest F1 first, and keep the winner
scores = sorted(results, key=lambda pair: pair[1], reverse=True)
top_combo, best_score = scores[0]

# Stored as an array so it can be used directly as a column selector
best_combination = np.array(top_combo)

print(f"Best combination: {best_combination}")
print(f"Best F1 Score: {best_score}")

In [None]:
# Order the raw results in place, best F1 first, and display them
results.sort(reverse=True, key=lambda pair: pair[1])
results

### Apply best combination

In [None]:
# Model for the best feature combination. The seed matches the other seeded
# models in this notebook so reruns give the same fit.
rf_model = RandomForestClassifier(
    max_depth=None,
    max_features=None,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100,
    bootstrap=False,
    random_state=42,
)

# Use the best combination of features
select_features = best_combination

# Create train and test sets: everything before 2025 trains, 2025 onwards tests
train_data = labeled_df[labeled_df["Date"] < "2025-01-01"]
test_data = labeled_df[labeled_df["Date"] >= "2025-01-01"]

In [None]:
# Class balance of the training split, then of the test split
for split_frame in (train_data, test_data):
    print(split_frame['Label'].value_counts())

In [None]:
# Restrict both splits to the chosen feature columns and pull out the targets
X_train, X_test = train_data[select_features], test_data[select_features]
y_train, y_test = train_data["Label"], test_data["Label"]

# Fit on the training split, then predict the held-out split
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred))

### Train on all data except the last row

In [None]:
# Hold out only the final row of the labelled table; train on everything before it
train_data, test_data = labeled_df.iloc[:-1], labeled_df.iloc[-1:]

In [None]:
# Class balance of the training split, then of the single-row test split
for split_frame in (train_data, test_data):
    print(split_frame['Label'].value_counts())

In [None]:
# Create feature matrices and targets from the selected features
X_train = train_data[select_features]
X_test = test_data[select_features]
y_train = train_data["Label"]
y_test = test_data["Label"]

# Model trained on all rows but the last. The seed matches the other seeded
# models in this notebook so the single prediction below is reproducible.
big_rf_model = RandomForestClassifier(
    max_depth=None,
    max_features=None,
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=200,
    bootstrap=False,
    random_state=42,
)

# Train the model
big_rf_model.fit(X_train, y_train)

# Make predictions (the test split is the single held-out row)
y_pred = big_rf_model.predict(X_test)

# Show the prediction
print(f"Predicted label for the last row: {y_pred[0]}")
print(f"Actual label for the last row: {y_test.values[0]}")

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid over forest size, depth and split/leaf limits
param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [2, 4, 5, 6],
    'max_features': [None],
    'bootstrap': [False]
}

# Forward-chaining folds (same splitter as the cross-validation cell) so each
# fold validates on rows that come after its training rows; plain k-fold would
# train on later rows and validate on earlier ones.
# NOTE(review): this is chronological only when the rows of X_train are in
# date order, i.e. with a single symbol -- confirm before adding more tickers.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),  # seeded for reproducible tuning
    param_grid,
    cv=TimeSeriesSplit(n_splits=3),
    scoring='f1',
    n_jobs=-1,
)
grid.fit(X_train, y_train)
print("Best parameters found: ", grid.best_params_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Distributions / candidate lists sampled by the randomized search
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2', None],
}

# Forward-chaining folds (same splitter as the cross-validation cell) so each
# fold validates on rows that come after its training rows; plain k-fold would
# train on later rows and validate on earlier ones.
# NOTE(review): this is chronological only when the rows of X_train are in
# date order, i.e. with a single symbol -- confirm before adding more tickers.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings sampled
    scoring='f1',
    cv=TimeSeriesSplit(n_splits=3),
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

# Output the best parameters
print("Best parameters found: ", random_search.best_params_)

###  Cross-Validation Evaluation

In [None]:
from sklearn.model_selection import cross_val_score

# Score the model with five forward-chaining folds, using F1 as the metric
time_series_folds = TimeSeriesSplit(n_splits=5)
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=time_series_folds, scoring='f1')

# Per-fold scores and their average
print("Cross-Validation F1 Scores:", cv_scores)
print("Mean F1 Score:", np.mean(cv_scores))