In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # Added OneHotEncoder
from sklearn.compose import ColumnTransformer # For combining different transformers
from sklearn.pipeline import Pipeline # For chaining steps
import numpy as np # For numerical operations

# --- Configuration ---
# Folder containing the 'part-' Parquet files written by the Spark processing step.
# The PROCESSED_DATA_PATH environment variable, when set, overrides the default below
# so the script can run on another machine without editing the code.
processed_data_path = os.environ.get(
    "PROCESSED_DATA_PATH",
    "/Users/mac/Desktop/processed_stock_data_parquet_local/",
)

# --- Data Loading ---
# Only the read itself sits inside the `try`, so a failure while printing the
# summary below is not misreported as a Parquet loading error.
try:
    df = pd.read_parquet(processed_data_path)
except Exception as e:
    print(f"Error loading Parquet data: {e}")
    print("Please ensure the 'processed_data_path' is correct and pyarrow is installed.")
    # Re-raise so execution stops here: nothing below can run without the data.
    raise
else:
    print("DataFrame loaded successfully!")
    print(f"Shape of the DataFrame: {df.shape}")
    print("\nFirst 5 rows of the DataFrame:")
    print(df.head())  # Initial rows, to confirm the load visually
    print("\nDataFrame Info (data types and non-null counts):")
    df.info()  # Prints dtypes and non-null counts directly

# --- Data Preparation for Machine Learning ---

# 1. Target engineering: label every row with whether the same ticker closes
#    higher (1) or not (0) on its next recorded row.

# Order rows by date within each ticker so that "next row" means "next recorded day".
df = df.sort_values(by=['Ticker', 'Date']).reset_index(drop=True)

# Per-ticker view of 'Close'; shifting by -1 pulls the following row's close onto
# the current row without crossing ticker boundaries.
window_spec_next_day = df.groupby('Ticker')['Close']
df['Next_Day_Close'] = window_spec_next_day.shift(-1)

# 1 when the next close is strictly higher than today's, 0 otherwise (ties included).
closes_higher_next_day = df['Next_Day_Close'].gt(df['Close'])
df['Price_Change_Direction'] = closes_higher_next_day.astype(int)

# Discard rows that cannot be used for training:
# - the last row of each ticker has no next-day close;
# - rows where the Task 3 features (Daily_Return, SMAs) are missing.
columns_requiring_values = ['Next_Day_Close', 'Daily_Return', 'SMA_5_Day', 'SMA_20_Day']
df = df.dropna(subset=columns_requiring_values)

print("\n--- DataFrame after Target Variable Engineering ---")
# Show the columns involved in the target so the labelling can be checked by eye.
preview_columns = ['Date', 'Ticker', 'Close', 'Next_Day_Close', 'Price_Change_Direction']
print(df[preview_columns].head())
print(f"Shape after dropping NaNs: {df.shape}")
direction_counts = df['Price_Change_Direction'].value_counts()
print(f"Target variable value counts (0: Down/No Change, 1: Up):\n{direction_counts}")


# 2. Define Features (X) and Target (y)
y = df['Price_Change_Direction']

# Features to use for prediction:
# Numerical features from initial data and Task 3 processing
numerical_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Daily_Return', 'SMA_5_Day', 'SMA_20_Day']

# Temporal features extracted from 'Date'.
# Parse the column first: a Parquet DATE column may be loaded as an object-dtype column of
# Python date objects (or as strings), on which the `.dt` accessor raises AttributeError.
# pd.to_datetime leaves an existing datetime64 column unchanged, and 'Date' itself is not modified.
date_values = pd.to_datetime(df['Date'])
df['Year'] = date_values.dt.year
df['Month'] = date_values.dt.month
df['Day'] = date_values.dt.day
df['DayOfWeek'] = date_values.dt.dayofweek  # Monday=0, Sunday=6

temporal_features = ['Year', 'Month', 'Day', 'DayOfWeek']

# Categorical features (Ticker needs One-Hot Encoding)
categorical_features = ['Ticker']

# Combine all feature names for selection
all_features_to_use = numerical_features + temporal_features + categorical_features

X = df[all_features_to_use]

print(f"\nShape of features (X) before preprocessing: {X.shape}")
print(f"Shape of target (y): {y.shape}")

# 3. Preprocessing Pipeline for Features
# A ColumnTransformer routes each group of columns to its own treatment:
# - numerical and temporal columns are forwarded untouched ('passthrough');
# - 'Ticker' is one-hot encoded.
passthrough_columns = numerical_features + temporal_features

# handle_unknown='ignore' prevents errors for categories that were not seen during fit.
ticker_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num_temp', 'passthrough', passthrough_columns),
        ('cat', ticker_encoder, categorical_features),
    ]
)

# 4. Split Data into Training and Testing Sets
# The rows are dated observations, so the split is chronological: the model is trained on
# the earliest ~80% of rows and evaluated on the most recent ~20%. A shuffled split would
# place days that come AFTER some test days into the training set, so the reported metrics
# would benefit from information that is unavailable when actually predicting the future.
test_fraction = 0.2

# Dates of exactly the rows in X, parsed so comparisons work whatever dtype 'Date' was loaded as.
row_dates = pd.to_datetime(df.loc[X.index, 'Date'])
if row_dates.empty:
    raise ValueError("Cannot split the data: no rows are left after data preparation.")

# Date found at the 80% position of the chronologically ordered rows. Every row dated on or
# after it goes to the test set, so no calendar day is shared between the two sets.
cutoff_position = int(len(row_dates) * (1 - test_fraction))
cutoff_date = row_dates.sort_values().iloc[cutoff_position]
is_train_row = row_dates < cutoff_date
if not is_train_row.any():
    raise ValueError(
        "Cannot split the data chronologically: no rows are dated before the cutoff "
        f"({cutoff_date}). More distinct dates are needed."
    )

X_train, X_test = X.loc[is_train_row], X.loc[~is_train_row]
y_train, y_test = y.loc[is_train_row], y.loc[~is_train_row]

print(f"\n--- Shapes of Training and Testing Sets ---")
print(f"Training features shape (before preprocessing): {X_train.shape}")
print(f"Testing features shape (before preprocessing): {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")


# --- Choose and Train an ML Model ---

# The preprocessing step and the classifier are chained in a single Pipeline so that
# exactly the same transformations are applied at fit time and at predict time.
print("\n--- Initializing and Training RandomForestClassifier ---")
forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', forest_classifier),
    ]
)

# Fit the preprocessing step and the classifier on the training split only.
print("Training the model pipeline...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")


# --- Evaluate Model Performance ---

# Make Predictions on the Test Set (X_test)
print("\n--- Evaluating Model Performance ---")
y_pred = model_pipeline.predict(X_test)

# Calculate Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 are support-weighted averages over both classes.
# zero_division=0 prevents warnings/errors if a class has no predicted samples.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Classification Report: Provides precision, recall, f1-score for each class
print("\n--- Classification Report ---")
# Display names for the report, in the same order as class_labels (0: Down/No Change, 1: Up)
target_names = ['Down/No Change', 'Up']
# Passing the labels explicitly keeps the report at two rows even when one class is absent
# from both y_test and y_pred; without it, classification_report raises ValueError because
# the number of observed classes no longer matches the number of target_names.
class_labels = [0, 1]
print(classification_report(y_test, y_pred, labels=class_labels, target_names=target_names, zero_division=0))


# --- Visualize Model Performance (Confusion Matrix) ---

# Compute the Confusion Matrix.
# labels=[0, 1] fixes the matrix at 2x2 (rows = true class, columns = predicted class) even if
# one class is absent from both y_test and y_pred; otherwise the matrix would shrink and the
# two tick labels below would no longer match its size.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Plot the Confusion Matrix as a heatmap, annotated with the integer counts
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix for Stock Price Direction Prediction')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# --- Interpretation (Placeholder for your documentation) ---
# Fixed explanatory text printed after the metrics and the plot, one message per line.
interpretation_lines = (
    "\n--- Interpretation of Results ---",
    "These metrics indicate the model's ability to predict whether a stock's closing price will increase or decrease the next day.",
    "The Confusion Matrix visually breaks down correct vs. incorrect predictions for 'Up' and 'Down/No Change' movements.",
    "This output, along with the plot, should be included in your Task 4 documentation with a detailed interpretation in the business context, discussing how accurate predictions of stock movement can inform investment strategies or trading decisions.",
)
for interpretation_line in interpretation_lines:
    print(interpretation_line)