# Stock Market Data Analysis with MLflow
This notebook demonstrates data ingestion, exploratory analysis, and preprocessing with a scikit-learn pipeline, with everything tracked in MLflow.

In [None]:
# Import dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session so cell output stays readable.
# NOTE(review): this also hides deprecation warnings from the libraries above.
warnings.filterwarnings('ignore')

# Report the library versions this run used, for reproducibility
for label, module in (("MLflow", mlflow), ("Sklearn", sklearn)):
    print(f"{label} version: {module.__version__}")

## Configuration
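Set the MLflow tracking URI and experiment. The default below points at a tracking server on this machine (`127.0.0.1:5000`); replace it with your VM's address if the server runs there.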

In [None]:
# Point MLflow at the tracking server and select the experiment runs are logged to.
# The URI targets the local loopback address; change it if your server is elsewhere.
tracking_server_uri = "http://127.0.0.1:5000"
experiment_label = "Stock Market Analysis Pipeline"

mlflow.set_tracking_uri(tracking_server_uri)
mlflow.set_experiment(experiment_label)

# Echo back what MLflow actually resolved, as a quick sanity check
active_uri = mlflow.get_tracking_uri()
experiment_record = mlflow.get_experiment_by_name(experiment_label)
print(f"Tracking URI: {active_uri}")
print(f"Experiment: {experiment_record}")

## 1. Data Ingestion

In [None]:
# Read the raw CSV into `df`, the frame every later cell works from
df = pd.read_csv("stock_market_dataset.csv")

dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")

# Column dtypes and non-null counts
print("\nDataset info:")
df.info()

# Leave head() as the last expression so the notebook renders it as a table
print("\nFirst few rows:")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Descriptive statistics from DataFrame.describe(); kept in `summary_stats`
# because the MLflow logging cell writes it out as a text artifact.
print("Summary Statistics:")
summary_stats = df.describe()
summary_stats

In [None]:
# Count nulls per column, then show only the columns that actually have gaps
missing_values = df.isna().sum()
columns_with_gaps = missing_values.loc[missing_values > 0]

print("Missing Values:")
print(columns_with_gaps)

total_missing = missing_values.sum()
print(f"\nTotal missing values: {total_missing}")

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_columns = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()

heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    square=True,
)

plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Stock Market Features', fontsize=16, pad=20)
plt.tight_layout()

# Save before show(); the PNG is attached to the MLflow run in the logging cell
plt.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
print("Correlation matrix saved!")

In [None]:
# Histograms of six selected columns, laid out on a 2 x 3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
features = ['Close', 'Volume', 'RSI', 'MACD', 'GDP_Growth', 'Sentiment_Score']

# axes.flat walks the grid row by row, so panel k receives features[k]
for panel, column_name in zip(axes.flat, features):
    observed = df[column_name].dropna()  # nulls are excluded from the histogram
    panel.hist(observed, bins=50, edgecolor='black', alpha=0.7)
    panel.set_title(f'Distribution of {column_name}')
    panel.set_xlabel(column_name)
    panel.set_ylabel('Frequency')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
# Save before show(); the PNG is attached to the MLflow run in the logging cell
plt.savefig('feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()
print("Distribution plots saved!")

In [None]:
# Per-stock overview: how many rows each stock has, and its mean closing price
stock_counts = df['Stock'].value_counts()
avg_close = df.groupby('Stock')['Close'].mean().sort_values(ascending=False)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
count_ax, price_ax = axes

# Left panel: record count per stock
count_ax.bar(stock_counts.index, stock_counts.values)
count_ax.set_title('Number of Records per Stock')
count_ax.set_xlabel('Stock')
count_ax.set_ylabel('Count')

# Right panel: mean close per stock, highest first
price_ax.bar(avg_close.index, avg_close.values, color='green', alpha=0.7)
price_ax.set_title('Average Close Price by Stock')
price_ax.set_xlabel('Stock')
price_ax.set_ylabel('Average Close Price')

# Tilt the stock names on both panels so they do not overlap
for panel in (count_ax, price_ax):
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('stock_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
print("Stock analysis plots saved!")

In [None]:
# Target variable distribution
# value_counts() orders classes by frequency, not by label. Sort by label so each
# bar's colour and count annotation stay tied to the right class (0=Down, 1=Up)
# regardless of which class is the majority.
target_counts = df['Target'].value_counts().sort_index()
class_colors = {0: 'red', 1: 'green'}
bar_colors = [class_colors.get(label, 'gray') for label in target_counts.index]

plt.figure(figsize=(8, 6))
plt.bar(target_counts.index, target_counts.values, color=bar_colors)
plt.title('Target Variable Distribution (0=Down, 1=Up)', fontsize=14)
plt.xlabel('Target')
plt.ylabel('Count')
plt.xticks([0, 1], ['Down', 'Up'])

# Place each count above its own bar: x is the class label (where the bar is drawn),
# not the position in the counts. The gap scales with the tallest bar so the
# annotation sits just above the bar at any dataset size.
label_gap = 0.01 * target_counts.max() if len(target_counts) else 0
for label, count in target_counts.items():
    plt.text(label, count + label_gap, str(count), ha='center', va='bottom')
plt.tight_layout()
plt.savefig('target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# .get() tolerates a class that is absent from the data instead of raising KeyError
up_count = target_counts.get(1, 0)
down_count = target_counts.get(0, 0)
labelled_total = up_count + down_count
up_share = up_count / labelled_total * 100 if labelled_total else float('nan')
print(f"Target balance: {up_share:.2f}% Up")

## 3. Data Preparation with Sklearn Pipeline

In [None]:
# Build the model inputs and the label.
# Excluded from X: the Stock and Date columns, the Target label itself, and Next_Close
# (presumably a future value that would leak the label — confirm against the dataset docs).
excluded_columns = ['Stock', 'Date', 'Target', 'Next_Close']
X = df.drop(columns=excluded_columns)
y = df['Target']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

feature_names = list(X.columns)
print(f"\nFeatures: {feature_names}")

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class mix the
# same in both splits, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f"Training set size: {n_train_rows}")
print(f"Test set size: {n_test_rows}")

print("\nTraining set target distribution:")
print(y_train.value_counts())

In [None]:
# Two-step preprocessing: fill gaps with each column's median, then scale.
# RobustScaler is chosen so that outliers in the financial data affect the scaling less.
imputer_step = ('imputer', SimpleImputer(strategy='median'))
scaler_step = ('scaler', RobustScaler())
preprocessing_pipeline = Pipeline(steps=[imputer_step, scaler_step])

print("Preprocessing Pipeline:")
print(preprocessing_pipeline)

In [None]:
# Fit on the training split only, then apply the already-fitted pipeline to the
# test split so no test-set statistics are used when fitting.
X_train_processed = preprocessing_pipeline.fit_transform(X_train)
X_test_processed = preprocessing_pipeline.transform(X_test)

for split_label, processed in (("training", X_train_processed), ("test", X_test_processed)):
    print(f"Processed {split_label} data shape: {processed.shape}")

# The pipeline output is wrapped back into DataFrames here, re-attaching the
# original column names and row index for easier analysis.
processed_columns = X.columns
X_train_processed_df = pd.DataFrame(
    X_train_processed, columns=processed_columns, index=X_train.index
)
X_test_processed_df = pd.DataFrame(
    X_test_processed, columns=processed_columns, index=X_test.index
)

print("\nProcessed data statistics:")
X_train_processed_df.describe()

In [None]:
# Side-by-side histograms of the training data: raw values on the left,
# pipeline output on the right, one row per compared column.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
features_to_compare = ['Close', 'Volume']

# Each row of `axes` is a (before, after) pair of panels
for (raw_ax, scaled_ax), column_name in zip(axes, features_to_compare):
    raw_ax.hist(X_train[column_name], bins=50, edgecolor='black', alpha=0.7)
    raw_ax.set_title(f'{column_name} - Before Preprocessing')
    raw_ax.set_xlabel(column_name)
    raw_ax.set_ylabel('Frequency')

    scaled_ax.hist(
        X_train_processed_df[column_name],
        bins=50,
        edgecolor='black',
        alpha=0.7,
        color='green',
    )
    scaled_ax.set_title(f'{column_name} - After Preprocessing')
    scaled_ax.set_xlabel(f'{column_name} (scaled)')
    scaled_ax.set_ylabel('Frequency')

plt.tight_layout()
# Save before show(); the PNG is attached to the MLflow run in the logging cell
plt.savefig('preprocessing_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
print("Preprocessing comparison saved!")

## 4. Log Everything to MLFlow

In [None]:
# Start an MLflow run and log parameters, metrics, the fitted pipeline, figures,
# and text summaries produced by the earlier cells.
run_name = "Stock_Data_Pipeline_" + datetime.now().strftime("%Y%m%d_%H%M%S")
with mlflow.start_run(run_name=run_name) as run:

    # Log parameters. The imputation strategy and scaler name are read from the
    # pipeline object itself so the logged values cannot drift from what was fitted.
    pipeline_steps = preprocessing_pipeline.named_steps
    run_params = {
        "dataset_name": "stock_market_dataset.csv",
        "total_samples": df.shape[0],
        "total_features": X.shape[1],
        "train_size": X_train.shape[0],
        "test_size": X_test.shape[0],
        "test_split_ratio": 0.2,   # must match test_size in the split cell
        "random_state": 42,        # must match random_state in the split cell
        "imputation_strategy": pipeline_steps['imputer'].strategy,
        "scaling_method": type(pipeline_steps['scaler']).__name__,
        "unique_stocks": df['Stock'].nunique(),
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Log metrics
    run_metrics = {
        "target_balance_percentage": (y.sum() / len(y)) * 100,
        "missing_values_total": df.isnull().sum().sum(),
        "avg_close_price": df['Close'].mean(),
        "avg_volume": df['Volume'].mean(),
        "avg_rsi": df['RSI'].mean(),
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Log the fitted preprocessing pipeline and register it under a model name
    mlflow.sklearn.log_model(
        preprocessing_pipeline,
        "preprocessing_pipeline",
        registered_model_name="StockDataPreprocessor"
    )

    # Log artifacts (visualizations saved to disk by the earlier plotting cells)
    for figure_path in (
        'correlation_matrix.png',
        'feature_distributions.png',
        'stock_analysis.png',
        'target_distribution.png',
        'preprocessing_comparison.png',
    ):
        mlflow.log_artifact(figure_path)

    # Log text summaries
    mlflow.log_text(summary_stats.to_string(), "summary_statistics.txt")
    mlflow.log_text(correlation_matrix.to_string(), "correlation_matrix.txt")

    # Log dataset info. The text is assembled line by line so the logged file is
    # flush-left; a triple-quoted literal inside this indented block would carry
    # the block's indentation into every line after the first.
    # Stock names are cast to str and nulls dropped so join() cannot fail on NaN;
    # this matches nunique(), which also ignores nulls.
    stock_names = df['Stock'].dropna().astype(str).unique()
    down_count = (y == 0).sum()
    up_count = (y == 1).sum()
    dataset_info = "\n".join([
        "Stock Market Dataset Information",
        "=====================================",
        f"Total Records: {df.shape[0]}",
        f"Total Features: {df.shape[1]}",
        f"Date Range: {df['Date'].min()} to {df['Date'].max()}",
        f"Unique Stocks: {df['Stock'].nunique()}",
        f"Stocks: {', '.join(stock_names)}",
        "",
        "Features:",
        ", ".join(map(str, df.columns)),
        "",
        "Target Distribution:",
        f"Down (0): {down_count} ({down_count / len(y) * 100:.2f}%)",
        f"Up (1): {up_count} ({up_count / len(y) * 100:.2f}%)",
        "",
    ])
    mlflow.log_text(dataset_info, "dataset_info.txt")

    # Save a 100-row sample of the processed training data and attach it to the run
    X_train_processed_df.head(100).to_csv('train_processed_sample.csv', index=False)
    mlflow.log_artifact('train_processed_sample.csv')

    print("✓ All data, visualizations, and pipeline logged to MLFlow!")
    print(f"✓ Run ID: {run.info.run_id}")
    print(f"✓ Experiment ID: {run.info.experiment_id}")

In [None]:
# Look the experiment up by name and print where its results live
experiment = mlflow.get_experiment_by_name("Stock Market Analysis Pipeline")

experiment_fields = (
    ("Experiment Name", "name"),
    ("Experiment ID", "experiment_id"),
    ("Artifact Location", "artifact_location"),
)
for caption, attribute in experiment_fields:
    print(f"{caption}: {getattr(experiment, attribute)}")

tracking_uri = mlflow.get_tracking_uri()
print(f"\nView your results at: {tracking_uri}")

## Summary

This notebook completed:
1. ✓ Data ingestion from stock market CSV
2. ✓ Exploratory data analysis with multiple visualizations
3. ✓ Data preprocessing using sklearn pipeline (imputation + scaling)
4. ✓ MLFlow experiment tracking with all artifacts
5. ✓ Saved preprocessing pipeline as reusable model

Next steps:
- Check MLFlow UI for all logged artifacts
- Deploy the preprocessing pipeline as a service
- Build a REST API for real-time and batch predictions