# Stock Market Data Exploration

This notebook explores the characteristics of stock market data, examines statistical properties, and visualizes patterns that might be useful for prediction tasks.

**Contents:**
1. Data Loading and Initial Examination
2. Statistical Analysis of Stock Prices
3. Correlation Analysis
4. Time Series Characteristics
5. Feature Engineering Exploration

In [6]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.4f}'.format)

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Data Loading and Initial Examination

We'll load a few sample stocks from our dataset and examine their basic properties.

In [7]:
# Set data paths
raw_data_path = '../data/raw/'

# List available CSV files
csv_files = [f for f in os.listdir(raw_data_path) if f.endswith('.csv')]
print(f"Found {len(csv_files)} stock data files.")

# Select a few well-known stocks to analyze
sample_stocks = ['AAPL.csv', 'MSFT.csv', 'AMZN.csv', 'GOOG.csv']
available_samples = [s for s in sample_stocks if s in csv_files]

if not available_samples:
    # If our preferred stocks aren't available, take the first few from the list
    available_samples = csv_files[:4] if len(csv_files) >= 4 else csv_files

print(f"Selected stocks for analysis: {[os.path.splitext(s)[0] for s in available_samples]}")

Found 10 stock data files.
Selected stocks for analysis: ['AAPL', 'MSFT', 'AMZN']


In [8]:
# Load the stock data into DataFrames
stock_data = {}
for stock_file in available_samples:
    symbol = os.path.splitext(stock_file)[0]
    file_path = os.path.join(raw_data_path, stock_file)
    
    # Read the CSV file
    try:
        df = pd.read_csv(file_path)
        
        # If the date is in the index, reset it to be a column
        if 'Date' not in df.columns and df.index.name == 'Date':
            df = df.reset_index()
        
        # Ensure Date column is datetime
        if 'Date' in df.columns:
            df['Date'] = pd.to_datetime(df['Date'])
        
        stock_data[symbol] = df
        print(f"Loaded {len(df)} rows for {symbol}")
    except Exception as e:
        print(f"Error loading {symbol}: {e}")

Loaded 3524 rows for AAPL
Loaded 3524 rows for MSFT
Loaded 3524 rows for AMZN


In [9]:
# Display the first few rows of one stock
if stock_data:
    symbol = list(stock_data.keys())[0]
    print(f"Sample data for {symbol}:")
    display(stock_data[symbol].head())
    
    # Data summary
    print(f"\nSummary statistics for {symbol}:")
    display(stock_data[symbol].describe())
    
    # Check for missing values
    missing_values = stock_data[symbol].isnull().sum()
    print(f"\nMissing values in {symbol} data:")
    display(missing_values[missing_values > 0] if any(missing_values > 0) else "No missing values")

Sample data for AAPL:


Unnamed: 0,Price,Close,High,Low,Open,Volume
0,Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
1,Date,,,,,
2,2010-01-04,6.4403300285339355,6.455075825369143,6.391277552762015,6.422875746236436,493729600
3,2010-01-05,6.451466083526611,6.487879368872802,6.417459939457402,6.458086826968792,601904800
4,2010-01-06,6.348845481872559,6.477044307725826,6.342224739983206,6.45146457040068,552160000



Summary statistics for AAPL:


Unnamed: 0,Price,Close,High,Low,Open,Volume
count,3524,3523.0,3523.0,3523.0,3523.0,3523
unique,3524,3474.0,3523.0,3523.0,3523.0,3514
top,2023-12-29,25.00335502624512,193.23980104114617,190.58573760932344,192.7427850763616,150347200
freq,1,2.0,1.0,1.0,1.0,2



Missing values in AAPL data:


Close     1
High      1
Low       1
Open      1
Volume    1
dtype: int64

## 2. Statistical Analysis of Stock Prices

Let's examine the distribution and statistical properties of stock prices and returns.

In [10]:
def calculate_returns(df):
    """Calculate daily and cumulative returns for a stock DataFrame."""
    df = df.copy()
    df['Daily_Return'] = df['Close'].pct_change()
    df['Cum_Return'] = (1 + df['Daily_Return']).cumprod() - 1
    return df

# Process all stocks
for symbol, df in stock_data.items():
    stock_data[symbol] = calculate_returns(df)

  df['Daily_Return'] = df['Close'].pct_change()


TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [11]:
# Plot the closing prices
plt.figure(figsize=(15, 7))

for symbol, df in stock_data.items():
    plt.plot(df['Date'], df['Close'], label=symbol)

plt.title('Historical Closing Prices', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Price (USD)', fontsize=14)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

KeyError: 'Date'

<Figure size 1500x700 with 0 Axes>

In [None]:
# Plot the daily returns
plt.figure(figsize=(15, 10))

for i, (symbol, df) in enumerate(stock_data.items(), 1):
    plt.subplot(len(stock_data), 1, i)
    plt.plot(df['Date'], df['Daily_Return'], label=f'{symbol} Daily Returns')
    plt.title(f'{symbol} Daily Returns', fontsize=14)
    plt.grid(True)
    if i == len(stock_data):  # Only add xlabel to the bottom subplot
        plt.xlabel('Date', fontsize=12)
    plt.ylabel('Return', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Compare return distributions
plt.figure(figsize=(15, 7))

for symbol, df in stock_data.items():
    sns.histplot(df['Daily_Return'].dropna(), kde=True, label=symbol, alpha=0.6)

plt.title('Distribution of Daily Returns', fontsize=16)
plt.xlabel('Daily Return', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [12]:
# Calculate summary statistics for returns
returns_stats = {}

for symbol, df in stock_data.items():
    returns = df['Daily_Return'].dropna()
    
    stats = {
        'Mean': returns.mean(),
        'Median': returns.median(),
        'Std Dev': returns.std(),
        'Min': returns.min(),
        'Max': returns.max(),
        'Skewness': returns.skew(),
        'Kurtosis': returns.kurt(),  # Excess kurtosis (normal = 0)
        '% Positive Days': (returns > 0).mean() * 100
    }
    
    returns_stats[symbol] = stats

# Display as DataFrame
stats_df = pd.DataFrame(returns_stats).T
stats_df

KeyError: 'Daily_Return'

## 3. Correlation Analysis

We'll examine correlations between different stocks and different features of the same stock.

In [13]:
# Create a DataFrame with returns from all stocks
returns_df = pd.DataFrame({
    symbol: df['Daily_Return'] for symbol, df in stock_data.items()
})

# Calculate correlation matrix
corr_matrix = returns_df.corr()

# Plot correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, vmin=-1, vmax=1)
plt.title('Correlation Matrix of Stock Returns', fontsize=16)
plt.tight_layout()
plt.show()

KeyError: 'Daily_Return'

In [14]:
# Examine correlations between different features for a single stock
if stock_data:
    symbol = list(stock_data.keys())[0]
    features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Daily_Return']
    
    # Select available features
    available_features = [f for f in features if f in stock_data[symbol].columns]
    feature_df = stock_data[symbol][available_features]
    
    # Calculate correlation matrix
    feature_corr = feature_df.corr()
    
    # Plot correlation matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(feature_corr, annot=True, cmap='coolwarm', linewidths=0.5, vmin=-1, vmax=1)
    plt.title(f'Feature Correlation Matrix for {symbol}', fontsize=16)
    plt.tight_layout()
    plt.show()

ValueError: could not convert string to float: 'AAPL'

## 4. Time Series Characteristics

Now let's explore the time series properties of stock prices, including:
- Stationarity tests
- Autocorrelation analysis
- Seasonality detection

In [15]:
def check_stationarity(series, window=30, title=''):
    """Check stationarity of a time series using the ADF test and rolling statistics."""
    # Calculate rolling statistics
    rolling_mean = series.rolling(window=window).mean()
    rolling_std = series.rolling(window=window).std()
    
    # Plot rolling statistics
    plt.figure(figsize=(12, 6))
    plt.plot(series, label='Original')
    plt.plot(rolling_mean, label=f'Rolling Mean ({window} days)')
    plt.plot(rolling_std, label=f'Rolling Std ({window} days)')
    plt.title(f'Rolling Statistics - {title}')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    
    # ADF test
    result = adfuller(series.dropna())
    print(f'ADF Statistic: {result[0]:.4f}')
    print(f'p-value: {result[1]:.4f}')
    print('Critical Values:')
    for key, value in result[4].items():
        print(f'\t{key}: {value:.4f}')
    
    # Interpret results
    if result[1] <= 0.05:
        print("Result: The series is stationary (reject the null hypothesis)")
    else:
        print("Result: The series is non-stationary (fail to reject the null hypothesis)")

In [16]:
# Check stationarity of price and returns for a sample stock
if stock_data:
    symbol = list(stock_data.keys())[0]
    
    # Check price stationarity
    print(f"\nStationarity Test for {symbol} Closing Prices:\n{'-'*40}")
    check_stationarity(stock_data[symbol]['Close'], title=f'{symbol} Closing Prices')
    
    # Check returns stationarity
    print(f"\nStationarity Test for {symbol} Daily Returns:\n{'-'*40}")
    check_stationarity(stock_data[symbol]['Daily_Return'].dropna(), title=f'{symbol} Daily Returns')


Stationarity Test for AAPL Closing Prices:
----------------------------------------


DataError: No numeric types to aggregate

In [17]:
# Plot autocorrelation and partial autocorrelation for returns
if stock_data:
    symbol = list(stock_data.keys())[0]
    returns = stock_data[symbol]['Daily_Return'].dropna()
    
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot ACF
    plot_acf(returns, ax=axes[0], lags=30)
    axes[0].set_title(f'Autocorrelation Function (ACF) - {symbol} Returns', fontsize=14)
    
    # Plot PACF
    plot_pacf(returns, ax=axes[1], lags=30)
    axes[1].set_title(f'Partial Autocorrelation Function (PACF) - {symbol} Returns', fontsize=14)
    
    plt.tight_layout()
    plt.show()

KeyError: 'Daily_Return'

## 5. Feature Engineering Exploration

Let's explore some common feature engineering techniques for stock price prediction.

In [18]:
def engineer_features(df):
    """Engineer features for stock price prediction."""
    df = df.copy()
    
    # Moving averages
    for window in [5, 10, 20, 50]:
        df[f'Close_MA_{window}'] = df['Close'].rolling(window=window).mean()
    
    # Price momentum (percentage change over window)
    for window in [5, 10, 20]:
        df[f'Momentum_{window}'] = df['Close'].pct_change(periods=window)
    
    # Volatility (standard deviation over window)
    for window in [5, 10, 20]:
        df[f'Volatility_{window}'] = df['Close'].rolling(window=window).std()
    
    # Trading volume features
    if 'Volume' in df.columns:
        df['Volume_Change'] = df['Volume'].pct_change()
        df['Volume_MA_5'] = df['Volume'].rolling(window=5).mean()
    
    # High-Low range
    if 'High' in df.columns and 'Low' in df.columns:
        df['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    
    # Day of week
    if 'Date' in df.columns:
        df['Day_of_Week'] = df['Date'].dt.dayofweek
    
    return df

# Engineer features for the first stock
if stock_data:
    symbol = list(stock_data.keys())[0]
    engineered_df = engineer_features(stock_data[symbol])
    
    # Display the new features
    print(f"Engineered features for {symbol}:")
    display(engineered_df.columns.tolist())
    
    # Display a sample of the engineered data
    display(engineered_df.iloc[50:55])

DataError: No numeric types to aggregate

In [19]:
# Visualize some of the engineered features
if 'engineered_df' in locals():
    # Plot close price and moving averages
    plt.figure(figsize=(15, 7))
    
    plt.plot(engineered_df['Date'], engineered_df['Close'], label='Close Price')
    
    for window in [5, 20, 50]:
        plt.plot(engineered_df['Date'], 
                engineered_df[f'Close_MA_{window}'], 
                label=f'{window}-day MA', 
                alpha=0.7)
    
    plt.title(f'{symbol} - Price and Moving Averages', fontsize=16)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Price', fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [20]:
# Plot volatility over time
if 'engineered_df' in locals():
    plt.figure(figsize=(15, 7))
    
    for window in [5, 10, 20]:
        plt.plot(engineered_df['Date'], 
                engineered_df[f'Volatility_{window}'], 
                label=f'{window}-day Volatility')
    
    plt.title(f'{symbol} - Price Volatility', fontsize=16)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Standard Deviation', fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [21]:
# Calculate feature correlations with future returns
if 'engineered_df' in locals():
    # Create future returns (target variable)
    for days in [1, 5, 10]:
        engineered_df[f'Future_Return_{days}d'] = engineered_df['Close'].pct_change(periods=days).shift(-days)
    
    # Select features and target
    features = [col for col in engineered_df.columns 
               if col not in ['Date', 'Future_Return_1d', 'Future_Return_5d', 'Future_Return_10d']]
    
    # Calculate correlations
    correlations = {}
    for days in [1, 5, 10]:
        target = f'Future_Return_{days}d'
        corr_series = engineered_df[features].corrwith(engineered_df[target]).sort_values(ascending=False)
        correlations[target] = corr_series
    
    # Display top correlations
    for target, corrs in correlations.items():
        print(f"\nTop correlations with {target}:")
        display(corrs.head(10))
        print(f"Bottom correlations with {target}:")
        display(corrs.tail(10))

## Conclusion

In this notebook, we've explored the characteristics of stock market data, examining statistical properties, time-series behaviors, correlations, and potential predictive features. 

Key findings:
1. Stock prices are typically non-stationary, while returns tend to be stationary
2. There are correlations between stocks in similar sectors
3. Various engineered features show different levels of correlation with future returns

The insights from this exploratory analysis will inform our feature engineering and modeling approaches in the subsequent notebooks.