# MBB Stock Data - Exploratory Data Analysis

Phân tích khám phá dữ liệu cổ phiếu MBB (MB Bank)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: whitegrid theme, wide figures, larger text
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 12,
})

## 1. Load Data

In [None]:
# Load MBB daily data, indexed by the 'time' column in chronological order
df = pd.read_csv('../data/raw/mbb_daily.csv', parse_dates=['time'])
df = df.set_index('time').sort_index()

# Load market indices.
# They are sorted the same way as the MBB frame: returns are later derived
# with pct_change(), which compares each row with the row before it, so the
# rows must be in chronological order for those returns to be meaningful.
vnindex = (
    pd.read_csv('../data/raw/vnindex_daily.csv', parse_dates=['time'])
    .set_index('time')
    .sort_index()
)
vn30 = (
    pd.read_csv('../data/raw/vn30_daily.csv', parse_dates=['time'])
    .set_index('time')
    .sort_index()
)

# Quick sanity report: row counts and the date span covered by the MBB data
print(f"MBB Data: {df.shape[0]} rows, from {df.index.min()} to {df.index.max()}")
print(f"VNINDEX Data: {vnindex.shape[0]} rows")
print(f"VN30 Data: {vn30.shape[0]} rows")

In [None]:
# Print a concise summary of df (index range, columns, dtypes, non-null counts)
df.info()

In [None]:
# Descriptive statistics for df's columns; as the last expression of the cell
# the resulting table is what the notebook displays
df.describe()

## 2. Data Quality Check

In [None]:
# Report the number of null entries in every column
print("Missing values:")
null_counts = df.isnull().sum()
print(null_counts)

# Report how many index entries repeat a date that already appeared earlier
repeated_dates = df.index.duplicated().sum()
print(f"\nDuplicate dates: {repeated_dates}")

In [None]:
# Look for unusually long breaks between consecutive rows. The 5-day
# threshold is meant to let ordinary weekends and short holidays pass.
date_diff = df.index.to_series().diff()
is_long_gap = date_diff.gt(pd.Timedelta(days=5))
long_gaps = date_diff.loc[is_long_gap]
print(f"Gaps longer than 5 days: {len(long_gaps)}")
if not long_gaps.empty:
    # Show at most the first ten offending dates
    print(long_gaps.head(10))

## 3. Price Visualization

In [None]:
# Two stacked panels: closing price on top, traded volume below
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
price_ax, volume_ax = axes
panel_title_style = {'fontsize': 14, 'fontweight': 'bold'}

# Price chart
# NOTE(review): the year range in the title is hard-coded; confirm it matches
# the period actually present in the loaded data.
price_ax.plot(df.index, df['close'], linewidth=1, color='#2E86AB')
price_ax.set_title('MBB Stock Price (2014-2024)', **panel_title_style)
price_ax.set_ylabel('Price (VND x 1000)')
price_ax.grid(True, alpha=0.3)

# Volume chart
volume_ax.bar(df.index, df['volume'], width=1, color='#A23B72', alpha=0.7)
volume_ax.set_title('Trading Volume', **panel_title_style)
volume_ax.set_ylabel('Volume')
volume_ax.set_xlabel('Date')
volume_ax.grid(True, alpha=0.3)

# Persist the figure before showing it
plt.tight_layout()
plt.savefig('../data/processed/mbb_price_volume.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Return Distribution

In [None]:
# Derive simple and log daily returns from the closing price
close_price = df['close']
df['returns'] = close_price.pct_change()
df['log_returns'] = np.log(close_price / close_price.shift(1))

# Headline moments of the simple-return distribution.
# pandas' kurtosis() uses Fisher's definition (a normal distribution gives 0).
daily_ret = df['returns']
print("Daily Return Statistics:")
print(f"Mean: {daily_ret.mean()*100:.4f}%")
print(f"Std: {daily_ret.std()*100:.4f}%")
print(f"Skewness: {daily_ret.skew():.4f}")
print(f"Kurtosis: {daily_ret.kurtosis():.4f}")
print(f"Min: {daily_ret.min()*100:.2f}%")
print(f"Max: {daily_ret.max()*100:.2f}%")

In [None]:
from scipy import stats

# Side-by-side view of the return distribution: histogram and normal Q-Q plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, qq_ax = axes
observed_returns = df['returns'].dropna()  # the first row has no prior close

# Histogram, with a dashed reference line at a zero return
hist_ax.hist(observed_returns, bins=100, alpha=0.7, color='#2E86AB', edgecolor='white')
hist_ax.axvline(x=0, color='red', linestyle='--', linewidth=1)
hist_ax.set_title('Distribution of Daily Returns', fontsize=12, fontweight='bold')
hist_ax.set_xlabel('Daily Return')
hist_ax.set_ylabel('Frequency')

# Q-Q Plot against the normal distribution
stats.probplot(observed_returns, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot (Normal Distribution)', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation with market indices
vnindex['returns'] = vnindex['close'].pct_change()
vn30['returns'] = vn30['close'].pct_change()

# Merge data
merged = pd.DataFrame({
    'MBB': df['returns'],
    'VNINDEX': vnindex['returns'],
    'VN30': vn30['returns']
}).dropna()

print("Correlation Matrix:")
print(merged.corr())

In [None]:
# Annotated heatmap of the return correlations, colour scale centred on zero
heatmap_options = {
    'annot': True,
    'cmap': 'RdYlBu_r',
    'center': 0,
    'fmt': '.3f',
    'square': True,
    'linewidths': 0.5,
}
plt.figure(figsize=(8, 6))
sns.heatmap(merged.corr(), **heatmap_options)
plt.title('Correlation: MBB vs Market Indices', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

## 6. Volatility Analysis

In [None]:
# Rolling volatility
df['volatility_20'] = df['returns'].rolling(window=20).std() * np.sqrt(252)
df['volatility_60'] = df['returns'].rolling(window=60).std() * np.sqrt(252)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(df.index, df['volatility_20'], label='20-day Volatility', alpha=0.8)
ax.plot(df.index, df['volatility_60'], label='60-day Volatility', alpha=0.8)
ax.set_title('MBB Rolling Volatility (Annualized)', fontsize=12, fontweight='bold')
ax.set_ylabel('Volatility')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Seasonality Analysis

In [None]:
# Day of week effect
df['dayofweek'] = df.index.dayofweek
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']

dow_returns = df.groupby('dayofweek')['returns'].mean() * 100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].bar(range(5), dow_returns[:5], color=['red' if x < 0 else 'green' for x in dow_returns[:5]])
axes[0].set_xticks(range(5))
axes[0].set_xticklabels(day_names)
axes[0].axhline(y=0, color='black', linestyle='-', linewidth=0.5)
axes[0].set_title('Average Return by Day of Week', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Mean Return (%)')

# Monthly effect
df['month'] = df.index.month
month_returns = df.groupby('month')['returns'].mean() * 100

axes[1].bar(range(1, 13), month_returns, color=['red' if x < 0 else 'green' for x in month_returns])
axes[1].set_xticks(range(1, 13))
axes[1].set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
axes[1].axhline(y=0, color='black', linestyle='-', linewidth=0.5)
axes[1].set_title('Average Return by Month', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Mean Return (%)')

plt.tight_layout()
plt.show()

## 8. Summary Statistics

In [None]:
# Key statistics for model training
print("="*50)
print("MBB STOCK DATA SUMMARY")
print("="*50)
print(f"Period: {df.index.min().strftime('%Y-%m-%d')} to {df.index.max().strftime('%Y-%m-%d')}")
print(f"Total trading days: {len(df)}")
print(f"\nPrice Range:")
print(f"  Min: {df['close'].min():.2f}")
print(f"  Max: {df['close'].max():.2f}")
print(f"  Current: {df['close'].iloc[-1]:.2f}")
print(f"\nDaily Returns:")
print(f"  Mean: {df['returns'].mean()*100:.4f}%")
print(f"  Std: {df['returns'].std()*100:.4f}%")
print(f"  Annualized Return: {df['returns'].mean()*252*100:.2f}%")
print(f"  Annualized Volatility: {df['returns'].std()*np.sqrt(252)*100:.2f}%")
print(f"\nCorrelation with VNINDEX: {merged['MBB'].corr(merged['VNINDEX']):.4f}")
print(f"Correlation with VN30: {merged['MBB'].corr(merged['VN30']):.4f}")
print("="*50)

In [None]:
# Save clean data for feature engineering
df_clean = df[['open', 'high', 'low', 'close', 'volume']].copy()
df_clean.to_csv('../data/processed/mbb_clean.csv')
print("Clean data saved to data/processed/mbb_clean.csv")