In [None]:
# %% [markdown]
# # Bitcoin Price Prediction - Exploratory Data Analysis (EDA)
# 
# This notebook performs exploratory data analysis on the Bitcoin dataset to understand:
# - Data structure and quality
# - Key trends and patterns
# - Feature distributions and relationships
# - Missing value treatment

# %% [markdown]
# ## 1. Setup and Data Loading

# %%
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from src.data_processing import load_data, preprocess_data
from src.visualization import set_plot_style

# Set style for plots.
# set_plot_style comes from the project's src.visualization module; it is called
# once here, ahead of every plotting cell, so all figures below pick it up.
# NOTE(review): what exactly it configures is not visible from this notebook.
set_plot_style()

# %%
# Read the raw dataset through the project loader
data = load_data('../data/bitcoin_dataset.csv')

# First look: size, leading rows, column names and dtypes
print("Dataset shape:", data.shape)

print("\nFirst 5 rows:")
display(data.head())

column_names = data.columns.tolist()
print("\nData columns:")
print(column_names)

column_types = data.dtypes
print("\nData types:")
print(column_types)

# %% [markdown]
# ## 2. Data Quality Assessment

# %%
# Count nulls per column and list only the columns that actually have some
print("Missing values per column:")
null_mask = data.isnull()
missing_values = null_mask.sum()
print(missing_values[missing_values > 0])

# Heatmap of the boolean null mask (rows x columns) to show where gaps sit
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

# %%
# Run the project's preprocessing step on the raw frame
data_clean = preprocess_data(data)

# Total number of null cells still present after preprocessing
remaining_nulls = data_clean.isnull().sum().sum()
print("\nMissing values after treatment:")
print(remaining_nulls)

# %% [markdown]
# ## 3. Univariate Analysis

# %%
# Descriptive statistics for the cleaned frame.
# display() is the notebook's rich renderer, so the summary table is shown
# as a formatted table rather than plain text.
print("Descriptive Statistics:")
display(data_clean.describe())

# %%
# Plot distributions of key features
def plot_distributions(data, columns, ncols=3):
    """Draw one histogram (with KDE overlay) per column on a grid of subplots.

    Parameters
    ----------
    data : mapping of column name -> values (e.g. a DataFrame)
        Source of the series; each entry of ``columns`` is looked up as
        ``data[col]``.
    columns : iterable of str
        Column names to plot. An empty iterable is a no-op.
    ncols : int, default 3
        Number of subplot columns; rows are added as needed.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    columns = list(columns)
    if not columns:
        # Nothing to draw; plt.subplots would reject a zero-row grid.
        return

    nrows = int(np.ceil(len(columns) / ncols))
    # squeeze=False always yields a 2-D array of Axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, nrows*4), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, columns):
        sns.histplot(data[col], ax=ax, kde=True)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel('')

    # Hide empty subplots left over in the last row
    for unused_ax in axes[len(columns):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# Select numeric columns only (non-numeric columns are left out by select_dtypes)
numeric_cols = data_clean.select_dtypes(include=[np.number]).columns.tolist()

# Plot in batches of 9 (one 3x3 grid per figure). Iterating over the batches
# means no call is ever made with an empty list, which happened with the fixed
# [:9] / [9:] split whenever there were nine or fewer numeric columns.
for batch_start in range(0, len(numeric_cols), 9):
    plot_distributions(data_clean, numeric_cols[batch_start:batch_start + 9])

# %% [markdown]
# ## 4. Time Series Analysis

# %%
# Convert Date to datetime.
# assign() builds a new frame (the Date column keeps its position) instead of
# writing into the object returned by preprocess_data, so re-running this cell
# is harmless and no chained-assignment warning can be triggered.
data_clean = data_clean.assign(Date=pd.to_datetime(data_clean['Date']))

# %%
# Plot time series of key metrics
def plot_ts(data, cols, titles=None, ncols=2):
    """Plot each column against ``data['Date']`` on a grid of subplots.

    Parameters
    ----------
    data : mapping of column name -> values (e.g. a DataFrame)
        Must provide a ``'Date'`` entry used as the x axis, plus every name
        in ``cols``.
    cols : iterable of str
        Column names to plot. An empty iterable is a no-op.
    titles : iterable of str, optional
        One subplot title per column; defaults to the column names. Columns
        and titles are paired positionally, and pairing stops at the shorter
        of the two.
    ncols : int, default 2
        Number of subplot columns; rows are added as needed.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    cols = list(cols)
    if titles is None:
        titles = cols

    panels = list(zip(cols, titles))
    if not panels:
        # Nothing to draw; plt.subplots would reject a zero-row grid.
        return

    nrows = int(np.ceil(len(cols) / ncols))
    # squeeze=False always yields a 2-D array of Axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, nrows*4), squeeze=False)
    axes = axes.flatten()

    for ax, (col, title) in zip(axes, panels):
        ax.plot(data['Date'], data[col])
        ax.set_title(title)
        ax.set_xlabel('Date')
        ax.set_ylabel('Value')
        ax.grid(True)

    # Hide empty subplots that received no series
    for unused_ax in axes[len(panels):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# Key metrics to plot, kept as column -> panel title so each pair stays together
ts_metrics = {
    'btc_market_price': 'Market Price (USD)',
    'btc_total_bitcoins': 'Total Bitcoins in Circulation',
    'btc_trade_volume': 'Trade Volume',
    'btc_blocks_size': 'Blockchain Size',
    'btc_avg_block_size': 'Average Block Size',
    'btc_n_transactions_per_block': 'Transactions per Block',
}

# Dicts preserve insertion order, so both lists line up positionally
ts_cols = list(ts_metrics.keys())
ts_titles = list(ts_metrics.values())

plot_ts(data_clean, ts_cols, ts_titles)

# %% [markdown]
# ## 5. Correlation Analysis

# %%
# Pairwise correlations between the numeric columns
numeric_frame = data_clean.select_dtypes(include=[np.number])
corr_matrix = numeric_frame.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once
upper_mask = np.triu(np.ones_like(corr_matrix))

fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(
    corr_matrix,
    mask=upper_mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title("Feature Correlation Matrix (Lower Triangle)")
plt.show()

# %%
# Top correlations with target (market price)
target = 'btc_market_price'

# Remove the target's self-correlation by label rather than by position:
# slicing off the first sorted entry is wrong whenever another feature ties
# at 1.0 and happens to sort ahead of the target itself.
top_correlations = (
    corr_matrix[target]
    .drop(labels=target)
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_correlations.values, y=top_correlations.index, palette='viridis')
plt.title(f"Top 10 Features Correlated with {target}")
plt.xlabel("Correlation Coefficient")
plt.ylabel("Feature")
plt.show()

# %% [markdown]
# ## 6. Feature Relationships

# %%
# Scatter plots of key relationships
def plot_scatter(data, x_cols, y_cols, titles=None):
    """Draw a row of scatter plots, one per (x, y) column pair.

    Parameters
    ----------
    data : DataFrame-like
        Passed to ``sns.scatterplot`` as ``data``; must contain the named
        columns.
    x_cols, y_cols : iterable of str
        Column names for the x and y axes, paired positionally. Pairing stops
        at the shorter of the two. Empty input is a no-op.
    titles : iterable of str, optional
        One title per panel; defaults to ``"<y> vs <x>"`` for each pair.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    x_cols = list(x_cols)
    y_cols = list(y_cols)
    if titles is None:
        titles = [f"{y} vs {x}" for x, y in zip(x_cols, y_cols)]

    panels = list(zip(x_cols, y_cols, titles))
    if not panels:
        # Nothing to draw; plt.subplots would reject a zero-column grid.
        return

    # squeeze=False always returns a 2-D array of Axes, so the single-panel
    # case needs no special handling.
    fig, axes = plt.subplots(1, len(x_cols), figsize=(15, 5), squeeze=False)
    axes = axes.ravel()

    for ax, (x, y, title) in zip(axes, panels):
        sns.scatterplot(data=data, x=x, y=y, ax=ax, alpha=0.6)
        ax.set_title(title)
        ax.set_xlabel(x)
        ax.set_ylabel(y)

    # Hide any panel that received no pair (y_cols or titles shorter than x_cols)
    for unused_ax in axes[len(panels):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# Plot key relationships
plot_scatter(
    data_clean,
    x_cols=['btc_total_bitcoins', 'btc_avg_block_size'],
    y_cols=['btc_difficulty', 'btc_n_transactions_per_block'],
    titles=[
        "Mining Difficulty vs Total Bitcoins",
        "Transactions per Block vs Avg Block Size"
    ]
)

# %% [markdown]
# ## 7. Outlier Detection

# %%
# Boxplots for outlier detection
def plot_boxplots(data, columns, ncols=3):
    """Draw one vertical boxplot per column on a grid of subplots.

    Parameters
    ----------
    data : mapping of column name -> values (e.g. a DataFrame)
        Source of the series; each entry of ``columns`` is looked up as
        ``data[col]``.
    columns : iterable of str
        Column names to plot. An empty iterable is a no-op.
    ncols : int, default 3
        Number of subplot columns; rows are added as needed.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    columns = list(columns)
    if not columns:
        # Nothing to draw; plt.subplots would reject a zero-row grid.
        return

    nrows = int(np.ceil(len(columns) / ncols))
    # squeeze=False always yields a 2-D array of Axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, nrows*4), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, columns):
        sns.boxplot(y=data[col], ax=ax)
        ax.set_title(f'Boxplot of {col}')
        ax.set_ylabel('Value')

    # Hide empty subplots left over in the last row
    for unused_ax in axes[len(columns):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# Plot in batches of 9 (one 3x3 grid per figure). Iterating over the batches
# means no call is ever made with an empty list, which happened with the fixed
# [:9] / [9:] split whenever there were nine or fewer numeric columns.
for batch_start in range(0, len(numeric_cols), 9):
    plot_boxplots(data_clean, numeric_cols[batch_start:batch_start + 9])

# %% [markdown]
# ## Key Findings
# 
# 1. **Data Quality**:
#    - Identified and treated missing values using forward fill
#    - No remaining missing values in the dataset
# 
# 2. **Distributions**:
#    - Most features show right-skewed distributions
#    - Some features (like market price) show exponential growth patterns
# 
# 3. **Time Series Patterns**:
#    - Clear upward trends in market price and total bitcoins
#    - Seasonal patterns in some metrics like trade volume
# 
# 4. **Correlations**:
#    - Strong positive correlations between market price and:
#      - Trade volume
#      - Hash rate
#      - Transaction fees
#    - Some expected relationships like:
#      - Difficulty vs total bitcoins
#      - Block size vs transactions