In [4]:
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# `src` is a project-local package, so it can only be imported when the project
# root is on sys.path. A kernel started from a sub-folder does not have it there
# (the recorded run of this cell failed with "No module named 'src'").
# Walk up from the working directory until the folder that holds
# src/data_loader.py is found, then register that folder.
_working_dir = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_working_dir, *_working_dir.parents)
        if (candidate / "src" / "data_loader.py").is_file()
    ),
    None,
)
# When nothing is found (e.g. `src` is installed as a package) sys.path is left
# untouched and the import below succeeds or fails exactly as it did before.
if PROJECT_ROOT is not None and str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.data_loader import (
    load_basketball_data_complete,
    get_dataset_info,
    explore_table_schema,
    load_with_sql_query
)

# Later cells save figures under this relative path; matplotlib does not create
# missing folders, so make sure it exists before any plt.savefig call runs.
Path('../results/figures').mkdir(parents=True, exist_ok=True)

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

print("✓ Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

ModuleNotFoundError: No module named 'src'

In [None]:
# Stage banner: a blank line, a 60-character rule, the heading, the rule again.
rule = "=" * 60
print("\n" + rule, "STEP 1: LOADING BASKETBALL DATASET", rule, sep="\n")

# Fetch the Kaggle basketball dataset and read it out of its SQLite database.
# The helper hands back the loaded frame together with the dataset location.
# NOTE(review): min_games=20 presumably filters out low-sample players - confirm
# against src/data_loader.py.
df, dataset_path = load_basketball_data_complete(min_games=20)

print("\n✓ Dataset loaded successfully from SQLite database!")
print(f"Dataset location: {dataset_path}")


STEP 1: LOADING BASKETBALL DATASET

NBA PLAYER PERFORMANCE PREDICTION - DATA LOADING
=== DOWNLOADING BASKETBALL DATASET ===
[SUCCESS] Dataset downloaded to: /Users/nehemiahshannon/.cache/kagglehub/datasets/wyattowalsh/basketball/versions/231
[SUCCESS] Found database: /Users/nehemiahshannon/.cache/kagglehub/datasets/wyattowalsh/basketball/versions/231/nba.sqlite

=== BASKETBALL DATABASE TABLES ===
[SUCCESS] Found database: /Users/nehemiahshannon/.cache/kagglehub/datasets/wyattowalsh/basketball/versions/231/nba.sqlite
  * game                             65,698 rows  |   55 columns
  * game_summary                     58,110 rows  |   14 columns
  * other_stats                      28,271 rows  |   26 columns
  * officials                        70,971 rows  |    5 columns
  * inactive_players                110,191 rows  |    9 columns
  * game_info                        58,053 rows  |    4 columns
  * line_score                       58,053 rows  |   43 columns
  * play_by_play      

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x10b03eb50>>
Traceback (most recent call last):
  File "/Users/nehemiahshannon/nba-player-performance-prediction/.venv/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [2]:
# Stage banner: a blank line, a 60-character rule, the heading, the rule again.
rule = "=" * 60
print("\n" + rule, "STEP 2: INITIAL DATA INSPECTION", rule, sep="\n")

# Summary report produced by the project helper
get_dataset_info(df)

# Preview both ends of the frame: the first five rows, then the last five.
previews = (
    ("\n=== FIRST 5 ROWS ===", df.head),
    ("\n=== LAST 5 ROWS ===", df.tail),
)
for heading, take_rows in previews:
    print(heading)
    print(take_rows())


STEP 2: INITIAL DATA INSPECTION


NameError: name 'get_dataset_info' is not defined

In [None]:
# The plot is only meaningful when the frame carries a 'position' column;
# otherwise this cell silently does nothing.
has_position = 'position' in df.columns

if has_position:
    fig, ax = plt.subplots(figsize=(12, 6))

    # One violin per position, showing the spread of the target within it
    sns.violinplot(data=df, x='position', y=target_col, ax=ax)
    ax.set_xlabel('Position', fontsize=12)
    ax.set_ylabel('Points Per Game', fontsize=12)
    ax.set_title('Points Distribution by Position', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='y')

    fig.savefig('../results/figures/points_by_position.png', dpi=300, bbox_inches='tight')
    plt.show()

# WHY THIS MATTERS:
# - Expectation to check: guards typically score more than centers
# - Indicates whether position is a useful predictor
# - Surfaces position-specific scoring patterns

In [None]:
# Correlation is only defined for numeric data, so restrict to those columns
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)

# Pairwise correlation between every pair of numeric columns
correlation_matrix = df.loc[:, numeric_cols].corr()

# Heatmap styling: diverging palette centred on zero, square cells,
# thin separators, slightly shrunken colour bar, no per-cell annotations.
heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Heatmap', fontsize=16, fontweight='bold')
fig.tight_layout()
fig.savefig('../results/figures/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# WHY THIS MATTERS:
# - Reveals which features move together with the target
# - Flags multicollinearity (pairs of highly correlated features)
# - Informs feature selection

In [None]:
# Correlation of every numeric column with the target, largest value first.
# This series still contains the target itself (r = 1.0) and any NaN entries;
# sort_values places NaN last.
target_correlations = correlation_matrix[target_col].sort_values(ascending=False)

# Feature-only view used for reporting and plotting:
# - the target is removed by label, not by position, so a feature that ties
#   with it at r = 1.0 cannot be dropped in its place;
# - NaN correlations (e.g. constant columns) are removed so they do not fill
#   the "bottom" listing.
feature_correlations = target_correlations.drop(labels=target_col).dropna()

print(f"\nTop 10 features correlated with {target_col}:")
print(feature_correlations.head(10))

print(f"\nBottom 10 features correlated with {target_col}:")
print(feature_correlations.tail(10))

# Visualize top correlations
TOP_N = 15
top_corr = feature_correlations.head(TOP_N)  # may hold fewer than TOP_N entries

plt.figure(figsize=(10, 8))
plt.barh(range(len(top_corr)), top_corr.values)
plt.yticks(range(len(top_corr)), top_corr.index)
plt.xlabel('Correlation with Points Per Game', fontsize=12)
plt.ylabel('Features', fontsize=12)
# The title reports the number of bars actually drawn
plt.title(f'Top {len(top_corr)} Features Correlated with Target', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('../results/figures/top_correlations.png', dpi=300, bbox_inches='tight')
plt.show()

# WHY THIS MATTERS:
# - Identifies most predictive features
# - Guides feature engineering
# - Helps understand what drives performance

In [None]:
# Pick up to four features with the highest correlation to the target.
# The target is removed by label (ignored if it is already absent) and NaN
# correlations are discarded, so the selection does not rely on the target
# sitting at position 0 of the sorted series.
top_features = (
    target_correlations
    .drop(labels=target_col, errors='ignore')
    .dropna()
    .head(4)
    .index.tolist()
)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

# One panel per selected feature: feature on x, target on y
for ax, feature in zip(axes, top_features):
    ax.scatter(df[feature], df[target_col], alpha=0.5, edgecolors='k', linewidth=0.3)
    ax.set_xlabel(feature, fontsize=11)
    ax.set_ylabel(target_col, fontsize=11)
    ax.set_title(f'{feature} vs {target_col}', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # Annotate the panel with the correlation between the feature and the target
    corr = df[feature].corr(df[target_col])
    ax.text(0.05, 0.95, f'r = {corr:.3f}',
            transform=ax.transAxes,
            fontsize=10, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Hide panels left over when fewer than four features qualify
for unused_ax in axes[len(top_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('../results/figures/scatter_plots.png', dpi=300, bbox_inches='tight')
plt.show()

# WHY THIS MATTERS:
# - Visual check of linear relationships
# - Identifies non-linear patterns
# - Spots outliers and clusters

In [None]:
# Null count per column, and that count as a percentage of all rows
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

# Tabulate, keep only columns that have at least one null, worst column first
missing_df = (
    pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Count': missing_values.values,
        'Missing_Percent': missing_percent.values,
    })
    .loc[lambda table: table['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

if missing_df.empty:
    print("\nNo missing values found!")
else:
    print("\nColumns with missing values:")
    print(missing_df)

    # Horizontal bar per affected column, length = percentage of rows missing
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(missing_df['Column'], missing_df['Missing_Percent'])
    ax.set_xlabel('Percentage Missing (%)', fontsize=12)
    ax.set_ylabel('Columns', fontsize=12)
    ax.set_title('Missing Values by Column', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')
    fig.tight_layout()
    fig.savefig('../results/figures/missing_values.png', dpi=300, bbox_inches='tight')
    plt.show()

# WHY THIS MATTERS:
# - Determines the preprocessing strategy
# - Exposes data quality issues
# - Guides the handling approach (drop, impute, etc.)