# Baseball Pitch Prediction - Exploratory Data Analysis

This notebook explores the MLB pitch data collected for our prediction model. We'll analyze patterns in pitch selection, game situations, and player tendencies.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set style for plots.
# The bare 'seaborn' matplotlib style was renamed to 'seaborn-v0_8' in
# matplotlib 3.6 and the old name was removed in 3.8, where requesting it
# raises OSError. Pick whichever name this matplotlib installation provides
# so the notebook runs on both old and new versions.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

# Configure plot settings (default figure size in inches and resolution)
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100

## 1. Load and Examine the Data

In [None]:
# Read the processed modeling dataset into a DataFrame
data = pd.read_csv('data/processed/modeling_data_2021_to_2023.csv')

# Report the frame's dimensions and column names, then end the cell on
# describe() so the notebook renders the summary table as rich output
print(f"Dataset Shape: {data.shape}")
print("\nFeature Names:", list(data.columns), sep="\n")
print("\nBasic Statistics:")
data.describe()

## 2. Pitch Type Distribution Analysis

In [None]:
# Bar chart of how many rows carry each value of the is_fastball flag
plt.figure(figsize=(10, 6))
sns.countplot(x='is_fastball', data=data)
plt.xlabel('Pitch Type (0=Offspeed, 1=Fastball)')
plt.ylabel('Count')
plt.title('Distribution of Fastballs vs Offspeed Pitches')

# The mean of the 0/1 flag scaled by 100 is the fastball share in percent;
# the offspeed share is its complement
fb_pct = 100 * data['is_fastball'].mean()
print(f"Fastball Percentage: {fb_pct:.2f}%")
print(f"Offspeed Percentage: {100-fb_pct:.2f}%")

## 3. Game Situation Analysis

In [None]:
# Fastball rate for each ball-strike count, ordered '0-0', '0-1', ... '3-2'
# (balls 0-3 in the outer position, strikes 0-2 in the inner position)
counts = [f'{balls}-{strikes}' for balls in range(4) for strikes in range(3)]

# Mean of the is_fastball flag per count, put into the order above and
# expressed in percent; counts absent from the data become NaN via reindex
fb_by_count = data.groupby('count')['is_fastball'].mean().reindex(counts).mul(100)

plt.figure(figsize=(15, 6))
sns.barplot(x=fb_by_count.index, y=fb_by_count.to_numpy())
plt.xlabel('Count')
plt.ylabel('Fastball %')
plt.title('Fastball Percentage by Count')
plt.xticks(rotation=45)

## 4. Pitcher Tendencies Analysis

In [None]:
# Histogram of the pitcher_fb_pct column over every row of the dataset
# NOTE(review): if the data holds one row per pitch, each pitcher is weighted
# by how many pitches they threw — confirm that is the intended view
plt.figure(figsize=(12, 6))
sns.histplot(x='pitcher_fb_pct', bins=30, data=data)
plt.xlabel('Fastball Percentage')
plt.ylabel('Count')
plt.title('Distribution of Pitcher Fastball Percentages')

# Numeric summary (count, mean, std, quartiles) of the same column
print("Pitcher Fastball Percentage Statistics:")
print(data['pitcher_fb_pct'].describe())

## 5. Count Situation Analysis

In [None]:
# Fastball share (in percent) among the rows where each situation flag equals 1
situations = ['hitter_count', 'pitcher_count', 'neutral_count']
fb_by_situation = [
    data.loc[data[flag] == 1, 'is_fastball'].mean() * 100
    for flag in situations
]

# One bar per situation flag
plt.figure(figsize=(10, 6))
sns.barplot(x=situations, y=fb_by_situation)
plt.xlabel('Situation')
plt.ylabel('Fastball %')
plt.title('Fastball Usage by Count Situation')
plt.xticks(rotation=45)

## 6. Sequence Analysis

In [None]:
# Analyze how previous pitch affects next pitch selection.
# normalize='index' makes every row sum to 1, so after scaling by 100 a row
# reads as "given this previous pitch type, the percentage of next pitches
# of each type".
prev_next = pd.crosstab(data['prev_is_fastball'], data['is_fastball'], normalize='index') * 100

# DataFrame.plot opens its own figure unless it is handed an Axes. A bare
# plt.figure(...) before it would therefore leave a blank figure behind and
# the requested size would be ignored, so draw on an explicit Axes instead.
fig, ax = plt.subplots(figsize=(10, 6))
prev_next.plot(kind='bar', ax=ax, rot=0)  # rot=0 keeps the x tick labels horizontal
ax.set_title('Next Pitch Type Based on Previous Pitch')
ax.set_xlabel('Previous Pitch (0=Offspeed, 1=Fastball)')
ax.set_ylabel('Percentage')
# Legend labels follow the crosstab column order (is_fastball 0, then 1).
# NOTE(review): assumes is_fastball only takes the values 0 and 1 — confirm
ax.legend(['Offspeed', 'Fastball'])
plt.show()

## 7. Batter Performance Analysis

In [None]:
# Side-by-side box plots of the two batter success-rate columns
plt.figure(figsize=(12, 6))
success_columns = ['batter_success_vs_fb', 'batter_success_vs_os']
data[success_columns].boxplot()
plt.ylabel('Success Rate')
plt.title('Batter Success Rates Against Different Pitch Types')

# Column means, printed to three decimals
print("Average Success Rates:")
print(f"vs Fastball: {data['batter_success_vs_fb'].mean():.3f}")
print(f"vs Offspeed: {data['batter_success_vs_os'].mean():.3f}")

## 8. Correlation Analysis

In [None]:
# Pairwise correlations between all numeric columns of the dataset
correlation_matrix = data.select_dtypes(include='number').corr()

# Annotated heatmap with a diverging palette centered on zero, so positive
# and negative correlations get opposite colors
plt.figure(figsize=(15, 12))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    center=0,
    annot=True,
)
plt.title('Feature Correlation Matrix')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)