# Baseball Statistics: Data Preparation

Baseball is a game full of statistics, and most of those statistics have been consistently and carefully tracked going back to the late 1800s. That makes professional baseball a playground for data analysts. Here I look at interesting correlations between players, their stats, and their salaries.

**Data Source:** [Lahman's Baseball Database](http://www.seanlahman.com/baseball-archive/statistics/). The version of the data set used here covers seasons through 2018.

Copyright © 2019 Ken Norton (ken@kennethnorton.com)

In [6]:
import sys
import logging
import datetime as dt

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [7]:
# Configure matplotlib: start from the library defaults, then layer the
# notebook's preferred figure, line and font sizes on top in a single update.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({
    'figure.figsize': (10, 7),
    'lines.linewidth': 3,
    'figure.titlesize': 26,
    'axes.labelsize': 18,
    'axes.titlesize': 22,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 16,
})

# Style sheets are applied left to right, so later entries win on conflicts.
# NOTE(review): this runs after the overrides above, and the 'default' style
# appears to reset rcParams, so the sizes set above are probably replaced by
# the style sheets' own values -- confirm whether that order is intended.
# NOTE(review): newer matplotlib releases seem to have renamed
# 'seaborn-poster' to 'seaborn-v0_8-poster' -- verify against the installed
# version.
plt.style.use(['default', 'fivethirtyeight', 'seaborn-poster'])

In [8]:
# Load the CSV tables from the local data/ directory into DataFrames.
# (Presumably these are the Lahman database exports named in the
# introduction -- the files themselves are not visible from this notebook.)

# Player-level tables
people = pd.read_csv('data/People.csv')
batting = pd.read_csv('data/Batting.csv')
pitching = pd.read_csv('data/Pitching.csv')
fielding = pd.read_csv('data/Fielding.csv')

# Team, salary and Hall of Fame tables
teams = pd.read_csv('data/Teams.csv')
salaries = pd.read_csv('data/Salaries.csv')
hof = pd.read_csv('data/HallofFame.csv')

In [9]:
# Correlation coefficient between two numeric samples
def correlation(x, y):
    """Return the mean of the products of the standardized values of x and y.

    Each input is centred on its own mean and scaled by its own population
    standard deviation (``ddof=0``); the result is the mean of the
    element-wise product of those standardized values, i.e. Pearson's r
    for complete, equally sized samples.

    Both arguments must provide ``.mean()`` and ``.std(ddof=...)`` and
    support element-wise arithmetic (for example NumPy arrays or pandas
    Series).
    """
    centered_x = x - x.mean()
    centered_y = y - y.mean()

    z_x = centered_x / x.std(ddof=0)
    z_y = centered_y / y.std(ddof=0)

    products = z_x * z_y
    return products.mean()

In [10]:
# Giants-themed plotting helper: draws a filled curve in the team's orange.
def sfgplot(x, y, xtitle, ytitle):
    """Draw y against x on the current axes as an orange line with orange fill.

    Labels the x and y axes with ``xtitle`` and ``ytitle``, fills the region
    between the curve and matplotlib's default ``fill_between`` baseline, and
    then draws the curve itself with a line width of 5. Returns nothing.
    """
    giants_orange = '#FD5A1E'
    axes = plt.gca()

    axes.set_xlabel(xtitle)
    axes.set_ylabel(ytitle)

    # Fill first, then the outline, matching the original drawing order.
    axes.fill_between(x, y, facecolor=giants_orange)
    axes.plot(x, y, color=giants_orange, linewidth=5)