
This notebook addresses data preparation for a baseball player hits prediction model for the next season. To prevent data leakage, we shift each player's hits data to the next season. This method ensures our predictions rely only on pre-season information, enhancing accuracy. The adjusted dataset is then saved for modeling.

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load full_batting_data.csv
df = pd.read_csv('../resources/full_batting_data.csv')

In [5]:
# Keep only necessery columns - IDfg,Season,Name,Age,PA,H,HR,R,RBI,SB,BB,SO,AVG,OBP,SLG,OPS,wOBA,WAR,BABIP,LD%,GB%,Contact%,SwStr%,Barrel%,EV,maxEV

data = df.loc[:, ['IDfg', 'Season', 'Name', 'Age', 'PA', 'H', 'HR', 'R', 'RBI', 'SB', 'BB', 'SO', 'AVG', 'OBP', 'SLG', 'OPS', 'wOBA', 'WAR', 'BABIP', 'LD%', 'GB%', 'Contact%', 'SwStr%', 'Barrel%', 'EV', 'maxEV']]


In [6]:
data.head()

Unnamed: 0,IDfg,Season,Name,Age,PA,H,HR,R,RBI,SB,...,wOBA,WAR,BABIP,LD%,GB%,Contact%,SwStr%,Barrel%,EV,maxEV
0,13611,2018,Mookie Betts,25,614,180,32,129,80,30,...,0.449,10.4,0.368,0.212,0.339,0.859,0.05,0.131,92.3,110.6
1,10155,2018,Mike Trout,26,608,147,39,101,79,24,...,0.447,9.5,0.346,0.234,0.313,0.841,0.06,0.153,91.2,118.0
2,11579,2015,Bryce Harper,22,654,172,42,118,99,6,...,0.461,9.3,0.369,0.222,0.385,0.754,0.108,0.122,91.4,116.0
3,10155,2015,Mike Trout,23,682,172,41,104,90,11,...,0.415,9.3,0.344,0.244,0.372,0.8,0.075,0.161,92.9,117.7
4,15640,2017,Aaron Judge,25,678,154,52,128,114,9,...,0.43,8.7,0.357,0.219,0.349,0.676,0.133,0.249,94.9,121.1


In [7]:
# data = pd.read_csv('resources/dataset.csv')
# data.head()

In [8]:
data_len = len(data)
data_len
    

3310

In [9]:
next_hits = np.empty((data_len, 1))
next_hits[:, 0] = np.nan
next_hits

array([[nan],
       [nan],
       [nan],
       ...,
       [nan],
       [nan],
       [nan]])

In [10]:
for row in range(data_len):
    season = data.loc[row,"Season"]
    id = data.loc[row,"IDfg"]
    
    next_season = season + 1
    
    # print(f'season: {season}, id: {id}, next season={next_season}')
    
    intersect = ((data['IDfg']==id) & (data['Season']==next_season))
    
    if np.any(intersect):
        next_hits[row, 0] = data.loc[intersect,"H"]
        


  next_hits[row, 0] = data.loc[intersect,"H"]


In [11]:
np.savetxt('../next_hits.csv', next_hits, delimiter=",")

In [12]:
data['next_year_hits'] = next_hits

In [13]:
data.to_csv('../resources/data2.csv', na_rep='NULL')