In [19]:
import numpy as np
import pandas as pd
import math

In [25]:
df_ball = pd.read_csv("Data/ball_pos_event.csv")
player = pd.read_csv('player&ball_pos_analysis/player_pos_analysis.csv')

In [21]:
df_ball.head(100)

Unnamed: 0,playID,playPerGame,gameID,ballPositionX,ballPositionY,ballPositionZ,ballTimeStamp,eventCode
0,23916,3,1,-0.335508,54.21180,5.79471,38518,1.0
1,23916,3,1,-0.227452,47.68860,5.45637,38568,
2,23916,3,1,-0.135159,41.22240,5.07717,38618,
3,23916,3,1,-0.058631,34.81290,4.65711,38668,
4,23916,3,1,0.002134,28.46016,4.19616,38718,
...,...,...,...,...,...,...,...,...
95,7027,14,1,-0.062554,12.68445,7.00338,419518,
96,7027,14,1,-0.216447,18.05004,9.55638,419568,
97,7027,14,1,-0.382056,23.34888,12.01359,419618,
98,7027,14,1,-0.559287,28.58178,14.37576,419668,


In [31]:
player.sample(10)

Unnamed: 0,playID,playPerGame,gameID,playerNum,posCode,posName,playerTimeStamp,fieldX,fieldY
8725,672,42,40,0,13,Runner on Third Base,7154622,48,96
291639,22433,21,57,6158,3,First Baseman,1029489,-10,153
271128,20856,96,68,0,12,Runner on Second Base,4400939,-22,308
304432,23417,150,64,9639,5,Third Baseman,4600389,133,247
116232,8941,173,32,1972,6,ShortStop,730689,60,139
314959,24227,74,96,0,12,Runner on Second Base,2072358,-138,256
208295,16023,177,49,6240,3,First Baseman,6507072,18,150
7867,606,43,68,8840,1,Pitcher,5283172,-125,269
175847,13527,85,41,1802,9,Right Field,916258,129,240
77758,5981,179,80,6188,9,Right Field,5131272,61,64


In [4]:
# Calculate travel distance
def travel_distance(x1, y1, z1, x2, y2, z2):
    return np.sqrt((x2 - x1)**2 + (y2 - y1)**2 + (z2 - z1)**2)

# Dictionary to store results for each playID
results = {}

# The start of the ball pos would be when the eventCode = 4, 
# and the end of the ball pos would be when the eventCode = 2 or 11
for playID, play_data in df_ball.groupby('playID'):
    event_codes = play_data['eventCode'].values
    
    start_idx = np.where(event_codes == 4)[0][0] if 4 in event_codes else None
    end_idx = next((i for i, code in enumerate(event_codes) if code in [2, 11]), None)
    
    if start_idx is not None and end_idx is not None:
        positions = play_data[['ballPositionX', 'ballPositionY', 'ballPositionZ']].values
        timestamps = play_data['ballTimeStamp'].values / 1000  # Convert to seconds
        
        positions = positions[start_idx:end_idx+1]
        timestamps = timestamps[start_idx:end_idx+1]
        
        speeds = []
        for i in range(1, len(positions)):
            distance = travel_distance(*positions[i-1], *positions[i])
            time_interval = timestamps[i] - timestamps[i-1]
            
            if time_interval > 0:  # Avoid division by zero
                speed = distance / time_interval
                speeds.append(speed)
        
        total_distance = sum(travel_distance(*pos1, *pos2) for pos1, pos2 in zip(positions[:-1], positions[1:]))
        
        results[playID] = {
            'Total Distance': total_distance,
            'Average Ball Speed': np.mean(speeds) if speeds else 0  # Set to 0 if no speeds available
        }
    else:
        results[playID] = {
            'Total Distance': 0,
            'Average Ball Speed': 0
        }

output_df = pd.DataFrame.from_dict(results, orient='index')

# Reset the index and rename the columns
output_df.reset_index(inplace=True)
output_df.rename(columns={'index': 'playID'}, inplace=True)

print(output_df)

      playID  Total Distance  Average Ball Speed
0          6      146.409737           69.718922
1          8        0.000000            0.000000
2          9        0.000000            0.000000
3         10        0.000000            0.000000
4         11        0.000000            0.000000
...      ...             ...                 ...
9054   26297      142.418452           50.863733
9055   26298        0.000000            0.000000
9056   26300        0.000000            0.000000
9057   26302      350.872622           71.826088
9058   26303      189.547407           36.805322

[9059 rows x 3 columns]


In [7]:
output_df

Unnamed: 0,playID,Total Distance,Average Ball Speed
0,6,146.409737,69.718922
1,8,0.000000,0.000000
2,9,0.000000,0.000000
3,10,0.000000,0.000000
4,11,0.000000,0.000000
...,...,...,...
9054,26297,142.418452,50.863733
9055,26298,0.000000,0.000000
9056,26300,0.000000,0.000000
9057,26302,350.872622,71.826088


In [64]:

# Function to calculate Euclidean distance between two points
def travel_distance(x1, y1, z1, x2, y2, z2):
    return np.sqrt((x2 - x1)**2 + (y2 - y1)**2 + (z2 - z1)**2)

# Dictionary to store results for each playID
results = {}

# The start of the ball pos would be when the eventCode = 4, 
# and the end of the ball pos would be when the eventCode = 2 or 11
for playID, play_data in df_ball.groupby('playID'):
    event_codes = play_data['eventCode'].values
    
    start_idx = np.where(event_codes == 4)[0][0] if 4 in event_codes else None
    end_idx = next((i for i, code in enumerate(event_codes) if code in [2, 11]), None)
    
    if start_idx is not None and end_idx is not None:
        positions = play_data[['ballPositionX', 'ballPositionY', 'ballPositionZ']].values
        timestamps = play_data['ballTimeStamp'].values / 1000  # Convert to seconds
        
        positions = positions[start_idx:end_idx+1]
        timestamps = timestamps[start_idx:end_idx+1]
        
        speeds = []
        accelerations = []  # New list to store acceleration values
        
        if len(positions) > 1:  # Ensure at least two positions for initial acceleration calculation
            distance = travel_distance(*positions[0], *positions[1])
            time_interval = timestamps[1] - timestamps[0]
            
            if time_interval > 0:  # Avoid division by zero
                speed = distance / time_interval
                speeds.append(speed)
                
                # Calculate initial acceleration as change in speed over time
                initial_acceleration = speed / time_interval
                accelerations.append(initial_acceleration)
        
        total_distance = sum(travel_distance(*pos1, *pos2) for pos1, pos2 in zip(positions[:-1], positions[1:]))
        
        results[playID] = {
            'TotalDistance': total_distance,
            'AverageBallSpeed': np.mean(speeds) if speeds else 0,  # Set to 0 if no speeds available
            'InitialBallAcceleration': accelerations[0] if accelerations else 0  # Set to 0 if no accelerations available
        }
    else:
        results[playID] = {
            'TotalDistance': 0,
            'AverageBallSpeed': 0,
            'InitialBallAcceleration': 0
        }

output_df = pd.DataFrame.from_dict(results, orient='index')

# Reset the index and rename the columns
output_df.reset_index(inplace=True)
output_df.rename(columns={'index': 'playID'}, inplace=True)

print(output_df)

      playID  TotalDistance  AverageBallSpeed  InitialBallAcceleration
0          6     146.409737         76.303938              1526.078767
1          8       0.000000          0.000000                 0.000000
2          9       0.000000          0.000000                 0.000000
3         10       0.000000          0.000000                 0.000000
4         11       0.000000          0.000000                 0.000000
...      ...            ...               ...                      ...
9054   26297     142.418452        142.361659              2847.233174
9055   26298       0.000000          0.000000                 0.000000
9056   26300       0.000000          0.000000                 0.000000
9057   26302     350.872622         36.235475              1098.044697
9058   26303     189.547407        138.146846              2762.936917

[9059 rows x 4 columns]


In [39]:
output_df.head(10)

Unnamed: 0,playID,Total Distance,Average Ball Speed,Initial Ball Acceleration
0,6,146.409737,76.303938,1526.078767
1,8,0.0,0.0,0.0
2,9,0.0,0.0,0.0
3,10,0.0,0.0,0.0
4,11,0.0,0.0,0.0
5,17,310.540927,157.514202,3150.284047
6,19,314.98893,31.240105,946.66984
7,25,139.815216,51.85551,1037.110196
8,29,0.0,0.0,0.0
9,30,0.0,0.0,0.0


## Player Analysis

In [28]:
sorted_df = player.groupby('gameID').apply(lambda x: x.sort_values(by=['playPerGame', 'playerTimeStamp'])).reset_index(drop=True)

In [29]:
# Build a new dataframe with every row contain the playID playPergame and gameID, also including a new feature
# "result" which have value 1, 2, 3 means batter run to first, second, third base.

# Used to save value
new_data = []

# find all the game id
unique_games = sorted_df['gameID'].unique()

for game_id in unique_games:
    game_data = sorted_df[sorted_df['gameID'] == game_id]
    
    # Now check details about each single game
    for i in range(len(game_data)):
        row = game_data.iloc[i]
        
        if row['posCode'] == 10:
            player_num = row['playerNum']
            play_per_game = row['playPerGame']
            next_play_per_game = play_per_game + 1
            
            next_row = game_data[(game_data['playPerGame'] == next_play_per_game) & 
                                 (game_data['playerNum'] == player_num) & 
                                 (game_data['posCode'].isin([11, 12, 13]))]
            
            if not next_row.empty:
                result = next_row['posCode'].values[0] - 10
                new_data.append((row['playID'], play_per_game, game_id, result))

# build new dataframe
new_columns = ['playID', 'playPerGame', 'gameID', 'result']
batter_df = pd.DataFrame(new_data, columns=new_columns)

In [42]:
batter_df.head(10)

Unnamed: 0,playID,playPerGame,gameID,result
0,25953,6,1,1
1,7027,14,1,1
2,21697,23,1,2
3,6627,43,1,1
4,5766,45,1,1
5,5248,54,1,1
6,2710,55,1,2
7,6079,64,1,1
8,5869,72,1,1
9,8133,83,1,1


In [32]:
batter_df

Unnamed: 0,playID,playPerGame,gameID,result


In [45]:
ball_pos_batter_df = pd.merge(output_df,batter_df, on='playID', how='left')
ball_pos_batter_df

Unnamed: 0,playID,Total Distance,Average Ball Speed,Initial Ball Acceleration,playPerGame,gameID,result
0,6,146.409737,76.303938,1526.078767,,,
1,8,0.000000,0.000000,0.000000,,,
2,9,0.000000,0.000000,0.000000,,,
3,10,0.000000,0.000000,0.000000,,,
4,11,0.000000,0.000000,0.000000,,,
...,...,...,...,...,...,...,...
9054,26297,142.418452,142.361659,2847.233174,,,
9055,26298,0.000000,0.000000,0.000000,,,
9056,26300,0.000000,0.000000,0.000000,,,
9057,26302,350.872622,36.235475,1098.044697,54.0,67.0,2.0


In [46]:
ball_pos_batter_df.sample(10)

Unnamed: 0,playID,Total Distance,Average Ball Speed,Initial Ball Acceleration,playPerGame,gameID,result
3391,10003,413.368,102.832633,2056.652663,192.0,62.0,2.0
3583,10582,316.524338,98.904886,1978.097721,199.0,75.0,1.0
1733,5177,140.626403,209.735492,4194.709849,,,
8708,25315,0.0,0.0,0.0,,,
537,1672,0.0,0.0,0.0,,,
5214,15269,242.122376,177.136818,3542.736363,,,
7148,20827,120.749624,200.204963,4004.099256,,,
3543,10450,231.210642,238.292064,4765.841275,45.0,85.0,1.0
7517,21857,363.265517,140.682465,2813.649294,,,
895,2736,0.0,0.0,0.0,,,


In [49]:
ball_pos_batter_df = ball_pos_batter_df.fillna(0)

In [50]:
ball_pos_batter_df.loc[ball_pos_batter_df['result'] == 0]

Unnamed: 0,playID,Total Distance,Average Ball Speed,Initial Ball Acceleration,playPerGame,gameID,result
0,6,146.409737,76.303938,1526.078767,0.0,0.0,0.0
1,8,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,9,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,10,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,11,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...
9053,26295,123.375990,158.779034,3175.580683,0.0,0.0,0.0
9054,26297,142.418452,142.361659,2847.233174,0.0,0.0,0.0
9055,26298,0.000000,0.000000,0.000000,0.0,0.0,0.0
9056,26300,0.000000,0.000000,0.000000,0.0,0.0,0.0


In [63]:
ball_pos_batter_df['result'].value_counts()

0.0    8025
1.0    1034
Name: result, dtype: int64

In [57]:
# replace the 2 and 3 as 1 
ball_pos_batter_df['result'] = ball_pos_batter_df['result'].replace([2, 3], 1)

In [None]:
ball_pos_batter_df2 = ball_pos_batter_df[ball_pos_batter_df['ColName'] != 0]