In [27]:
# Importing the pandas library to handle data in DataFrame format
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Path to the Excel workbook, relative to the notebook's working directory
file_path = './MLR_S9.xlsx'

# Load the workbook into a DataFrame.
# The engine 'openpyxl' is specifically for .xlsx files.
try:
    data = pd.read_excel(file_path, engine='openpyxl')
except Exception as e:
    # Report the problem, then re-raise: every later cell needs `data`, so
    # continuing would only surface as a confusing NameError further down.
    print("An error occurred:", e)
    raise
else:
    print("Data loaded successfully!")

Data loaded successfully!


In [28]:
# Sanity-check the load: print the column index, then show the first rows.
# data.head() has to stay the last expression so the notebook renders it as the cell output.
print(data.columns)
data.head()

Index(['Hitter', 'Hitter ID', 'Swing', 'Pitcher', 'Pitcher ID', 'Pitch',
       'Old Result', 'Diff', 'Inning', 'Outs', 'OBC', 'Home Score',
       'Away Score', 'Batter WPA', 'Pitcher WPA', 'RBI', 'Run', 'Inning ID',
       'Game ID', 'Session', 'Batter Team', 'Pitcher Team', 'Exact Result',
       'Result at Neutral', 'Result All Neutral',
       'Pitcher Responsible for Runner on 3rd Who Scored',
       'Pitcher Responsible for Runner on 2nd Who Scored',
       'Pitcher Responsible for Runner on 1st Who Scored',
       'Pitcher Responsible for Batter Who Scored', 'PA Type'],
      dtype='object')


Unnamed: 0,Hitter,Hitter ID,Swing,Pitcher,Pitcher ID,Pitch,Old Result,Diff,Inning,Outs,...,Batter Team,Pitcher Team,Exact Result,Result at Neutral,Result All Neutral,Pitcher Responsible for Runner on 3rd Who Scored,Pitcher Responsible for Runner on 2nd Who Scored,Pitcher Responsible for Runner on 1st Who Scored,Pitcher Responsible for Batter Who Scored,PA Type
0,Lane Drew,190,211.0,Huascal Bandito,2786,825.0,PO,386.0,T1,0,...,MIA,ARI,PO,PO,PO,,,,,1
1,Captain Trash,2696,303.0,Huascal Bandito,2786,954.0,K,349.0,T1,1,...,MIA,ARI,K,K,K,,,,,1
2,Hakuna Moncada,2501,778.0,Huascal Bandito,2786,489.0,K,289.0,T1,2,...,MIA,ARI,K,K,K,,,,,1
3,Biggus Dickus,2458,,Artanis Jones,37,,AUTO K,,B1,0,...,ARI,MIA,AUTO K,AUTO K,AUTO K,,,,,9
4,Iris Solstice,401,349.0,Artanis Jones,37,214.0,BB,135.0,B1,1,...,ARI,MIA,BB,FO,1B,,,,,1


In [29]:
# Model factory: maps a model name to a prediction callable
def create_model(model_name):
    """
    Build the prediction function registered under ``model_name``.

    Supported names:
    - 'randModel': returns ``model(x)``, which ignores ``x`` and draws a
      random integer between 1 and 1000 (inclusive).
    - 'prevPitch': returns ``model(data, index)``, which yields the 'Pitch'
      value of the row just before ``index`` in ``data``. When there is no
      previous row, or that value is missing, it falls back to a random
      integer between 1 and 1000 (inclusive).

    Note that the two models take different arguments.

    Args:
    model_name (str): Name of the model to create.

    Returns:
    callable: The prediction function for the requested model.

    Raises:
    ValueError: If ``model_name`` is not one of the supported names.
    """
    def random_swing(x):
        # The argument is accepted only so the model can be called per row
        return random.randint(1, 1000)

    def previous_pitch(data, index):
        if index > 0:
            last_pitch = data.iloc[index - 1]['Pitch']
            if not pd.isna(last_pitch):
                return last_pitch
        # No usable previous pitch: guess at random instead
        return random.randint(1, 1000)

    if model_name == 'randModel':
        return random_swing
    if model_name == 'prevPitch':
        return previous_pitch
    raise ValueError("Model not defined in the factory.")


In [14]:
def calculate_dpa(pitch, model_swing, range_size=1000):
    """
    Calculate the DPA value: the circular distance between pitch and swing.

    The numbers are treated as positions on a ring of ``range_size`` values,
    so the distance is measured the short way around:

    DPA = MIN(gap, range_size - gap), where
    gap = ABS(Pitch - Model swing) modulo range_size

    Taking the gap modulo ``range_size`` keeps the result non-negative even
    when the inputs are further apart than one full ring. For inputs within
    one ring of each other this is identical to
    MIN(ABS(Pitch - Model swing), 1000 - ABS(Pitch - Model swing)).

    Args:
    pitch (int or float): The pitch number.
    model_swing (int or float): The model's predicted swing number.
    range_size (int or float): Size of the ring; must be positive.
        Defaults to 1000.

    Returns:
    int or float: The calculated DPA value, between 0 and range_size / 2.
        A NaN input yields NaN.
    """
    gap = abs(pitch - model_swing) % range_size
    return min(gap, range_size - gap)

In [15]:
# Column the models try to predict
target = 'Pitch'

# Placeholder for feature-based models; nothing below reads it yet
features = []

# Target values
y = data[target]

# Everything except the game identifier and the target itself.
# randModel ignores X; it is kept so rows stay aligned with y after the split.
X = data.drop(columns=['Game ID', 'Pitch'])

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)


In [16]:
def apply_model_and_calculate_dpa(X, y, model_name):
    """
    Add a model's swing predictions and their DPA values to ``X``.

    Two columns are written into ``X`` itself, so pass a copy if the
    original frame must stay untouched:
    - '<model_name> Swing': one prediction per row, obtained by calling the
      model with the row position (0 .. len(X) - 1) as its only argument.
    - '<model_name> DPA': calculate_dpa(pitch, swing) for each pitch in ``y``
      paired, in order, with the predicted swing.

    Args:
    X (DataFrame): Frame that receives the two new columns.
    y (iterable): Pitch values, in the same row order as ``X``.
    model_name (str): Name passed to create_model; also prefixes the columns.

    Returns:
    DataFrame: The same ``X`` object, with both columns added.
    """
    predict = create_model(model_name)

    # Column names are derived from the model name
    swing_label = f'{model_name} Swing'
    dpa_label = f'{model_name} DPA'

    # One prediction per row position, generated in row order
    X[swing_label] = list(map(predict, range(len(X))))

    # Pair each actual pitch with the prediction for the same row
    X[dpa_label] = list(map(calculate_dpa, y, X[swing_label]))

    return X

# Score randModel on the training rows (on a copy, so X_train is left as-is)
trained_data = apply_model_and_calculate_dpa(
    X=X_train.copy(),
    y=y_train,
    model_name='randModel',
)

# Score randModel on the held-out rows (on a copy, so X_test is left as-is)
tested_data = apply_model_and_calculate_dpa(
    X=X_test.copy(),
    y=y_test,
    model_name='randModel',
)


In [24]:
def evaluate_models(data, model_names):
    """
    Summarise DPA metrics for each model and for the actual game results.

    For every name in ``model_names`` the rows with a missing value in
    '<name> Swing', '<name> DPA', 'Swing' or 'Diff' are dropped, and the
    metrics are computed over the remaining '<name> DPA' values.

    The 'Actual Game' entry is computed from the absolute value of 'Diff'
    over the rows where both 'Swing' and 'Diff' are present. It is filtered
    on its own, so it does not depend on which models are listed and is
    still produced when ``model_names`` is empty.

    Args:
    data (DataFrame): Frame holding 'Swing', 'Diff' and, for each model,
        the '<name> Swing' and '<name> DPA' columns.
    model_names (iterable of str): Models to evaluate.

    Returns:
    dict: Maps each model name, plus 'Actual Game', to a dict with keys
        'Average Count DPA < 150' (share of rows below 150, or 0 when no
        rows remain), 'Mean DPA' and 'Mean Squared Error' (mean of the
        squared values).
    """
    def summarise(distances):
        # Shared metric computation so models and the game baseline agree
        row_count = len(distances)
        below_threshold = np.sum(distances < 150)
        return {
            'Average Count DPA < 150': below_threshold / row_count if row_count > 0 else 0,
            'Mean DPA': distances.mean(),
            'Mean Squared Error': np.mean(np.square(distances)),
        }

    results = {}
    for model in model_names:
        swing_column = f'{model} Swing'
        dpa_column = f'{model} DPA'

        # Keep only rows where both the prediction and the real swing exist
        valid_data = data.dropna(subset=[swing_column, dpa_column, 'Swing', 'Diff'])
        results[model] = summarise(valid_data[dpa_column])

    # Baseline from the real swings, filtered independently of the loop above
    game_data = data.dropna(subset=['Swing', 'Diff'])
    results['Actual Game'] = summarise(game_data['Diff'].abs())

    return results


In [25]:
# Score every listed model, plus the actual game results, on both splits
model_names = ['randModel']  # Add more models as needed
train_evaluation_results = evaluate_models(trained_data, model_names)
test_evaluation_results = evaluate_models(tested_data, model_names)

# Print the training report first, then the testing report, in the same layout
for section_title, section_results in (
    ("Training Data Performance:", train_evaluation_results),
    ("Testing Data Performance:", test_evaluation_results),
):
    print(section_title)
    for model, metrics in section_results.items():
        print(f"Results for {model}:")
        for metric_name, value in metrics.items():
            print(f"{metric_name}: {value}")
        print("\n")


Training Data Performance:
Results for randModel:
Average Count DPA < 150: 0.2943016759776536
Mean DPA: 250.53486033519553
Mean Squared Error: 83404.89508379888


Results for Actual Game:
Average Count DPA < 150: 0.3105027932960894
Mean DPA: 247.31653631284917
Mean Squared Error: 82640.93821229051


Testing Data Performance:
Results for randModel:
Average Count DPA < 150: 0.31596769992185464
Mean DPA: 246.22896587653034
Mean Squared Error: 81813.46991404012


Results for Actual Game:
Average Count DPA < 150: 0.31362333941130505
Mean DPA: 245.33628549101329
Mean Squared Error: 81668.17660849179


