In [20]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer

# Load data
data_1990_2023 = pd.read_csv('1990_2023.csv')

In [21]:
# Data preprocessing
data_1990_2023 = data_1990_2023.dropna()

In [22]:
#Model Selection and Implementation
# Data preprocessing
data_1990_2023 = data_1990_2023.dropna(subset=['WAR'])  # Remove rows with missing WAR values

In [23]:
# Feature selection
selected_features = ['Batting', 'Base Running', 'Fielding', 'Positional', 'Offense', 'Defense']
X_train = data_1990_2023[selected_features]
y_train = data_1990_2023['WAR']

X_test = data_1990_2023[selected_features]
y_test = data_1990_2023['WAR']

In [24]:
# Model training
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

In [25]:
# Save the trained model using joblib
model_path = 'trained_model.pkl'
joblib.dump(model, model_path)

['trained_model.pkl']

In [26]:
# Model Evaluation
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse}")
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 6.291499136898979
Root Mean Squared Error: 2.5082860955040553
Mean Absolute Error: 1.9114096816854051
R-squared: 0.9758698894431309


# Model prediction for Top 10 Career WAR since 1990

In [51]:
# Model prediction for 2013-2022
predicted_wars = model.predict(data_1990_2023[selected_features])

In [52]:
# Save predictions for comparison
data_1990_2023['Predicted_WAR'] = predicted_wars
data_1990_2023.to_csv('predicted_wars_1990_2023.csv', index=False)
# Load the original data with predicted WAR values
data_1990_2023_with_predictions = pd.read_csv('predicted_wars_1990_2023.csv')

In [53]:
# Analyzing Predictions
def analyze_predictions(data_with_predictions):
    sorted_data = data_with_predictions.sort_values(by='Predicted_WAR', ascending=False)
    top_10_original = sorted_data.head(10)
    return top_10_original

data_1990_2023_with_predictions = pd.read_csv('predicted_wars_1990_2023.csv')
top_10_original = analyze_predictions(data_1990_2023_with_predictions)

In [54]:
# Calculate metrics
y_true = data_2013_2022_with_predictions['WAR']
y_pred = data_2013_2022_with_predictions['Predicted_WAR']

mse = mean_squared_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

In [55]:
# Sort the data by original WAR values in descending order
sorted_data = data_1990_2023_with_predictions.sort_values(by='WAR', ascending=False)

In [56]:
# Select the top 10 players by WAR
top_10_original = sorted_data.head(10)

In [57]:
# Print the top 10 players along with original
for index, row in top_10_original.iterrows():
    print("Player: {:<20} | Original WAR: {:<10}".format(row['Name'], row['WAR']))

Player: Barry Bonds          | Original WAR: 143.27493504734136
Player: Alex Rodriguez       | Original WAR: 113.71163202697753
Player: Albert Pujols        | Original WAR: 88.77482972441709
Player: Mike Trout           | Original WAR: 85.20292465065245
Player: Chipper Jones        | Original WAR: 84.583813454414
Player: Adrián Beltré        | Original WAR: 83.79983541376689
Player: Jeff Bagwell         | Original WAR: 80.20785192601608
Player: Ken Griffey Jr.      | Original WAR: 75.2547769600008
Player: Derek Jeter          | Original WAR: 73.07613040995574
Player: Frank Thomas         | Original WAR: 72.05993742316457


In [35]:
# Load the data with predicted WAR values
data_1990_2023_with_predictions = pd.read_csv('predicted_wars_1990_2023.csv')

# Sort the data by original WAR values and select the top 10 players
top_10_original = data_1990_2023_with_predictions.sort_values(by='WAR', ascending=False).head(10)

# Sort the data by predicted WAR values and select the top 10 players
top_10_predicted = data_1990_2023_with_predictions.sort_values(by='Predicted_WAR', ascending=False).head(10)

# Display the top 10 players by original WAR
print("Top 10 Players by Original WAR:")
for index, row in top_10_original.iterrows():
    print("Player: {:<20} | Original WAR: {:<10}".format(row['Name'], row['WAR']))

# Display the top 10 players by predicted WAR
print("\nTop 10 Players by Predicted WAR:")
for index, row in top_10_predicted.iterrows():
    print("Player: {:<20} | Predicted WAR: {:<10}".format(row['Name'], row['Predicted_WAR']))

Top 10 Players by Original WAR:
Player: Barry Bonds          | Original WAR: 143.27493504734136
Player: Alex Rodriguez       | Original WAR: 113.71163202697753
Player: Albert Pujols        | Original WAR: 88.77482972441709
Player: Mike Trout           | Original WAR: 85.20292465065245
Player: Chipper Jones        | Original WAR: 84.583813454414
Player: Adrián Beltré        | Original WAR: 83.79983541376689
Player: Jeff Bagwell         | Original WAR: 80.20785192601608
Player: Ken Griffey Jr.      | Original WAR: 75.2547769600008
Player: Derek Jeter          | Original WAR: 73.07613040995574
Player: Frank Thomas         | Original WAR: 72.05993742316457

Top 10 Players by Predicted WAR:
Player: Barry Bonds          | Predicted WAR: 125.5350340692872
Player: Alex Rodriguez       | Predicted WAR: 103.2112828109956
Player: Mike Trout           | Predicted WAR: 83.58772515967836
Player: Albert Pujols        | Predicted WAR: 81.62154485263059
Player: Frank Thomas         | Predicted WAR: 81.

In [36]:
# Load the data with predicted WAR values
data_2013_2022_with_predictions = pd.read_csv('predicted_wars_1990_2023.csv')

# Sort the data by original WAR values and select the top 10 players
top_10_original = data_2013_2022_with_predictions.sort_values(by='WAR', ascending=False).head(10)

# Sort the data by predicted WAR values and select the top 10 players
top_10_predicted = data_2013_2022_with_predictions.sort_values(by='Predicted_WAR', ascending=False).head(10)

# Find common player names
common_player_names = set(top_10_original['Name']).intersection(top_10_predicted['Name'])

# Display the common player names and the count
print("Common Player Names in Top 10:")
for name in common_player_names:
    print(name)

print("\nNumber of common players:", len(common_player_names))


Common Player Names in Top 10:
Alex Rodriguez
Frank Thomas
Adrián Beltré
Albert Pujols
Mike Trout
Chipper Jones
Jeff Bagwell
Ken Griffey Jr.
Barry Bonds

Number of common players: 9


In [50]:
import pandas as pd
from sklearn.impute import SimpleImputer
import joblib

# Load the trained model
model_path = 'trained_model.pkl'  # Update with the correct path to your trained model
model = joblib.load(model_path)

# Define the years for prediction
years_to_predict = [2024]

# Loop through the age groups
for age in range(25, 31):
    # Load the dataset for the specific age group
    data_age = pd.read_csv(f'Age_{age}.csv')
    
    # Preprocess the data
    data_age = data_age.dropna(subset=['WAR'])
    X_age = data_age[selected_features]
    
    # Handle missing values using an imputer
    imputer = SimpleImputer()
    X_age_imputed = imputer.fit_transform(X_age)
    
    # Use the trained model to predict WAR values for the age group
    predicted_wars_age = model.predict(X_age_imputed)
    
    # Calculate the adjustment factor based on the highest predicted WAR value
    highest_predicted_war = max(predicted_wars_age)
    adjustment_factor = 8.0 / highest_predicted_war
    
    # Adjust the predicted WAR values proportionally
    predicted_wars_age_adjusted = [war * adjustment_factor for war in predicted_wars_age]
    
    # Add adjusted predicted WAR values to the dataset
    data_age['Predicted_WAR_Adjusted'] = predicted_wars_age_adjusted
    
    # Sort the data by adjusted predicted WAR values and select the top 10 players
    top_10_predicted_age_adjusted = data_age.sort_values(by='Predicted_WAR_Adjusted', ascending=False).head(10)
    
    # Print the top 10 players for the age group with adjusted predicted WAR values
    print(f"Top 10 Players by Adjusted Predicted WAR for Age {age} in 2024:")
    for rank, (_, row) in enumerate(top_10_predicted_age_adjusted.iterrows(), start=1):
        print(f"{rank}. {row['Name']: <20} | Team: {row['Team']: <20} | Adjusted Predicted WAR: {row['Predicted_WAR_Adjusted']:.2f}")
    
    # Export the modified dataset for the current age group to a new CSV file
    data_age.to_csv(f'Predicted_Age_{age}_Adjusted.csv', index=False)
    
    print("\n")


Top 10 Players by Adjusted Predicted WAR for Age 25 in 2024:
1. Ronald Acuña Jr.     | Team: ATL                  | Adjusted Predicted WAR: 8.00
2. Luis Robert Jr.      | Team: CHW                  | Adjusted Predicted WAR: 6.51
3. Bo Bichette          | Team: TOR                  | Adjusted Predicted WAR: 4.29
4. Lars Nootbaar        | Team: STL                  | Adjusted Predicted WAR: 3.95
5. Josh Lowe            | Team: TBR                  | Adjusted Predicted WAR: 3.87
6. William Contreras    | Team: MIL                  | Adjusted Predicted WAR: 3.86
7. Bryson Stott         | Team: PHI                  | Adjusted Predicted WAR: 3.49
8. Kerry Carpenter      | Team: DET                  | Adjusted Predicted WAR: 3.33
9. Grae Kessinger       | Team: HOU                  | Adjusted Predicted WAR: 3.33
10. Dominic Fletcher     | Team: ARI                  | Adjusted Predicted WAR: 3.32


Top 10 Players by Adjusted Predicted WAR for Age 26 in 2024:
1. Kyle Tucker          | Team: HOU



# Visualizations

In [42]:
import pandas as pd
from sklearn.impute import SimpleImputer
import joblib

# Load the trained model
model_path = 'trained_model.pkl'  # Update with the correct path to your trained model
model = joblib.load(model_path)

# Load the data for the year 2022
data_2022 = pd.read_csv('2022.csv')

# Preprocess the data (if needed)
# For example, drop any rows with missing values
data_2022 = data_2022.dropna()

# Feature selection
selected_features = ['Batting', 'Base Running', 'Fielding', 'Positional', 'Offense', 'Defense']
X_2022 = data_2022[selected_features]

# Handle missing values using an imputer
imputer = SimpleImputer()
X_2022_imputed = imputer.fit_transform(X_2022)

# Use the trained model to predict WAR values for 2023
predicted_wars_2023 = model.predict(X_2022_imputed)

# Calculate the adjustment factor based on the highest predicted WAR value
highest_predicted_war = max(predicted_wars_2023)
adjustment_factor = 7.3 / highest_predicted_war

# Adjust the predicted WAR values proportionally
predicted_wars_2023_adjusted = [war * adjustment_factor for war in predicted_wars_2023]

# Add adjusted predicted WAR values to the 2022 dataset
data_2022['Predicted_WAR_2023_Adjusted'] = predicted_wars_2023_adjusted

# Sort the data by adjusted predicted WAR values for 2023 and select the top 10 players
top_10_predicted_2023_adjusted = data_2022.sort_values(by='Predicted_WAR_2023_Adjusted', ascending=False).head(10)

# Write the top 10 players with adjusted predicted WAR values to a new CSV file
top_10_predicted_2023_adjusted.to_csv('top_10_adjusted_war_2023.csv', index=False)

print("Top 10 Players by Adjusted Predicted WAR for 2023:")
for idx, (index, row) in enumerate(top_10_predicted_2023_adjusted.iterrows(), start=1):
    print(f"{idx}. Player: {row['Name']:<20} | Adjusted Predicted WAR 2023: {row['Predicted_WAR_2023_Adjusted']:.2f}")


Top 10 Players by Adjusted Predicted WAR for 2023:
1. Player: Aaron Judge          | Adjusted Predicted WAR 2023: 7.30
2. Player: Paul Goldschmidt     | Adjusted Predicted WAR 2023: 5.88
3. Player: Nolan Arenado        | Adjusted Predicted WAR 2023: 5.86
4. Player: Manny Machado        | Adjusted Predicted WAR 2023: 5.85
5. Player: Jose Altuve          | Adjusted Predicted WAR 2023: 5.55
6. Player: Yordan Alvarez       | Adjusted Predicted WAR 2023: 5.52
7. Player: Freddie Freeman      | Adjusted Predicted WAR 2023: 5.39
8. Player: Austin Riley         | Adjusted Predicted WAR 2023: 4.83
9. Player: José Ramírez         | Adjusted Predicted WAR 2023: 4.81
10. Player: J.T. Realmuto        | Adjusted Predicted WAR 2023: 4.76




In [43]:
import csv

# Read data from the CSV file with the correct encoding and skip initial characters
data = []
with open('2023.csv', 'r', encoding='utf-8-sig') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        data.append(row)

# Convert WAR values to float and sort the data by WAR
data.sort(key=lambda x: float(x['WAR']), reverse=True)

# Extract the top ten players
top_ten = data[:10]

# Write the top ten players to a new CSV file
with open('top_ten_war_2023.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = [
        'Name', 'Team', 'Batting', 'Base Running', 'Fielding',
        'Positional', 'Offense', 'Defense', 'League', 'Replacement',
        'RAR', 'WAR', 'Dollars', 'NameASCII', 'PlayerId', 'MLBAMID'
    ]
    csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    csv_writer.writeheader()
    csv_writer.writerows(top_ten)

# Print the top ten players by WAR
for idx, player in enumerate(top_ten, start=1):
    print(f"{idx}. Player: {player['Name']} | WAR: {player['WAR']}")




1. Player: Mookie Betts | WAR: 7.327975986929802
2. Player: Freddie Freeman | WAR: 6.638534213255805
3. Player: Shohei Ohtani | WAR: 6.471881355205432
4. Player: Ronald Acuña Jr. | WAR: 6.209565928163518
5. Player: Francisco Lindor | WAR: 5.192459056873583
6. Player: Julio Rodríguez | WAR: 5.137760386533369
7. Player: Bobby Witt Jr. | WAR: 4.93077953028478
8. Player: Luis Robert Jr. | WAR: 4.8928087102231395
9. Player: Corbin Carroll | WAR: 4.850268787330351
10. Player: Marcus Semien | WAR: 4.783538038609257
