In [1]:
# importing modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [2]:
# Load the two raw inputs: the pitch data export and the Tommy John surgery list
Pitching_df = pd.read_csv(
    filepath_or_buffer='Extra Pitching Data/Pitch Data - 2008.csv'
)
People_df = pd.read_csv(
    filepath_or_buffer='Extra Pitching Data/Tommy John Surgery List (@MLBPlayerAnalys) - TJ List.csv'
)

In [3]:
# Report (rows, columns) for each raw table, surgery list first, one per line
print(People_df.shape, Pitching_df.shape, sep='\n')

(2490, 42)
(3277, 35)


In [4]:
# import Unidecode
from unidecode import unidecode

# Function to normalise a player name to plain ASCII with single spaces
def preprocess_name(name):
    """Return ``name`` transliterated to ASCII with whitespace collapsed.

    Parameters
    ----------
    name : str or missing value
        Raw player name. Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    str
        The name with accented characters replaced by ASCII equivalents
        (via ``unidecode``) and with leading, trailing and repeated whitespace
        reduced to single spaces. An empty string is returned for a missing
        value.

    Notes
    -----
    ASCII punctuation such as apostrophes and commas is kept, and the word
    order is not changed.
    """
    # unidecode only accepts strings, so a missing value (which pandas hands
    # to the function as NaN/None) has to be answered before transliterating.
    if pd.isna(name):
        return ''
    # Imported locally under an alias so this function does not depend on
    # whether the notebook-level name `unidecode` currently refers to the
    # module or to the function (both import forms appear in this notebook).
    from unidecode import unidecode as to_ascii
    ascii_name = to_ascii(name)
    # split() with no argument drops leading/trailing whitespace and treats
    # any run of whitespace as a single separator.
    return ' '.join(ascii_name.split())

# Store an ASCII-normalised copy of every raw player_name in a "Player" column.
# NOTE(review): this column is not carried forward — a later cell keeps only
# "pitches" and "player_name" — so confirm whether this step is still needed.
Pitching_df["Player"] = Pitching_df["player_name"].map(preprocess_name)

In [5]:
# For every row, record how many rows in the surgery list share that row's
# Player value; identical names are counted together.
People_df['No. TJ Surgeries'] = (
    People_df
    .groupby(by='Player')['Player']
    .transform(func='count')
)

In [6]:
# Preview the surgery table to confirm the 'No. TJ Surgeries' column was added
People_df

Unnamed: 0,Player,TJ Surgery Date,Team,Level,Position,Throws,Country,High School,College(s),Age,...,K-BB%,ERA-,FIP-,G.1,GS.1,IP.1,K-BB%.1,ERA-.1,FIP-.1,No. TJ Surgeries
0,Jorge Mateo,8/28/2024,BAL,MLB,2B,R*,Dominican,,,29,...,,,,,,,,,,1
1,Adbert Alzolay,8/7/2024,CHC,AAA,P,R,Venezuela,,,29,...,,,,,,,,,,1
2,Luis Medina,8/7/2024,OAK,MLB,P,R,Dominican,,,25,...,,,,,,,,,,1
3,River Ryan,8/1/2024,LAD,MLB,P,R,United States,North Carolina,North Carolina,25,...,,,,,,,,,,1
4,Luke Keaschall,8/1/2024,MIN,AA,2B,R,United States,California,"San Francisco, Arizona State",21,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2485,Tom Candiotti,10/13/1981,MIL,AA,P,R,United States,California,St. Mary's of California,23,...,,,,,,,,,,1
2486,Bill Bordley,4/13/1981,SF,MLB,P,L,United States,California,"El Camino, USC",23,...,,,,,,,,,,1
2487,Joe Hesketh,1/1/1981,WAS,AA,P,L,United States,New York,State of New York,22,...,,,,,,,,,,1
2488,Brent Strom,1/1/1978,SD,MLB,P,L,United States,California,"San Diego City, USC",28,...,,,,,,,,,,1


In [7]:
# Keep only the merge key and the surgery count; every other column is discarded
People_df = People_df.loc[:, ["Player", "No. TJ Surgeries"]]

In [8]:
# Collapse to one row per player name; drop_duplicates keeps the first
# occurrence by default, which matches the explicit keep="first" used before
People_df = People_df.drop_duplicates(subset=["Player"])

In [9]:
# Preview People_df after trimming it to two columns and one row per player name
People_df

Unnamed: 0,Player,No. TJ Surgeries
0,Jorge Mateo,1
1,Adbert Alzolay,1
2,Luis Medina,1
3,River Ryan,1
4,Luke Keaschall,1
...,...,...
2485,Tom Candiotti,1
2486,Bill Bordley,1
2487,Joe Hesketh,1
2488,Brent Strom,1


In [10]:
# Keep only the pitch total and the raw name; this also discards the "Player"
# column that was added to Pitching_df earlier in the notebook
Pitching_df = Pitching_df.loc[:, ["pitches", "player_name"]]

In [11]:
# Preview the trimmed pitching table; player_name has not been passed through
# reformat_name yet at this point
Pitching_df

Unnamed: 0,pitches,player_name
0,52108,"Verlander, Justin"
1,48905,"Scherzer, Max"
2,47573,"Greinke, Zack"
3,44996,"Lester, Jon"
4,44494,"Kershaw, Clayton"
...,...,...
3272,3,"Loretta, Mark"
3273,3,"Sanó, Miguel"
3274,3,"Ervin, Phillip"
3275,3,"Beeter, Clayton"


In [12]:
# Function to reformat a player name into "First Last" order with accents removed, for easy merging
import unidecode
def reformat_name(name):
    """Convert a ``"Last, First"`` name into ASCII ``"First Last"`` form.

    Parameters
    ----------
    name : str or missing value
        Raw player name. Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    str
        * ``''`` when ``name`` is missing.
        * ``"First Last"`` when the ASCII form of ``name`` splits on ``", "``
          into exactly two parts.
        * Otherwise the ASCII form of ``name`` unchanged.
    """
    # The missing-value check has to come first: unidecode only accepts
    # strings, so transliterating a NaN would raise before the guard ran.
    if pd.isna(name):
        return ''
    # Imported locally under an alias so this function works whether the
    # notebook-level name `unidecode` is bound to the module or to the
    # function (both import forms appear in this notebook).
    from unidecode import unidecode as to_ascii
    ascii_name = to_ascii(name)
    last_first = ascii_name.split(", ")
    # Only swap when there is exactly one ", " separator; anything else is
    # returned as-is rather than guessed at.
    if len(last_first) == 2:
        return f"{last_first[1]} {last_first[0]}"
    return ascii_name

In [13]:
# Rewrite every player_name in place using reformat_name
Pitching_df["player_name"] = Pitching_df["player_name"].map(reformat_name)

In [14]:
# Preview Pitching_df to confirm the reformatted player_name values
Pitching_df

Unnamed: 0,pitches,player_name
0,52108,Justin Verlander
1,48905,Max Scherzer
2,47573,Zack Greinke
3,44996,Jon Lester
4,44494,Clayton Kershaw
...,...,...
3272,3,Mark Loretta
3273,3,Miguel Sano
3274,3,Phillip Ervin
3275,3,Clayton Beeter


In [15]:
# Give the name column the same label as People_df's key so the two can be merged
Pitching_df = Pitching_df.rename({"player_name": "Player"}, axis="columns")

In [16]:
# Outer-join pitch totals with surgery counts on the shared 'Player' column,
# so rows that appear in only one of the two tables are kept as well
merged_df = Pitching_df.merge(People_df, how='outer', on='Player')

In [17]:
# Preview the first five rows of the merged table
merged_df.head(5)

Unnamed: 0,pitches,Player,No. TJ Surgeries
0,52108.0,Justin Verlander,1.0
1,48905.0,Max Scherzer,
2,47573.0,Zack Greinke,
3,44996.0,Jon Lester,
4,44494.0,Clayton Kershaw,


In [18]:
# Filling null values with 0: a player with no match in the surgery list has
# no recorded surgery count after the merge.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection. That chained in-place form is deprecated in recent pandas
# and, with Copy-on-Write enabled, leaves merged_df unchanged.
merged_df['No. TJ Surgeries'] = merged_df['No. TJ Surgeries'].fillna(0)

# Rows without a pitch total exist only because the merge was an outer join:
# they are surgery-list players with no row in the pitching data (per the
# original note, pitchers whose surgery predates the 2008 - 2023 data).
# Drop them, again by reassignment instead of inplace=True.
merged_df = merged_df.dropna(subset=["pitches"])

merged_df

Unnamed: 0,pitches,Player,No. TJ Surgeries
0,52108.0,Justin Verlander,1.0
1,48905.0,Max Scherzer,0.0
2,47573.0,Zack Greinke,0.0
3,44996.0,Jon Lester,0.0
4,44494.0,Clayton Kershaw,0.0
...,...,...,...
3272,3.0,Mark Loretta,0.0
3273,3.0,Miguel Sano,1.0
3274,3.0,Phillip Ervin,0.0
3275,3.0,Clayton Beeter,1.0


In [19]:
# Write the merged table to a CSV in the working directory, without the index.
# NOTE(review): the recorded run of this cell ended in PermissionError
# [Errno 13]; presumably the target file was open in another program or the
# directory is not writable — confirm and re-run so the saved output is clean.
merged_df.to_csv(path_or_buf='tj_pitching_merge.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'tj_pitching_merge.csv'