In [2]:
import pandas as pd
import plotly.express as px
from dash import Dash, dcc, html, Input, Output
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Pipe/group helper functions (this cell only defines functions; nothing is executed here)

def groupby_sum(kaggle_batting:pd.DataFrame, mlb_data_df:pd.DataFrame):
    """Sum every numeric batting statistic per season for both data sources.

    Parameters
    ----------
    kaggle_batting : pd.DataFrame
        Batting records that already use the short column names (``yearID``,
        ``stint``, ``AB``, ``H``, ...). Its numeric columns, minus ``yearID``
        and ``stint``, decide which statistics are summed.
    mlb_data_df : pd.DataFrame
        Batting records that use the long column names (``year``, ``hit``,
        ``home_run``, ...). They are renamed on a new frame; the caller's
        DataFrame is not modified.

    Returns
    -------
    pd.DataFrame
        The per-``yearID`` sums of ``kaggle_batting`` followed by the
        per-``yearID`` sums of ``mlb_data_df``, with a fresh 0..n-1 index.
        The two results are stacked, not merged, so a year present in both
        inputs appears twice.

    Raises
    ------
    KeyError
        If ``kaggle_batting`` lacks ``yearID`` or ``stint``, or if the renamed
        MLB frame lacks ``yearID`` or one of the summed statistics.
    """
    column_aliases = {
        "player_id": "playerID",
        "year": "yearID",
        "ab": "AB",
        "hit": "H",
        "double": "2B",
        "triple": "3B",
        "home_run": "HR",
        "strikeout": "SO",
        "walk": "BB",
        "b_rbi": "RBI",
        "r_total_caught_stealing": "CS",
        "r_total_stolen_base": "SB",
        "b_game": "G",
        "b_gnd_into_dp": "GIDP",
        "b_hit_by_pitch": "HBP",
        "b_intent_walk": "IBB",
        "b_sac_bunt": "SH",
        "b_sac_fly": "SF",
        "r_run": "R",
    }
    # rename() without inplace returns a new frame, so the DataFrame passed in
    # by the caller keeps its original column names.
    mlb_renamed = mlb_data_df.rename(columns=column_aliases)

    # yearID is the grouping key and stint is excluded from the sums.
    stat_columns = list(
        kaggle_batting.select_dtypes(include=["number"]).columns.drop(["yearID", "stint"])
    )

    kaggle_totals = kaggle_batting.groupby("yearID")[stat_columns].sum().reset_index()
    mlb_totals = mlb_renamed.groupby("yearID")[stat_columns].sum().reset_index()

    return pd.concat([kaggle_totals, mlb_totals]).reset_index(drop=True)

def cleaner(total_df:pd.DataFrame):
    """Replace zeros in the numeric statistic columns with NaN.

    A yearly statistic that is exactly zero is treated as missing data. The
    values are blanked out instead of dropping the rows because, per the
    original author's note, dropping every year that contains a zero would
    remove around 64 of the roughly 143 years.

    Parameters
    ----------
    total_df : pd.DataFrame
        Frame with a numeric ``Year`` column plus the statistic columns.
        It is not modified; the replacement happens on a copy.

    Returns
    -------
    pd.DataFrame
        A copy of ``total_df`` with a fresh 0..n-1 index in which every zero
        in a numeric column other than ``Year`` is NaN. Non-numeric columns
        are left as they are.

    Raises
    ------
    KeyError
        If ``total_df`` has no numeric ``Year`` column.
    """
    # Work on a copy so the caller's frame keeps its zeros.
    cleaned = total_df.copy()
    stat_columns = cleaned.select_dtypes(include=["number"]).columns.drop("Year")

    for column in stat_columns:
        cleaned[column] = cleaned[column].replace(0, np.nan)
    return cleaned.reset_index(drop=True)

def add_percentage_rows_of_hits(total_df:pd.DataFrame):
    """Add the singles count and each hit type's share of all hits.

    Parameters
    ----------
    total_df : pd.DataFrame
        Frame with the columns ``H``, ``2B``, ``3B`` and ``HR``. It is not
        modified; the new columns are added to a new frame.

    Returns
    -------
    pd.DataFrame
        A new frame with the added columns ``1B`` (hits minus doubles,
        triples and home runs) and ``% Of Singles/Doubles/Triples/Home Runs
        Over Hits`` (each count divided by ``H``, times 100). The columns
        ``yearID`` and ``SB`` are renamed to ``Year`` and
        ``Total Stolen Bases`` when present.

    Raises
    ------
    KeyError
        If one of ``H``, ``2B``, ``3B`` or ``HR`` is missing.
    """
    hits = total_df["H"]
    singles_count = hits - (total_df["2B"] + total_df["3B"] + total_df["HR"])

    # assign() returns a new frame, so the caller's DataFrame gains no columns.
    derived_columns = {
        "1B": singles_count,
        "% Of Singles Over Hits": (singles_count / hits) * 100,
        "% Of Doubles Over Hits": (total_df["2B"] / hits) * 100,
        "% Of Triples Over Hits": (total_df["3B"] / hits) * 100,
        "% Of Home Runs Over Hits": (total_df["HR"] / hits) * 100,
    }

    return total_df.assign(**derived_columns).rename(
        columns={"yearID": "Year", "SB": "Total Stolen Bases"}
    )

In [4]:
# Build the per-season totals from both sources, add the hit-type percentages,
# then blank out zero values. Forward slashes keep the paths portable across
# operating systems and avoid the invalid "\B" and "\s" escape sequences that
# backslash separators create inside an ordinary string literal.
total_df = (
    groupby_sum(
        pd.read_csv("final_project_2024_data/Batting.csv"),
        pd.read_csv("final_project_2024_data/stats.csv"),
    )
    .pipe(add_percentage_rows_of_hits)
    .pipe(cleaner)
)


# Stolen bases (SB) per season
px.line(
    total_df,
    x="Year",
    y="Total Stolen Bases",
    title="Is there statistical evidence for recent rule/equipment changes to promote base stealing?",
).show()


# Ratio of each hit type to total hits

# Original author's note on the singles chart: single-season record in 1887.
fig = px.line(
    total_df,
    x="Year",
    y="% Of Singles Over Hits",
    title="How has the percent of single hits changed over time?",
)
fig.update_layout(yaxis_title="% Of Hits").show()

fig = px.line(
    total_df,
    x="Year",
    y=["% Of Doubles Over Hits", "% Of Triples Over Hits", "% Of Home Runs Over Hits"],
    title="How has the percent of double, triple, and home run hits changed over time?",
)
fig.update_layout(yaxis_title="% Of Hits").show()

In [5]:
def make_master(df, master_path='final_project_2024_data/Master.csv'):
    """Left-join the master table onto ``df`` by ``playerID``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``playerID`` column.
    master_path : str or path-like, optional
        Location of the master CSV. Defaults to the project data file, so
        existing calls such as ``.pipe(make_master)`` behave as before.

    Returns
    -------
    pd.DataFrame
        ``df`` with the master file's columns appended. Rows whose
        ``playerID`` has no match in the master file are kept, with NaN in
        the appended columns.

    Raises
    ------
    FileNotFoundError
        If ``master_path`` does not exist.
    KeyError
        If ``playerID`` is missing from either frame.
    """
    master = pd.read_csv(master_path)
    return df.merge(master, how='left', on='playerID')

def make_useful(df):
    """Keep the batting columns of interest under readable names, from 1922 on.

    The incoming index is moved into a leading column by ``reset_index()``
    (named ``index`` when the index is unnamed) before the rename. Rows whose
    ``Year`` is below 1922 are then filtered out, so the returned frame keeps
    the row labels of the rows that survive.
    """
    readable_names = {
        'yearID': 'Year',
        'HR': 'Home Runs',
        'H': 'Hits',
        'lgID': 'League',
        'RBI': 'Runs Batted In',
        'R': 'Runs',
        '2B': 'Doubles',
        '3B': 'Triples',
        'SB': 'Stolen Bases',
        'CS': 'Caught Stealing',
    }
    # The dict keys double as the column selection, in the same order.
    trimmed = df[list(readable_names)].reset_index().rename(columns=readable_names)
    is_1922_or_later = trimmed['Year'] >= 1922
    return trimmed[is_1922_or_later]

def singles(df):
    """Return ``df`` with a ``Singles`` column added.

    ``Singles`` is ``Hits`` minus ``Home Runs``, ``Doubles`` and ``Triples``.
    The column is added with ``assign``, so the frame passed in is not
    modified. This matters when ``df`` is a filtered slice of another frame:
    assigning a column onto such a slice directly can raise pandas'
    SettingWithCopyWarning.

    Raises
    ------
    KeyError
        If one of the four source columns is missing.
    """
    singles_count = df['Hits'] - df['Home Runs'] - df['Doubles'] - df['Triples']
    return df.assign(Singles=singles_count)

def final_result(df):
    """Total each batting statistic per (Year, League) pair.

    Returns a frame with ``Year`` and ``League`` as ordinary columns, followed
    by the summed statistics; any other column of ``df`` is left out.
    """
    group_keys = ['Year', 'League']
    stat_columns = [
        'Home Runs',
        'Hits',
        'Runs Batted In',
        'Runs',
        'Singles',
        'Doubles',
        'Triples',
        'Stolen Bases',
        'Caught Stealing',
    ]
    # as_index=False keeps the group keys as columns instead of an index.
    return df.groupby(group_keys, as_index=False)[stat_columns].sum()

In [6]:
# Per-league yearly totals used by the Dash chart: load the batting file, join
# the master table, keep and rename the columns of interest, add singles, then
# sum per (Year, League).
df = (
    pd.read_csv('final_project_2024_data/Batting.csv')
    .pipe(make_master)
    .pipe(make_useful)
    .pipe(singles)
    .pipe(final_result)
)

In [7]:

# Dash application: a radio selector for the statistic plus one graph that the
# callback registered on "graph" fills in.
app = Dash(__name__)

# Radio buttons whose selected value is the statistic plotted on the y-axis.
y_axis_selector = dcc.RadioItems(
    id="y-axis",
    options=[
        "Hits",
        "Runs",
        "Runs Batted In",
        "Home Runs",
        "Singles",
        "Doubles",
        "Triples",
        "Stolen Bases",
        "Caught Stealing",
    ],
    value="Hits",
    inline=True,
)

app.layout = html.Div([
    html.H4("Statistics For Each League Over Time"),
    html.P("y-axis:"),
    y_axis_selector,
    dcc.Graph(id="graph"),
])

@app.callback(Output("graph", "figure"), Input("y-axis", "value"))
def generate_chart(y):
    """Build the league comparison line chart for the selected statistic.

    ``y`` is the value of the "y-axis" radio selector. The figure plots that
    column of the module-level ``df`` against ``Year``, one line per
    ``League``, and is returned as the "graph" component's figure.
    """
    return px.line(
        df,
        x='Year',
        y=y,
        color='League',
        title='How do certain statistics compare between leagues?',
    )

# Start the Dash app with debug=True, but only when this code runs as the main
# module (not when it is imported).
if __name__ == '__main__':
    app.run(debug=True)