## NBA MVP Prediction Project

In [32]:
import pandas as pd

def mvp_data_cleaning() -> pd.DataFrame:
        # Set mvp csv path
        mvp_csv_path = "./data/mvp_csv_data/mvp_data.csv"

        # Load csv file into a datafram
        mvp_df = pd.read_csv(mvp_csv_path)

        # Filter out data to contain desired/useful columns
        mvp_df = mvp_df[["Player", "Year", "Pts Won", "Pts Max", "Share"]]

        # Return cleaned df 
        return mvp_df

mvps = mvp_data_cleaning()
mvps.head()

Unnamed: 0,Player,Year,Pts Won,Pts Max,Share
0,Hakeem Olajuwon,1994,889,1010,0.88
1,David Robinson,1994,730,1010,0.723
2,Scottie Pippen,1994,390,1010,0.386
3,Shaquille O'Neal,1994,289,1010,0.286
4,Patrick Ewing,1994,255,1010,0.252


In [33]:
def single_row(df: pd.DataFrame) -> pd.DataFrame:
    if df.shape[0] == 1:
        # If there is only one row for player in year, 
        # then return that row

        return df
    else:
        # If there are multiple rows for player in year, 
        # then return the total row and remove the other rows

        row = df[df["Team"] == "TOT"]
        row["Team"] = df.iloc[-1,:]
        return row  

In [34]:
 # Set player csv path
player_csv_path = './data/player_csv_data/player_data.csv'

# Load csv file into a dataframe
player_df = pd.read_csv(player_csv_path)

# Remove useless columns
del player_df['Rk']

# Remove asterisk in some of the player names in player_df
player_df['Player'] = player_df['Player'].str.replace('*', '', regex=False)

# For player with multiple row entries, use the total 
# row and remove the other rows
player_df = player_df.groupby(["Player", "Year"]).apply(single_row)

# Drop index levels added by grouping the player data 
player_df.index = player_df.index.droplevel()
player_df.index = player_df.index.droplevel()
player_df.head()

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,Year
75,A.C. Green,30.0,PHO,PF,82.0,55.0,34.5,5.7,11.3,0.502,...,5.8,9.2,1.7,0.9,0.5,1.2,1.7,14.7,,1994.0
640,A.C. Green,31.0,PHO,SF,82.0,52.0,32.8,3.8,7.5,0.504,...,5.8,8.2,1.5,0.7,0.4,1.4,1.8,11.2,,1995.0
1230,A.C. Green,32.0,PHO,SF,82.0,36.0,25.8,2.6,5.4,0.484,...,4.7,6.8,0.9,0.5,0.3,1.0,1.7,7.5,,1996.0
Player,,,A.C. Green,,,,,,,,...,,,,,,,,,,
Age,,,33,,,,,,,,...,,,,,,,,,,


In [35]:
def team_data_cleaning() -> pd.DataFrame:

        # Read in the data from the CSV file
        team_df = pd.read_csv("./data/team_csv_data/team_data.csv")

        # Get rid of rows that have "Division" in the "W" column. W is Wins
        team_df = team_df[~team_df["W"].str.contains("Division")]

        # Remove asteriks in team names
        team_df['Team'] = team_df['Team'].str.replace('*', '', regex=False)

        return team_df

team_df = team_data_cleaning()
team_df.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,57,25,0.695,—,98.5,91.5,6.48,1994,New York Knicks
1,50,32,0.61,7.0,105.7,101.8,3.68,1994,Orlando Magic
2,45,37,0.549,12.0,103.2,101.0,2.11,1994,New Jersey Nets
3,42,40,0.512,15.0,103.4,100.7,2.4,1994,Miami Heat
4,32,50,0.39,25.0,100.8,105.1,-4.28,1994,Boston Celtics


In [38]:
def merge_data() -> None:

    # Merge the player and mvp dataframes
    merged_df = player_df.merge(mvps, on=["Player", "Year"], how="outer")

    # Further process Share, Pts Max and Pts Won columns for playes 
    # who have never had any MVP votes. This is because these stats will 
    # be NaN for those players
    merged_df[["Pts Max", "Pts Won", "Share"]] = merged_df[["Pts Max", "Pts Won", "Share"]].fillna(0, inplace=True)

    # Map abbreviated team names to the team_df
    abbreviations = {}

    # Open the teams_config.csv file
    with open("./config/teams_config.csv") as f:

        # Read all the lines in the file
        lines = f.readlines()

        # For each line - skip the header
        for line in lines[1:]:

            # Replace the backslash in the line with "". 
            # Split the line using the ; that is in the data
            abbrev, name = line.replace("\n", "").split(";")
            abbreviations[abbrev] = name

    # Map the team names to the team_df
    merged_df["Team"] = merged_df["Team"].map(abbreviations)

    # Load and clean team data
    team_df = team_data_cleaning()

    # Combine merged_df with team_df
    stats = merged_df.merge(team_df, how="outer", on=["Year", "Team"])

    # Convert some columns to numerical
    stats = stats.apply(pd.to_numeric, errors="ignore")

    # Replace "-" in GB column with 0: This means that the team 
    # was 0 games behind the first seed. Afterwards, the column 
    # is converted to a float
    stats["GB"] = stats["GB"].str.replace("—", "0")
    stats["GB"] = pd.to_numeric(stats["GB"])

    return stats

stats = merge_data()
stats.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[["Pts Max", "Pts Won", "Share"]] = merged_df[["Pts Max", "Pts Won", "Share"]].fillna(0, inplace=True)


Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,Pts Won,Pts Max,Share,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,30.0,Phoenix Suns,PF,82.0,55.0,34.5,5.7,11.3,0.502,...,,,,56.0,26.0,0.683,7.0,108.2,103.4,4.68
1,Cedric Ceballos,24.0,Phoenix Suns,SF,53.0,43.0,30.2,8.0,15.0,0.535,...,,,,56.0,26.0,0.683,7.0,108.2,103.4,4.68
2,Charles Barkley,30.0,Phoenix Suns,PF,65.0,65.0,35.4,8.0,16.1,0.495,...,,,,56.0,26.0,0.683,7.0,108.2,103.4,4.68
3,Dan Majerle,28.0,Phoenix Suns,SG,80.0,76.0,40.1,6.0,14.2,0.418,...,,,,56.0,26.0,0.683,7.0,108.2,103.4,4.68
4,Danny Ainge,34.0,Phoenix Suns,SG,68.0,1.0,22.9,3.3,7.9,0.417,...,,,,56.0,26.0,0.683,7.0,108.2,103.4,4.68
