In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from itertools import combinations
from collections import OrderedDict

In [2]:
#get player teams into dataframe from player_teams.csv
player_teams = pd.read_csv('../basketballPlayoffs/players_teams.csv')

#get player teams into dataframe from player_teams.csv
coaches = pd.read_csv('../basketballPlayoffs/coaches.csv')

#teams match up results (only post is available)
series_post = pd.read_csv('../basketballPlayoffs/series_post.csv')

#team stats in playoff
teams_post = pd.read_csv('../basketballPlayoffs/teams_post.csv')

#player csv
players = pd.read_csv('../basketballPlayoffs/players.csv')

#awards csv
awards_players = pd.read_csv('../basketballPlayoffs/awards_players.csv')

#teams csv
teams = pd.read_csv('../basketballPlayoffs/teams.csv')

#remove all columns starting with "lgID" from all dataframes
for df in [player_teams, coaches, series_post, teams_post, players, awards_players, teams]:
    for col in df.columns:
        if col.startswith('lgID'):
            df.drop(col, axis=1, inplace=True)




In [3]:
# Assuming you have a column 'year' to sort by
player_teams = player_teams.sort_values(by=['playerID', 'year'])
player_teams['career_year'] = player_teams.groupby('playerID').cumcount() + 1

def compute_percentage(numerator, denominator):
    return round(numerator.divide(denominator).where(denominator != 0, 0.0)*100,2)

# Regular Season Percentages
player_teams['ft%'] = compute_percentage(player_teams['ftMade'], player_teams['ftAttempted'])
player_teams['fg%'] = compute_percentage(player_teams['fgMade'], player_teams['fgAttempted'])
player_teams['three%'] = compute_percentage(player_teams['threeMade'], player_teams['threeAttempted'])
player_teams['gs%'] = compute_percentage(player_teams['GS'], player_teams['GP'])

# Playoffs Percentages
player_teams['Postft%'] = compute_percentage(player_teams['PostftMade'], player_teams['PostftAttempted'])
player_teams['Postfg%'] = compute_percentage(player_teams['PostfgMade'], player_teams['PostfgAttempted'])
player_teams['Postthree%'] = compute_percentage(player_teams['PostthreeMade'], player_teams['PostthreeAttempted'])
player_teams['Postgs%'] = compute_percentage(player_teams['PostGS'], player_teams['PostGP'])

#remove the made and attempted columns
player_teams.drop(['ftMade', 'ftAttempted', 'fgMade', 'fgAttempted', 'threeMade', 'threeAttempted', 'GS', 'GP', 'PostftMade', 'PostftAttempted', 'PostfgMade', 'PostfgAttempted', 'PostthreeMade', 'PostthreeAttempted', 'PostGS', 'PostGP'], axis=1, inplace=True)


player_teams.head()

Unnamed: 0,playerID,year,stint,tmID,minutes,points,oRebounds,dRebounds,rebounds,assists,...,PostDQ,career_year,ft%,fg%,three%,gs%,Postft%,Postfg%,Postthree%,Postgs%
0,abrossv01w,2,0,MIN,846,343,43,131,174,53,...,0,1,72.73,38.91,25.0,88.46,0.0,0.0,0.0,0.0
1,abrossv01w,3,0,MIN,805,314,45,101,146,60,...,0,2,48.28,37.66,33.33,100.0,0.0,0.0,0.0,0.0
2,abrossv01w,4,0,MIN,792,318,44,97,141,82,...,0,3,70.41,39.3,30.49,83.33,100.0,27.27,42.86,100.0
3,abrossv01w,5,0,MIN,462,146,17,57,74,45,...,0,4,60.87,35.25,37.74,50.0,50.0,34.78,25.0,100.0
4,abrossv01w,6,0,MIN,777,304,29,78,107,60,...,0,5,72.6,39.49,40.24,100.0,0.0,0.0,0.0,0.0


In [4]:
coaches['total_games'] = coaches['won']+ coaches['lost']
coaches['W%'] = compute_percentage(coaches['won'],coaches['total_games'])

coaches['total_p_games'] = coaches['post_wins']+ coaches['post_losses']
coaches['postW%'] = compute_percentage(coaches['post_wins'],coaches['total_p_games'])

#remove won and lost columns, and post_wins and post_losses
coaches.drop(['won','lost','post_wins','post_losses'], axis=1, inplace=True)

coaches.head()

Unnamed: 0,coachID,year,tmID,stint,total_games,W%,total_p_games,postW%
0,adamsmi01w,5,WAS,0,34,50.0,3,33.33
1,adubari99w,1,NYL,0,32,62.5,7,57.14
2,adubari99w,2,NYL,0,32,65.62,6,50.0
3,adubari99w,3,NYL,0,32,56.25,8,50.0
4,adubari99w,4,NYL,0,34,47.06,0,0.0


In [5]:
# Get all unique teams
unique_teams = set(series_post['tmIDWinner']).union(series_post['tmIDLoser'])

# Create all possible combinations of matchups (excluding self-matchups)
matchups = combinations(sorted(unique_teams), 2)

# Initialize the records for all matchups
records = OrderedDict(((team1, team2), {'t1winsvst2': 0, 't1lossesvst2': 0}) for team1, team2 in matchups)

# Update win-loss counts
for _, row in series_post.iterrows():
    winner, loser = row['tmIDWinner'], row['tmIDLoser']
    
    if (winner, loser) in records:
        records[(winner, loser)]['t1winsvst2'] += row['W']
        records[(winner, loser)]['t1lossesvst2'] += row['L']
    elif (loser, winner) in records:
        records[(loser, winner)]['t1lossesvst2'] += row['W']
        records[(loser, winner)]['t1winsvst2'] += row['L']

# Convert the OrderedDict to a DataFrame
team_matchups = pd.DataFrame([(k[0], k[1], v['t1winsvst2'], v['t1lossesvst2']) 
                              for k, v in records.items()],
                             columns=['team1', 'team2', 'wins', 'losses'])

# Calculate win probability
team_matchups['totalGames'] = team_matchups['wins'] + team_matchups['losses']
team_matchups['winProb'] = team_matchups.apply(
    lambda row: round((row['wins'] / row['totalGames'])*100,2) if row['totalGames'] != 0 else 50.0,
    axis=1
)

# Drop the totalGames column if it's not needed
team_matchups.drop('totalGames', axis=1, inplace=True)

team_matchups.head()


Unnamed: 0,team1,team2,wins,losses,winProb
0,ATL,CHA,0,0,50.0
1,ATL,CLE,0,0,50.0
2,ATL,CON,0,0,50.0
3,ATL,DET,0,2,0.0
4,ATL,HOU,0,0,50.0


In [6]:
# Group by team ID and sum wins and losses
post_result = teams_post.groupby('tmID').agg({
    'W': 'sum',
    'L': 'sum'
}).reset_index()


In [7]:
#remove players with no birthDate
players = players[players['birthDate'].notnull()]

#remove college and collegeOther columns

players.drop(['college', 'collegeOther', 'deathDate'], axis=1, inplace=True)

#print players with empty columns
players[players.isnull().any(axis=1)]

players.head(200)

Unnamed: 0,bioID,pos,firstseason,lastseason,height,weight,birthDate
0,abrahta01w,C,0,0,74.0,190,1975-09-27
1,abrossv01w,F,0,0,74.0,169,1980-07-09
2,adairje01w,C,0,0,76.0,197,1986-12-19
3,adamsda01w,F-C,0,0,73.0,239,1989-02-19
4,adamsjo01w,C,0,0,75.0,180,1981-05-24
...,...,...,...,...,...,...,...
195,darlihe01w,G,0,0,66.0,164,1978-08-29
196,darscna99w,,0,0,0.0,0,0000-00-00
197,davenje01w,C,0,0,77.0,215,1985-06-24
198,davisbr01w,G,0,0,72.0,172,1983-01-01
