# Setup and Data    

In [2]:
import pandas as pd

awards = pd.read_csv(".../Datasets/awards_data.csv")
player_data = pd.read_csv(".../Datasets/player_stats.csv")
team_data = pd.read_csv(".../Datasets/team_stats.csv")
rebounding_data = pd.read_csv(".../Datasets/team_rebounding_data_22.csv")

What is the average number of points per game for players in the 2007-2021 seasons who won All NBA First, Second, and Third teams (**not** the All Defensive Teams), as well as for players who were in the All-Star Game (**not** the rookie all-star game)?


 

In [3]:
playerstats = awards.merge(player_data, on='nbapersonid', how='inner')

In [4]:
first_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['All NBA First Team'] == 1.0)]

first_team_ppg = first_team['points'] / first_team['games']

print("First Team points per game:",first_team_ppg.mean())



sec_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['All NBA Second Team'] == 1.0)]

sec_team_ppg = sec_team['points'] / sec_team['games']

print("Second Team points per game:",sec_team_ppg.mean())



third_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['All NBA Third Team'] == 1.0)]

third_team_ppg = third_team['points'] / third_team['games']

print("Third Team points per game:",third_team_ppg.mean())



allstar_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['all_star_game'])]

allstar_team_ppg = allstar_team['points'] / allstar_team['games']

print("All-Star Team points per game:",allstar_team_ppg.mean())

First Team points per game: 22.3096871795066
Second Team points per game: 19.602453597125052
Third Team points per game: 17.406737392538716
All-Star Team points per game: 18.73400715244671


What was the average number of years of experience in the league it takes for players to make their first All NBA Selection (1st, 2nd, or 3rd team)? Please limit your sample to players drafted in 2007 or later who did eventually go on to win at least one All NBA selection. 


In [5]:
allnba_players = playerstats[(playerstats['draftyear'] >= 2007) &
                                ((playerstats['All NBA First Team'] == 1.0) |
                                 (playerstats['All NBA Second Team'] == 1.0) |
                                 (playerstats['All NBA Third Team'] == 1.0))]

allnba_columns = ['All NBA First Team', 'All NBA Second Team', 'All NBA Third Team']
allnba_players = allnba_players[allnba_players[allnba_columns].sum(axis=1) > 0]


first_allnba_years = allnba_players.groupby('nbapersonid')['season_x'].min()

years_to_first_allnba = first_allnba_years - allnba_players.groupby('nbapersonid')['draftyear'].min() + 1

print("Average years to first All NBA selection:", years_to_first_allnba.mean())

Average years to first All NBA selection: 4.682926829268292


## Data Cleaning Interlude  

You're going to work to create a dataset with a "career outcome" for each player, representing the highest level of success that the player achieved for **at least two** seasons *after his first four seasons in the league*. On a single season level, the outcomes are:  

- Elite: A player is "Elite" in a season if he won any All NBA award (1st, 2nd, or 3rd team), MVP, or DPOY in that season.    
- All-Star: A player is "All-Star" in a season if he was selected to be an All-Star that season.   
- Starter:  A player is a "Starter" in a season if he started in at least 41 games in the season OR if he played at least 2000 minutes in the season.    
- Rotation:  A player is a "Rotation" player in a season if he played at least 1000 minutes in the season.   
- Roster:  A player is a "Roster" player in a season if he played at least 1 minute for an NBA team but did not meet any of the above criteria.     
- Out of the League: A player is "Out of the League" if he is not in the NBA in that season.   


In [6]:
playerinfo = player_data.merge(awards, on='nbapersonid', how='left')
players_2010_draft = playerinfo[playerinfo['draftyear'] == 2010]
seasongames = team_data.merge(players_2010_draft, on='nbateamid', how='right')

def calculate_career_outcome(player_df):
    elite_count = 0
    all_star_count = 0
    starter_count = 0
    rotation_count = 0
    roster_count = 0
    out_of_league_count = 0
    
    for index, season in player_df.iterrows():
        if index >= 4:

            adjusted_minutes = season['mins']
            adjusted_games_started = season['games']

            if (season['All NBA First Team'] > 0) or (season['All NBA Second Team'] > 0) or (season['All NBA Third Team'] > 0):
                elite_count += 1
            elif season['all_star_game'] == True:
                all_star_count += 1
            elif (adjusted_games_started >= 41) or (adjusted_minutes >= 2000):
                starter_count += 1
            elif adjusted_minutes >= 1000 and adjusted_minutes < 2000:
                rotation_count += 1
            elif adjusted_minutes >= 1:
                roster_count += 1
    
    if elite_count >= 2:
        return "Elite"
    elif all_star_count >= 2:
        return "All-Star"
    elif starter_count >= 2:
        return "Starter"
    elif rotation_count >= 2:
        return "Rotation"
    elif roster_count >= 2:
        return "Roster"
    else:
        return "Out of the League"


results_list = []


for player, data in players_2010_draft.groupby('player'):
    career_outcome = calculate_career_outcome(data)
    results_list.append({'Player': player, 'Career Outcome': career_outcome})

results = pd.DataFrame(results_list)

outcome_counts = results['Career Outcome'].value_counts()

print(outcome_counts)

Starter              31
Roster               20
Out of the League    18
Elite                 3
All-Star              1
Name: Career Outcome, dtype: int64


### Open Ended Modeling

Making a prediction on which players drafted in 2018 or later, will make the All-Star Game, using the data of players drafted on or before 2015 to train the model


In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

training_data = playerstats[playerstats['draftyear'] <= 2015].copy()
training_data['points_per_game'] = training_data['points'] / training_data['games']

X = training_data[['points', 'ast', 'steals', 'blocks', 'tot_reb', 'PER', 'points_per_game']]
y = training_data['all_star_game'].copy()  # 1 if a player becomes an All-Star, 0 otherwise
y.fillna(False, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

newplayers = playerstats[playerstats['draftyear'] >= 2018].copy()
newplayers['points_per_game'] = newplayers['points'] / newplayers['games']
new_players_data = newplayers[['points', 'ast', 'steals', 'blocks', 'tot_reb', 'PER', 'points_per_game']]
predictions = model.predict(new_players_data)

newplayers['predicted_all_star'] = predictions

all_star_players = newplayers[newplayers['predicted_all_star'] == True]

grouped_players = all_star_players.groupby('player')['predicted_all_star'].max()


print(grouped_players)


Accuracy: 0.8790214477211796
Classification Report:
              precision    recall  f1-score   support

       False       0.89      0.98      0.93      5102
        True       0.70      0.30      0.41       866

    accuracy                           0.88      5968
   macro avg       0.79      0.64      0.67      5968
weighted avg       0.86      0.88      0.86      5968

player
Ja Morant                  True
Luka Doncic                True
Luka Dončić                True
Shai Gilgeous-Alexander    True
Trae Young                 True
Zion Williamson            True
Name: predicted_all_star, dtype: bool


## Predicting Team Stats  

Calculate what OKC's predicted offensive rebound percent is for game 81 in the data. That is, use games 1-80 to predict game 81.  

In [8]:
okc_data = rebounding_data[rebounding_data['team'] == 'OKC']

average_offensive_rebound_percent = okc_data.iloc[:80]['offensive_rebounds'].sum() / okc_data.iloc[:80]['off_rebound_chances'].sum()

predicted_offensive_rebound_percent = average_offensive_rebound_percent * 100

print("Predicted Offensive Rebound Percentage for Game 81:", predicted_offensive_rebound_percent)

Predicted Offensive Rebound Percentage for Game 81: 28.8689755388714
