In [417]:
# munging imports
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%matplotlib inline

# modeling imports
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Load Dataset

In [2]:
nba_raw = pd.read_csv('nba_stats_raw.csv')
nba_raw.head()

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,SEASON
0,Trae Young,ATL,23,76,40,36,2651.9,2155,711,1544,...,737,303,72,7,128,3535,42,0,159,2021-22
1,DeMar DeRozan,CHI,32,76,43,33,2742.9,2118,774,1535,...,374,181,68,24,178,3244,6,0,77,2021-22
2,Joel Embiid,PHI,28,68,45,23,2296.4,2079,666,1334,...,284,214,77,99,181,3774,46,2,368,2021-22
3,Jayson Tatum,BOS,24,76,49,27,2731.0,2046,708,1564,...,334,217,75,49,174,3433,22,0,667,2021-22
4,Nikola Jokic,DEN,27,74,46,28,2475.6,2004,764,1311,...,584,281,109,63,191,4338,66,19,444,2021-22


# Cleaning and Prepping Data

In [165]:
# Check datatypes
nba_raw.dtypes

PLAYER     object
TEAM       object
AGE         int64
GP          int64
W           int64
L           int64
MIN       float64
PTS         int64
FGM         int64
FGA         int64
FG%       float64
3PM         int64
3PA         int64
3P%       float64
FTM         int64
FTA         int64
FT%       float64
OREB        int64
DREB        int64
REB         int64
AST         int64
TOV         int64
STL         int64
BLK         int64
PF          int64
FP          int64
DD2         int64
TD3         int64
+/-         int64
SEASON     object
dtype: object

In [4]:
# View stats
nba_raw.describe()

Unnamed: 0,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,3PM,...,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
count,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0,...,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0,12307.0
mean,27.084261,51.284472,25.640042,25.64443,1203.294272,499.8899,186.385634,411.118225,43.447989,36.501909,...,211.858942,110.314293,69.350207,38.225806,24.443081,105.071667,1038.29406,4.273422,0.118875,-0.017957
std,4.33553,25.100181,15.967273,14.69967,891.763451,465.709429,170.733779,366.898757,10.083881,49.279353,...,194.687783,130.805531,62.278953,34.256465,33.164808,73.031412,883.926708,9.153311,1.006281,169.663292
min,18.0,1.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,-761.0
25%,24.0,31.0,12.0,14.0,375.7,114.0,43.0,103.0,39.8,0.0,...,58.0,19.0,19.0,10.0,4.0,40.0,273.0,0.0,0.0,-76.0
50%,26.0,57.0,25.0,25.0,1115.4,378.0,143.0,323.0,43.7,12.0,...,164.0,65.0,54.0,30.0,13.0,101.0,861.0,0.0,0.0,-12.0
75%,30.0,73.0,38.0,36.0,1905.45,763.5,285.0,627.5,48.0,59.0,...,306.0,151.0,103.0,57.0,31.0,159.0,1593.5,4.0,0.0,54.0
max,44.0,85.0,73.0,71.0,3484.1,2832.0,978.0,2173.0,100.0,402.0,...,1247.0,935.0,464.0,225.0,307.0,371.0,4906.0,71.0,42.0,1072.0


In [5]:
# Check for null values in each column
null_counts = nba_raw.isna().sum()
null_counts

PLAYER    0
TEAM      0
AGE       0
GP        0
W         0
L         0
MIN       0
PTS       0
FGM       0
FGA       0
FG%       0
3PM       0
3PA       0
3P%       0
FTM       0
FTA       0
FT%       0
OREB      0
DREB      0
REB       0
AST       0
TOV       0
STL       0
BLK       0
PF        0
FP        0
DD2       0
TD3       0
+/-       0
SEASON    0
dtype: int64

In [167]:
# Check for duplicate rows
duplicate_rows = nba_raw[nba_raw.duplicated()]
duplicate_rows

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,SEASON


In [54]:
# Each row pertains to a player during a particular season/yr
# Rank players by the number of years played
list1 = nba_raw['PLAYER'].value_counts().reset_index()
list1.columns = ['PLAYER', 'NUM_YRS']
list1

Unnamed: 0,PLAYER,NUM_YRS
0,Vince Carter,22
1,Dirk Nowitzki,21
2,Kobe Bryant,20
3,Kevin Garnett,20
4,Jamal Crawford,20
...,...,...
2449,Kevin Murphy,1
2450,Kris Joseph,1
2451,Fab Melo,1
2452,Josh Akognon,1


In [55]:
# Merge 2 dataframes
nba = pd.merge(nba_raw, list1, on='PLAYER', how='left')

In [56]:
nba.head(10)

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,SEASON,NUM_YRS
0,Trae Young,ATL,23,76,40,36,2651.9,2155,711,1544,...,303,72,7,128,3535,42,0,159,2021-22,4
1,DeMar DeRozan,CHI,32,76,43,33,2742.9,2118,774,1535,...,181,68,24,178,3244,6,0,77,2021-22,13
2,Joel Embiid,PHI,28,68,45,23,2296.4,2079,666,1334,...,214,77,99,181,3774,46,2,368,2021-22,6
3,Jayson Tatum,BOS,24,76,49,27,2731.0,2046,708,1564,...,217,75,49,174,3433,22,0,667,2021-22,5
4,Nikola Jokic,DEN,27,74,46,28,2475.6,2004,764,1311,...,281,109,63,191,4338,66,19,444,2021-22,7
5,Giannis Antetokounmpo,MIL,27,67,45,22,2204.2,2002,689,1245,...,219,72,91,212,3788,46,4,397,2021-22,9
6,Luka Doncic,DAL,23,65,44,21,2300.7,1847,641,1403,...,292,75,36,145,3452,44,10,146,2021-22,4
7,Devin Booker,PHX,25,68,56,12,2344.6,1822,662,1421,...,162,77,26,180,2873,5,0,469,2021-22,7
8,Karl-Anthony Towns,MIN,26,74,44,30,2475.5,1818,642,1214,...,226,72,83,267,3333,40,1,275,2021-22,7
9,Donovan Mitchell,UTA,25,67,41,26,2265.6,1733,617,1376,...,200,99,12,164,2741,3,0,276,2021-22,5


In [57]:
# Add a target column: > 4 yrs - "1", <= 4 yrs - "0"
def label_target(df):
    df = df.assign(TARGET_4UP=0)
    df.loc[df['NUM_YRS'] >4, 'TARGET_4UP']=1
    return df

In [58]:
# Dataframe with players and the number of years they've played, and whether or not they've played for more than 4 years - 'Yes' = 1, 'No' = 0
nba2 = label_target(nba)
nba2

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,STL,BLK,PF,FP,DD2,TD3,+/-,SEASON,NUM_YRS,TARGET_4UP
0,Trae Young,ATL,23,76,40,36,2651.9,2155,711,1544,...,72,7,128,3535,42,0,159,2021-22,4,0
1,DeMar DeRozan,CHI,32,76,43,33,2742.9,2118,774,1535,...,68,24,178,3244,6,0,77,2021-22,13,1
2,Joel Embiid,PHI,28,68,45,23,2296.4,2079,666,1334,...,77,99,181,3774,46,2,368,2021-22,6,1
3,Jayson Tatum,BOS,24,76,49,27,2731.0,2046,708,1564,...,75,49,174,3433,22,0,667,2021-22,5,1
4,Nikola Jokic,DEN,27,74,46,28,2475.6,2004,764,1311,...,109,63,191,4338,66,19,444,2021-22,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12302,Anthony Miller,ATL,25,1,0,1,13.7,0,0,5,...,0,0,2,8,0,0,-14,1996-97,6,1
12303,Bruce Bowen,MIA,26,1,1,0,0.6,0,0,0,...,0,1,0,3,0,0,3,1996-97,13,1
12304,Cuonzo Martin,MIL,25,3,0,3,13.0,0,0,7,...,0,0,1,2,0,0,-2,1996-97,1,0
12305,Derrick Alston,ATL,24,2,1,1,10.6,0,0,5,...,0,0,0,5,0,0,-7,1996-97,1,0


In [63]:
# See all columns
nba2.columns

Index(['PLAYER', 'TEAM', 'AGE', 'GP', 'W', 'L', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB',
       'AST', 'TOV', 'STL', 'BLK', 'PF', 'FP', 'DD2', 'TD3', '+/-', 'SEASON',
       'NUM_YRS', 'TARGET_4UP'],
      dtype='object')

In [73]:
# Drop column features that would cause major collinearity issues
nba3 = nba2.drop(columns=['FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'FP'])
nba3.head()

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FG%,3P%,...,TOV,STL,BLK,PF,DD2,TD3,+/-,SEASON,NUM_YRS,TARGET_4UP
0,Trae Young,ATL,23,76,40,36,2651.9,2155,46.0,38.2,...,303,72,7,128,42,0,159,2021-22,4,0
1,DeMar DeRozan,CHI,32,76,43,33,2742.9,2118,50.4,35.2,...,181,68,24,178,6,0,77,2021-22,13,1
2,Joel Embiid,PHI,28,68,45,23,2296.4,2079,49.9,37.1,...,214,77,99,181,46,2,368,2021-22,6,1
3,Jayson Tatum,BOS,24,76,49,27,2731.0,2046,45.3,35.3,...,217,75,49,174,22,0,667,2021-22,5,1
4,Nikola Jokic,DEN,27,74,46,28,2475.6,2004,58.3,33.7,...,281,109,63,191,66,19,444,2021-22,7,1


In [79]:
# Create function to locate only the rookie year (first year) stat for each player
def find_rookie_yr(df):
    rookie_yrs = df.groupby(['PLAYER'])['AGE'].min().reset_index()
    return rookie_yrs
    

In [81]:
# Dataframe of rookie year stats
rookies = find_rookie_yr(nba3)
rookies

Unnamed: 0,PLAYER,AGE
0,A.C. Green,33
1,A.J. Bramlett,23
2,A.J. Guyton,23
3,AJ Hammons,24
4,AJ Price,23
...,...,...
2449,Zion Williamson,19
2450,Zoran Dragic,26
2451,Zoran Planinic,21
2452,Zydrunas Ilgauskas,23


In [82]:
# Merge the two dataframes together. Only the rookie year for each player will be shown with their full stats for that year
rookie_yrs = nba3.merge(rookies, how='inner', indicator=False)
rookie_yrs

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FG%,3P%,...,TOV,STL,BLK,PF,DD2,TD3,+/-,SEASON,NUM_YRS,TARGET_4UP
0,Franz Wagner,ORL,20,79,21,58,2429.0,1197,46.8,35.4,...,119,68,34,164,2,0,-244,2021-22,1,0
1,Jalen Green,HOU,20,67,12,55,2137.7,1157,42.6,34.3,...,135,44,18,103,0,0,-519,2021-22,1,0
2,Scottie Barnes,TOR,20,74,43,31,2617.3,1134,49.2,30.1,...,136,80,55,192,13,0,66,2021-22,1,0
3,Cade Cunningham,DET,20,64,20,44,2088.1,1114,41.6,31.4,...,234,78,43,200,10,2,-275,2021-22,1,0
4,Evan Mobley,CLE,21,69,40,29,2331.0,1034,50.8,25.0,...,133,56,115,147,21,0,95,2021-22,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2453,Anthony Miller,ATL,25,1,0,1,13.7,0,0.0,0.0,...,0,0,0,2,0,0,-14,1996-97,6,1
2454,Bruce Bowen,MIA,26,1,1,0,0.6,0,0.0,0.0,...,0,0,1,0,0,0,3,1996-97,13,1
2455,Cuonzo Martin,MIL,25,3,0,3,13.0,0,0.0,0.0,...,1,0,0,1,0,0,-2,1996-97,1,0
2456,Derrick Alston,ATL,24,2,1,1,10.6,0,0.0,0.0,...,0,0,0,0,0,0,-7,1996-97,1,0


# Save File

In [166]:
# csv file
rookie_yrs.to_csv('rookie_yrs.csv', encoding='utf-8', index=False)

In [225]:
# pickle file
rookie_yrs.to_pickle("rookie_yrs_pkl")