## Import required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Load the dataset

In [2]:
data = pd.read_csv('/Users/wongyenchik/Desktop/Data Science Toolbox/players_stats_by_season_full_details.csv')
pd.options.display.max_columns = None

## Data Preprocessing

In [3]:
data.head()

Unnamed: 0,League,Season,Season_Year,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,REB,AST,STL,BLK,PTS,birth_year,birth_month,birth_date,height,height_cm,weight,weight_kg,nationality,high_school,draft_round,draft_pick,draft_team
0,NBA,1999 - 2000,2000,Regular_Season,Shaquille O'Neal,LAL,79,3163.0,956,1665,0,1,432,824,223,255,336,742,1078,299,36,239,2344,1972.0,Mar,6-Mar-72,1-Jul,216.0,325.0,147.0,United States,Robert G. Cole High School,1.0,1.0,Orlando Magic
1,NBA,1999 - 2000,2000,Regular_Season,Vince Carter,TOR,82,3126.0,788,1696,95,236,436,551,178,263,150,326,476,322,110,92,2107,1977.0,Jan,26-Jan-77,6-Jun,198.0,220.0,100.0,United States,Mainland High School,1.0,5.0,Golden State Warriors
2,NBA,1999 - 2000,2000,Regular_Season,Karl Malone,UTA,82,2947.0,752,1476,2,8,589,739,231,229,169,610,779,304,79,71,2095,1963.0,Jul,24-Jul-63,9-Jun,206.0,265.0,120.0,United States,Summerfield High School,1.0,13.0,Utah Jazz
3,NBA,1999 - 2000,2000,Regular_Season,Allen Iverson,PHI,70,2853.0,729,1733,89,261,442,620,230,162,71,196,267,328,144,5,1989,1975.0,Jun,7-Jun-75,Jun-00,183.0,165.0,75.0,United States,Bethel High School,1.0,1.0,Philadelphia Sixers
4,NBA,1999 - 2000,2000,Regular_Season,Gary Payton,SEA,82,3425.0,747,1666,177,520,311,423,224,178,100,429,529,732,153,18,1982,1968.0,Jul,23-Jul-68,4-Jun,193.0,180.0,82.0,United States,Skyline High School,1.0,2.0,Seattle SuperSonics


In [4]:
data["FG%"] = data.FGM / data.FGA
data["3P%"] = data["3PM"] / data["3PA"]
data["Age"] = data.Season_Year - data.birth_year
data["PPG"] = data.PTS / data.GP
data["RPG"] = data.REB / data.GP
data["SPG"] = data.STL / data.GP
data["BPG"] = data.BLK / data.GP
data["APG"] = data.AST / data.GP

In [5]:
data.head()

Unnamed: 0,League,Season,Season_Year,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,REB,AST,STL,BLK,PTS,birth_year,birth_month,birth_date,height,height_cm,weight,weight_kg,nationality,high_school,draft_round,draft_pick,draft_team,FG%,3P%,Age,PPG,RPG,SPG,BPG,APG
0,NBA,1999 - 2000,2000,Regular_Season,Shaquille O'Neal,LAL,79,3163.0,956,1665,0,1,432,824,223,255,336,742,1078,299,36,239,2344,1972.0,Mar,6-Mar-72,1-Jul,216.0,325.0,147.0,United States,Robert G. Cole High School,1.0,1.0,Orlando Magic,0.574174,0.0,28.0,29.670886,13.64557,0.455696,3.025316,3.78481
1,NBA,1999 - 2000,2000,Regular_Season,Vince Carter,TOR,82,3126.0,788,1696,95,236,436,551,178,263,150,326,476,322,110,92,2107,1977.0,Jan,26-Jan-77,6-Jun,198.0,220.0,100.0,United States,Mainland High School,1.0,5.0,Golden State Warriors,0.464623,0.402542,23.0,25.695122,5.804878,1.341463,1.121951,3.926829
2,NBA,1999 - 2000,2000,Regular_Season,Karl Malone,UTA,82,2947.0,752,1476,2,8,589,739,231,229,169,610,779,304,79,71,2095,1963.0,Jul,24-Jul-63,9-Jun,206.0,265.0,120.0,United States,Summerfield High School,1.0,13.0,Utah Jazz,0.509485,0.25,37.0,25.54878,9.5,0.963415,0.865854,3.707317
3,NBA,1999 - 2000,2000,Regular_Season,Allen Iverson,PHI,70,2853.0,729,1733,89,261,442,620,230,162,71,196,267,328,144,5,1989,1975.0,Jun,7-Jun-75,Jun-00,183.0,165.0,75.0,United States,Bethel High School,1.0,1.0,Philadelphia Sixers,0.420658,0.340996,25.0,28.414286,3.814286,2.057143,0.071429,4.685714
4,NBA,1999 - 2000,2000,Regular_Season,Gary Payton,SEA,82,3425.0,747,1666,177,520,311,423,224,178,100,429,529,732,153,18,1982,1968.0,Jul,23-Jul-68,4-Jun,193.0,180.0,82.0,United States,Skyline High School,1.0,2.0,Seattle SuperSonics,0.448379,0.340385,32.0,24.170732,6.45122,1.865854,0.219512,8.926829


In [6]:
data.describe()

Unnamed: 0,Season_Year,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,REB,AST,STL,BLK,PTS,birth_year,height_cm,weight,weight_kg,draft_round,draft_pick,FG%,3P%,Age,PPG,RPG,SPG,BPG,APG
count,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53631.0,53875.0,49385.0,49385.0,10136.0,10136.0,53943.0,50806.0,53631.0,53949.0,53949.0,53949.0,53949.0,53949.0
mean,2014.204638,30.313574,752.431404,113.200541,245.094942,28.468535,80.738383,56.297299,76.012716,47.260487,70.10104,34.325048,90.601216,124.926264,62.78691,26.715398,10.492057,311.178372,1986.361675,197.445123,210.309527,95.422193,1.38753,14.053177,0.457034,0.317017,27.823125,9.953104,3.986357,0.872907,0.317607,1.982133
std,4.971016,17.849616,534.216679,100.164033,212.155076,30.673395,80.672208,59.24065,76.172698,37.398461,45.62977,37.003235,84.6291,117.293566,73.184287,22.077459,18.199867,271.81159,6.637023,8.728587,26.128059,11.851299,0.508224,8.643064,0.080094,0.128674,4.270519,4.759802,2.199844,0.478501,0.398981,1.498618
min,2000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1961.0,160.0,130.0,59.0,1.0,1.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0
25%,2012.0,17.0,380.9,48.0,109.0,6.0,20.0,20.0,28.0,21.0,37.0,11.0,37.0,50.0,20.0,11.0,1.0,134.0,1982.0,191.0,190.0,86.0,1.0,6.0,0.407407,0.272727,25.0,6.551724,2.392857,0.531915,0.058824,0.923077
50%,2015.0,29.0,663.0,89.0,196.0,20.0,61.0,40.0,56.0,39.0,64.0,23.0,68.0,93.0,41.0,21.0,4.0,247.0,1987.0,198.0,209.0,95.0,1.0,13.0,0.450617,0.333333,27.0,9.342105,3.529412,0.794118,0.179487,1.548387
75%,2018.0,37.0,954.0,145.0,310.0,41.0,117.0,73.0,99.0,63.0,91.0,44.0,114.0,159.0,78.0,36.0,12.0,399.0,1991.0,203.0,229.0,104.0,2.0,22.0,0.5,0.385714,31.0,12.6,5.090909,1.126582,0.423077,2.631579
max,2020.0,85.0,3485.0,978.0,2173.0,402.0,1028.0,756.0,972.0,464.0,371.0,440.0,894.0,1247.0,925.0,225.0,307.0,2832.0,2004.0,229.0,375.0,170.0,7.0,30.0,1.0,1.0,50.0,41.972222,18.666667,4.3,4.333333,13.272727


In [7]:
data.isnull().sum()

League             0
Season             0
Season_Year        0
Stage              0
Player             0
Team              11
GP                 0
MIN                0
FGM                0
FGA                0
3PM                0
3PA                0
FTM                0
FTA                0
TOV                0
PF                 0
ORB                0
DRB                0
REB                0
AST                0
STL                0
BLK                0
PTS                0
birth_year       318
birth_month      318
birth_date       318
height            74
height_cm         74
weight          4564
weight_kg       4564
nationality        7
high_school    30266
draft_round    43813
draft_pick     43813
draft_team     43813
FG%                6
3P%             3143
Age              318
PPG                0
RPG                0
SPG                0
BPG                0
APG                0
dtype: int64

In [8]:
# Checking why there are alot of NULL values in high_school column

highschool_data = data[data['high_school'].isnull()]

In [9]:
highschool_data.head()

Unnamed: 0,League,Season,Season_Year,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,REB,AST,STL,BLK,PTS,birth_year,birth_month,birth_date,height,height_cm,weight,weight_kg,nationality,high_school,draft_round,draft_pick,draft_team,FG%,3P%,Age,PPG,RPG,SPG,BPG,APG
30,NBA,1999 - 2000,2000,Regular_Season,Dirk Nowitzki,DAL,82,2938.0,515,1118,116,306,289,348,141,256,102,430,532,203,63,68,1435,1978.0,Jun,19-Jun-78,Jul-00,213.0,245.0,111.0,Germany,,1.0,9.0,Milwaukee Bucks,0.460644,0.379085,22.0,17.5,6.487805,0.768293,0.829268,2.47561
65,NBA,1999 - 2000,2000,Regular_Season,Vlade Divac,SAC,82,2374.0,384,764,7,26,230,333,190,251,174,482,656,244,103,103,1005,1968.0,Feb,3-Feb-68,1-Jul,216.0,260.0,118.0,Serbia,,1.0,26.0,Los Angeles Lakers,0.502618,0.269231,32.0,12.256098,8.0,1.256098,1.256098,2.97561
86,NBA,1999 - 2000,2000,Regular_Season,Peja Stojakovic,SAC,74,1749.0,321,717,100,267,135,153,88,97,74,202,276,106,52,7,877,1977.0,Jun,9-Jun-77,10-Jun,208.0,229.0,104.0,Serbia / Greece,,1.0,14.0,Sacramento Kings,0.447699,0.374532,23.0,11.851351,3.72973,0.702703,0.094595,1.432432
103,NBA,1999 - 2000,2000,Regular_Season,Arvydas Sabonis,POR,66,1686.0,302,598,7,19,167,198,97,184,97,416,513,118,43,78,778,1964.0,Dec,19-Dec-64,3-Jul,221.0,279.0,127.0,Lithuania,,4.0,7.0,Atlanta Hawks (1985);Round 1,0.505017,0.368421,36.0,11.787879,7.772727,0.651515,1.181818,1.787879
168,NBA,1999 - 2000,2000,Regular_Season,Rasho Nesterovic,MIN,82,1723.0,206,433,0,2,59,103,71,262,135,244,379,93,21,85,471,1976.0,May,30-May-76,Jul-00,213.0,255.0,116.0,Slovenia,,1.0,17.0,Minnesota Timberwolves,0.475751,0.0,24.0,5.743902,4.621951,0.256098,1.036585,1.134146


In [10]:
## Changing the NaN with noHS for all the basketball players that don't have their highschools confirmed

data['high_school'].fillna('NoHS', inplace =True)

# NBA Data

In [11]:
nba_data = data[data['League'] == 'NBA']

In [12]:
nba_data.head()

Unnamed: 0,League,Season,Season_Year,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,REB,AST,STL,BLK,PTS,birth_year,birth_month,birth_date,height,height_cm,weight,weight_kg,nationality,high_school,draft_round,draft_pick,draft_team,FG%,3P%,Age,PPG,RPG,SPG,BPG,APG
0,NBA,1999 - 2000,2000,Regular_Season,Shaquille O'Neal,LAL,79,3163.0,956,1665,0,1,432,824,223,255,336,742,1078,299,36,239,2344,1972.0,Mar,6-Mar-72,1-Jul,216.0,325.0,147.0,United States,Robert G. Cole High School,1.0,1.0,Orlando Magic,0.574174,0.0,28.0,29.670886,13.64557,0.455696,3.025316,3.78481
1,NBA,1999 - 2000,2000,Regular_Season,Vince Carter,TOR,82,3126.0,788,1696,95,236,436,551,178,263,150,326,476,322,110,92,2107,1977.0,Jan,26-Jan-77,6-Jun,198.0,220.0,100.0,United States,Mainland High School,1.0,5.0,Golden State Warriors,0.464623,0.402542,23.0,25.695122,5.804878,1.341463,1.121951,3.926829
2,NBA,1999 - 2000,2000,Regular_Season,Karl Malone,UTA,82,2947.0,752,1476,2,8,589,739,231,229,169,610,779,304,79,71,2095,1963.0,Jul,24-Jul-63,9-Jun,206.0,265.0,120.0,United States,Summerfield High School,1.0,13.0,Utah Jazz,0.509485,0.25,37.0,25.54878,9.5,0.963415,0.865854,3.707317
3,NBA,1999 - 2000,2000,Regular_Season,Allen Iverson,PHI,70,2853.0,729,1733,89,261,442,620,230,162,71,196,267,328,144,5,1989,1975.0,Jun,7-Jun-75,Jun-00,183.0,165.0,75.0,United States,Bethel High School,1.0,1.0,Philadelphia Sixers,0.420658,0.340996,25.0,28.414286,3.814286,2.057143,0.071429,4.685714
4,NBA,1999 - 2000,2000,Regular_Season,Gary Payton,SEA,82,3425.0,747,1666,177,520,311,423,224,178,100,429,529,732,153,18,1982,1968.0,Jul,23-Jul-68,4-Jun,193.0,180.0,82.0,United States,Skyline High School,1.0,2.0,Seattle SuperSonics,0.448379,0.340385,32.0,24.170732,6.45122,1.865854,0.219512,8.926829


In [13]:
nba_data.isnull().sum()

League           0
Season           0
Season_Year      0
Stage            0
Player           0
Team             0
GP               0
MIN              0
FGM              0
FGA              0
3PM              0
3PA              0
FTM              0
FTA              0
TOV              0
PF               0
ORB              0
DRB              0
REB              0
AST              0
STL              0
BLK              0
PTS              0
birth_year       0
birth_month      0
birth_date       0
height           0
height_cm        0
weight           0
weight_kg        0
nationality      0
high_school      0
draft_round    752
draft_pick     752
draft_team     752
FG%              2
3P%            844
Age              0
PPG              0
RPG              0
SPG              0
BPG              0
APG              0
dtype: int64

In [14]:
# Checking why draft_round, draft_pick, and draft_team have identical NULL values

draft_data = nba_data[nba_data['draft_round'].isnull()]

In [15]:
draft_data.head()

Unnamed: 0,League,Season,Season_Year,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,REB,AST,STL,BLK,PTS,birth_year,birth_month,birth_date,height,height_cm,weight,weight_kg,nationality,high_school,draft_round,draft_pick,draft_team,FG%,3P%,Age,PPG,RPG,SPG,BPG,APG
34,NBA,1999 - 2000,2000,Regular_Season,Darrell Armstrong,ORL,82,2590.0,484,1119,137,403,225,247,248,137,65,205,270,501,169,9,1330,1968.0,Jun,22-Jun-68,Jun-00,183.0,170.0,77.0,United States,Ashbrook High School,,,,0.432529,0.33995,32.0,16.219512,3.292683,2.060976,0.109756,6.109756
55,NBA,1999 - 2000,2000,Regular_Season,David Wesley,CHA,82,2760.0,407,955,88,248,214,275,159,186,39,186,225,463,109,11,1116,1970.0,Nov,14-Nov-70,Jun-00,183.0,190.0,86.0,United States,Longview High School,,,,0.426178,0.354839,30.0,13.609756,2.743902,1.329268,0.134146,5.646341
72,NBA,1999 - 2000,2000,Regular_Season,Tyrone Nesby,LAC,73,2317.0,364,915,94,281,151,191,102,205,82,193,275,121,75,31,973,1976.0,Jan,31-Jan-76,6-Jun,198.0,225.0,102.0,United States,Cairo High School,,,,0.397814,0.33452,24.0,13.328767,3.767123,1.027397,0.424658,1.657534
82,NBA,1999 - 2000,2000,Regular_Season,Avery Johnson,SAS,82,2571.0,402,850,1,9,114,155,140,150,33,125,158,491,76,18,919,1965.0,Mar,25-Mar-65,10-May,178.0,175.0,79.0,United States,St. Augustine High School,,,,0.472941,0.111111,35.0,11.207317,1.926829,0.926829,0.219512,5.987805
90,NBA,1999 - 2000,2000,Regular_Season,John Amaechi,ORL,80,1684.0,306,700,1,6,223,291,139,161,62,204,266,95,35,37,836,1970.0,Nov,26-Nov-70,10-Jun,208.0,270.0,122.0,United States,St. John's Jesuit High School and Academy,,,,0.437143,0.166667,30.0,10.45,3.325,0.4375,0.4625,1.1875


In [16]:
# After we did some research these players randomly, we discovered that most of them really are undrafted so we did some changes in the dataset below

nba_data['draft_round'].fillna(0, inplace =True)
nba_data['draft_pick'].fillna(0, inplace =True)
nba_data['draft_team'].fillna('Undrafted', inplace =True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_data['draft_round'].fillna(0, inplace =True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_data['draft_pick'].fillna(0, inplace =True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_data['draft_team'].fillna('Undrafted', inplace =True)


In [17]:
#Checking the NULL values in FG%

fg = nba_data[nba_data['FG%'].isnull()]
fg.head()

Unnamed: 0,League,Season,Season_Year,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,REB,AST,STL,BLK,PTS,birth_year,birth_month,birth_date,height,height_cm,weight,weight_kg,nationality,high_school,draft_round,draft_pick,draft_team,FG%,3P%,Age,PPG,RPG,SPG,BPG,APG
45317,NBA,2018 - 2019,2019,Regular_Season,Tyler Ulis,CHI,1,0.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1996.0,Jan,5-Jan-96,10-May,178.0,149.0,68.0,United States,Marian Catholic High School,2.0,4.0,Phoenix Suns,,,23.0,0.0,0.0,0.0,0.0,0.0
45318,NBA,2018 - 2019,2019,Regular_Season,John Holland,CLE,1,0.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1988.0,Nov,6-Nov-88,5-Jun,196.0,205.0,93.0,United States / Puerto Rico,Fordham Preparatory School,0.0,0.0,Undrafted,,,31.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# We can see that the data didn't have FGM and FGA in both of these players, so we just replace null values with 0

nba_data['FG%'].fillna(0, inplace =True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_data['FG%'].fillna(0, inplace =True)


In [19]:
# Checking the NULL values again
nba_data.isnull().sum()

League           0
Season           0
Season_Year      0
Stage            0
Player           0
Team             0
GP               0
MIN              0
FGM              0
FGA              0
3PM              0
3PA              0
FTM              0
FTA              0
TOV              0
PF               0
ORB              0
DRB              0
REB              0
AST              0
STL              0
BLK              0
PTS              0
birth_year       0
birth_month      0
birth_date       0
height           0
height_cm        0
weight           0
weight_kg        0
nationality      0
high_school      0
draft_round      0
draft_pick       0
draft_team       0
FG%              0
3P%            844
Age              0
PPG              0
RPG              0
SPG              0
BPG              0
APG              0
dtype: int64

In [20]:
# checking why the 3P% counts as NULL

three_data = nba_data[nba_data['3P%'].isnull()]

In [21]:
three_data.head()

Unnamed: 0,League,Season,Season_Year,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,REB,AST,STL,BLK,PTS,birth_year,birth_month,birth_date,height,height_cm,weight,weight_kg,nationality,high_school,draft_round,draft_pick,draft_team,FG%,3P%,Age,PPG,RPG,SPG,BPG,APG
78,NBA,1999 - 2000,2000,Regular_Season,Dikembe Mutombo,ATL,82,2984.0,322,573,0,0,298,421,174,248,304,853,1157,105,27,269,942,1966.0,Jun,25-Jun-66,2-Jul,218.0,260.0,118.0,Democratic Republic of the Congo,Institute Boboto,1.0,4.0,Denver Nuggets,0.561955,,34.0,11.487805,14.109756,0.329268,3.280488,1.280488
84,NBA,1999 - 2000,2000,Regular_Season,Antonio Davis,TOR,79,2479.0,313,712,0,0,284,371,121,267,235,461,696,105,38,100,910,1968.0,Oct,31-Oct-68,9-Jun,206.0,245.0,111.0,United States,McClymonds High School,2.0,18.0,Indiana Pacers,0.439607,,32.0,11.518987,8.810127,0.481013,1.265823,1.329114
99,NBA,1999 - 2000,2000,Regular_Season,Corliss Williamson,SAC,76,1707.0,311,622,0,0,163,212,110,192,122,168,290,82,38,19,785,1973.0,Dec,4-Dec-73,7-Jun,201.0,245.0,111.0,United States,Russellville High School,1.0,13.0,Sacramento Kings,0.5,,27.0,10.328947,3.815789,0.5,0.25,1.078947
100,NBA,1999 - 2000,2000,Regular_Season,Michael Olowokandi,LAC,80,2493.0,330,756,0,0,123,189,177,304,194,462,656,38,35,140,783,1975.0,Apr,3-Apr-75,Jul-00,213.0,270.0,122.0,Nigeria / England,Newlands Manor School,1.0,1.0,Los Angeles Clippers,0.436508,,25.0,9.7875,8.2,0.4375,1.75,0.475
107,NBA,1999 - 2000,2000,Regular_Season,Dale Davis,IND,74,2127.0,302,602,0,0,139,203,91,203,256,473,729,64,52,94,743,1969.0,Mar,25-Mar-69,11-Jun,211.0,252.0,114.0,United States,Stephens High School,1.0,13.0,Indiana Pacers,0.501661,,31.0,10.040541,9.851351,0.702703,1.27027,0.864865


In [22]:
## The 3PT percentage is NULL due to these players had 0 three point attempt and field goal so 0 divide by 0 will be infinite thus the dataset set the 3P% as NaN

## So we fill the null values with 0
nba_data['3P%'].fillna(0, inplace =True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_data['3P%'].fillna(0, inplace =True)


In [23]:
# Check the NULL values again
nba_data.isnull().sum()

League         0
Season         0
Season_Year    0
Stage          0
Player         0
Team           0
GP             0
MIN            0
FGM            0
FGA            0
3PM            0
3PA            0
FTM            0
FTA            0
TOV            0
PF             0
ORB            0
DRB            0
REB            0
AST            0
STL            0
BLK            0
PTS            0
birth_year     0
birth_month    0
birth_date     0
height         0
height_cm      0
weight         0
weight_kg      0
nationality    0
high_school    0
draft_round    0
draft_pick     0
draft_team     0
FG%            0
3P%            0
Age            0
PPG            0
RPG            0
SPG            0
BPG            0
APG            0
dtype: int64

# NBA Data FOR ARIMA

In [24]:
# This is the method how i only retain the 'Regular_Season' Stage

new_nba_data = nba_data[nba_data['Stage'] == 'Regular_Season'][['Season_Year','Stage','Player', 'PPG']]

In [25]:
#ALL is in regular season

new_nba_data

Unnamed: 0,Season_Year,Stage,Player,PPG
0,2000,Regular_Season,Shaquille O'Neal,29.670886
1,2000,Regular_Season,Vince Carter,25.695122
2,2000,Regular_Season,Karl Malone,25.548780
3,2000,Regular_Season,Allen Iverson,28.414286
4,2000,Regular_Season,Gary Payton,24.170732
...,...,...,...,...
51971,2020,Regular_Season,Matt Dellavedova,3.105263
51972,2020,Regular_Season,Goga Bitadze,3.185185
51973,2020,Regular_Season,Nassir Little,3.583333
51974,2020,Regular_Season,David Nwaba,5.200000


In [26]:
#This one i use new_nba_data that filter out the playoffs so left with reg season

arima_data = new_nba_data[['Season_Year', 'Player', 'PPG']]

In [27]:
arima_data

Unnamed: 0,Season_Year,Player,PPG
0,2000,Shaquille O'Neal,29.670886
1,2000,Vince Carter,25.695122
2,2000,Karl Malone,25.548780
3,2000,Allen Iverson,28.414286
4,2000,Gary Payton,24.170732
...,...,...,...
51971,2020,Matt Dellavedova,3.105263
51972,2020,Goga Bitadze,3.185185
51973,2020,Nassir Little,3.583333
51974,2020,David Nwaba,5.200000


In [28]:
# Create a list of players who have played from 2015 to 2020

players_2015_to_2020 = nba_data[nba_data['Season_Year'].between(2015, 2020)]['Player']

# Filter the original DataFrame to retain records of these players

filtered_arima_data = arima_data[nba_data['Player'].isin(players_2015_to_2020)]

  filtered_arima_data = arima_data[nba_data['Player'].isin(players_2015_to_2020)]


### Testing by filter out Stephen Curry

In [29]:
nba_data[nba_data['Player']=='Stephen Curry'][['Season_Year','Stage','PPG']]

Unnamed: 0,Season_Year,Stage,PPG
10589,2010,Regular_Season,17.4875
11876,2011,Regular_Season,18.554054
18723,2013,Regular_Season,22.897436
18953,2013,Playoffs,23.416667
22845,2014,Regular_Season,24.012821
23086,2014,Playoffs,23.0
26813,2015,Regular_Season,23.75
27018,2015,Playoffs,28.285714
31085,2016,Regular_Season,30.063291
31286,2016,Playoffs,25.111111


In [30]:
filtered_arima_data[filtered_arima_data['Player']=='Stephen Curry']

Unnamed: 0,Season_Year,Player,PPG
10589,2010,Stephen Curry,17.4875
11876,2011,Stephen Curry,18.554054
18723,2013,Stephen Curry,22.897436
22845,2014,Stephen Curry,24.012821
26813,2015,Stephen Curry,23.75
31085,2016,Stephen Curry,30.063291
35286,2017,Stephen Curry,25.303797
39967,2018,Stephen Curry,26.392157
45035,2019,Stephen Curry,27.26087


In [31]:
# Assuming you have already filtered the data for the specific player in filtered_nba_data
player_name = "Stephen Curry"
player_data = filtered_arima_data[filtered_arima_data['Player'] == player_name]

# Count the number of unique years played
years_played = player_data['Season_Year'].nunique()

# Calculate the average PPG for each year
ppg_data = player_data.groupby('Season_Year')['PPG'].mean()

# Display the results
print(f"{player_name} played in {years_played} unique years.")
print("PPG for each year:")
print(ppg_data)


Stephen Curry played in 9 unique years.
PPG for each year:
Season_Year
2010    17.487500
2011    18.554054
2013    22.897436
2014    24.012821
2015    23.750000
2016    30.063291
2017    25.303797
2018    26.392157
2019    27.260870
Name: PPG, dtype: float64


### Create new dataframe

In [32]:
# Define a list of all unique players in your dataset
unique_players = filtered_arima_data['Player'].unique()

# Create a dictionary to store each player's data
player_data_dict = {}

# Iterate through each player and store the player's data in the dictionary
for player in unique_players:
    player_data = filtered_arima_data[filtered_arima_data['Player'] == player]

    # Remove duplicate rows
    player_data = player_data.drop_duplicates(subset=['Season_Year', 'Player'])

    # Create a template DataFrame covering all years from 2000 to 2020 for the current player
    all_years_df = pd.DataFrame({'Season_Year': range(2000, 2021)})

    # Merge player data with the template DataFrame for the current player, filling in missing years with NaN
    player_data = pd.merge(all_years_df, player_data, on='Season_Year', how='left')

#     # Fill NaN values in the 'PPG' column with the calculated mean PPG
#     non_zero_ppg = player_data['PPG'].replace(0, pd.NA)  # Replace 0 with NaN
    mean_ppg = player_data['PPG'].mean()
    player_data['PPG'].fillna(mean_ppg, inplace=True)

    # Fill NaN values in the 'Player' column with the current player's name
    player_data['Player'].fillna(player, inplace=True)

    # Store the player's data in the dictionary
    player_data_dict[player] = player_data

# Combine all player data into one DataFrame
all_players_data = pd.concat(player_data_dict.values(), ignore_index=True)

In [33]:
all_players_data.head(62)

Unnamed: 0,Season_Year,Player,PPG
0,2000,Vince Carter,25.695122
1,2001,Vince Carter,27.600000
2,2002,Vince Carter,24.733333
3,2003,Vince Carter,16.539454
4,2004,Vince Carter,22.534247
...,...,...,...
57,2015,Glenn Robinson,16.307483
58,2016,Glenn Robinson,16.307483
59,2017,Glenn Robinson,6.072464
60,2018,Glenn Robinson,16.307483


In [34]:
# Create a list to store DataFrames for each year
yearly_datasets = []

# Iterate from 2000 to 2020
for year in range(2000, 2021):
    # Filter the data for the current year and append it to the list
    yearly_data = all_players_data[all_players_data['Season_Year'] == year]
    yearly_datasets.append(yearly_data)

# Concatenate the DataFrames to create a new dataset
final_arima_dataset = pd.concat(yearly_datasets)

In [35]:
# Set the maximum number of rows and columns to be displayed
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


final_arima_dataset

Unnamed: 0,Season_Year,Player,PPG
0,2000,Vince Carter,25.695122
21,2000,Tim Duncan,23.189189
42,2000,Glenn Robinson,20.901235
63,2000,Dirk Nowitzki,17.5
84,2000,Paul Pierce,19.547945
105,2000,Andre Miller,11.146341
126,2000,Jason Terry,8.111111
147,2000,Mike Miller,11.29872
168,2000,Hedo Turkoglu,11.05034
189,2000,Pau Gasol,17.251447


In [36]:
final_arima_dataset.isnull().sum()

Season_Year    0
Player         0
PPG            0
dtype: int64

In [37]:
final_arima_dataset['PPG'].fillna(0, inplace =True)

In [38]:
final_arima_dataset.isnull().sum()

Season_Year    0
Player         0
PPG            0
dtype: int64

## Arima model training

In [39]:
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')  # Ignore all warnings (not recommended for production code)

# Load your pre-sorted data
arima = pd.read_csv('/Users/wongyenchik/Desktop/Data Science Toolbox/final_arima_dataset.csv')

# Initialize an empty DataFrame to store predictions
predictions_df = pd.DataFrame(columns=['Player', 'Predicted_PPG'])

# Loop through each unique player and fit an ARIMA model
for player in arima['Player'].unique():
    player_data = arima[arima['Player'] == player]['PPG']

    # Fit an ARIMA model (you may need to find the appropriate order)
    p, d, q = 1, 1, 1  # Example order for the ARIMA model
    model = sm.tsa.ARIMA(player_data, order=(p, d, q))
    model_fit = model.fit()

    # Make a prediction for the 2021 season
    prediction_2021 = model_fit.get_forecast(steps=1).predicted_mean.values[0]

    # Append the prediction to the DataFrame
    predictions_df = predictions_df.append({'Year': 2021, 'Player': player, 'PPG': prediction_2021}, ignore_index=True)

# Sort the predictions in descending order to get the top 3 players for 2021
top_3_players_2021 = predictions_df.sort_values(by='PPG', ascending=False).head(3)

# Print the top 3 players for 2021
print(top_3_players_2021)
top_3_players_2021.to_csv('top3.csv', index=False)

                    Player Predicted_PPG    Year        PPG
131           James Harden           NaN  2021.0  34.234559
223           Bradley Beal           NaN  2021.0  33.209075
246  Giannis Antetokounmpo           NaN  2021.0  29.442726
