# Load dep and connection to postgres

In [1]:
# import dep
import pandas as pd
import matplotlib as plt
import numpy as np
from collections import Counter

import psycopg2
from sqlalchemy import create_engine
from config import postgres_pw

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split

In [2]:
# connect jupyter to postgres
db_string = f'postgresql://postgres:{postgres_pw}@127.0.0.1:5432/NBA_MVP'

In [3]:
# create database engine
engine = create_engine(db_string)

In [4]:
# Connect to postgres server
dbConnection = engine.connect()

In [5]:
# read game_df
game_df = pd.read_sql("select * from games", dbConnection)

In [6]:
# read season_df
season_df = pd.read_sql("select * from seasons", dbConnection)

In [7]:
# read scoreboard data into dataframes
scoreboard_df = pd.read_sql("select * from scoreboards", dbConnection)

In [8]:
scoreboard_df

Unnamed: 0,game_id,team_id,season_id,abb,city,player_id,player_name,status,time_played,fgm,...,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus
0,0021000003,1610612745,22010,HOU,Houston,2203,Shane Battier,PLA,00:31:24,1,...,2,4,6,4,0,2,2,3,3,9
1,0021000003,1610612745,22010,HOU,Houston,2449,Luis Scola,PLA,00:33:11,7,...,4,12,16,4,1,0,4,4,18,7
2,0021000003,1610612745,22010,HOU,Houston,2397,Yao Ming,PLA,00:23:21,4,...,4,7,11,2,0,2,4,6,9,-6
3,0021000003,1610612745,22010,HOU,Houston,2755,Kevin Martin,PLA,00:29:29,8,...,1,2,3,2,0,1,4,4,26,4
4,0021000003,1610612745,22010,HOU,Houston,201166,Aaron Brooks,PLA,00:41:39,7,...,1,2,3,9,0,0,2,0,24,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360546,0022101224,1610612750,22021,MIN,Minnesota,1630195,Leandro Bolmaro,PLA,00:17:54,4,...,1,2,3,4,0,0,0,1,11,15
360547,0022101224,1610612750,22021,MIN,Minnesota,1627774,Jake Layman,PLA,00:16:36,3,...,0,3,3,0,1,0,0,0,10,16
360548,0022101224,1610612750,22021,MIN,Minnesota,1630233,Nathan Knight,PLA,00:15:57,7,...,2,6,8,3,0,0,1,4,17,16
360549,0022101224,1610612750,22021,MIN,Minnesota,1630593,McKinley Wright IV,PLA,00:07:09,0,...,0,0,0,2,0,0,0,0,0,3


In [9]:
# time converter
def time_convert(x):
    h,m,s = map(int,x.split(':'))
    return (h*60+m)*60+s

In [10]:
# Convert time played into seconds
scoreboard_df['time_played'] = scoreboard_df['time_played'].astype(str).apply(time_convert)

In [11]:
# strip empty white space at end of ids
scoreboard_df['game_id'] = scoreboard_df['game_id'].str.strip()
scoreboard_df['team_id'] = scoreboard_df['team_id'].str.strip()
scoreboard_df['player_id'] = scoreboard_df['player_id'].str.strip()

# Split seasons and get avg stats of each player

In [12]:
# get list of seasons
seasons = scoreboard_df['season_id'].unique().tolist()
seasons

['22010',
 '22011',
 '22012',
 '22013',
 '22014',
 '22015',
 '22016',
 '22017',
 '22018',
 '22019',
 '22020',
 '22021']

In [13]:
# for loop to split each season
for season in seasons:
    if season == '22010':
        season22010 = scoreboard_df[scoreboard_df['season_id'] == '22010'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22011':
        season22011 = scoreboard_df[scoreboard_df['season_id'] == '22011'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22012':
        season22012 = scoreboard_df[scoreboard_df['season_id'] == '22012'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22013':
        season22013 = scoreboard_df[scoreboard_df['season_id'] == '22013'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22014':
        season22014 = scoreboard_df[scoreboard_df['season_id'] == '22014'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22015':
        season22015 = scoreboard_df[scoreboard_df['season_id'] == '22015'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22016':
        season22016 = scoreboard_df[scoreboard_df['season_id'] == '22016'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22017':
        season22017 = scoreboard_df[scoreboard_df['season_id'] == '22017'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22018':
        season22018 = scoreboard_df[scoreboard_df['season_id'] == '22018'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22019':
        season22019 = scoreboard_df[scoreboard_df['season_id'] == '22019'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22020':
        season22020 = scoreboard_df[scoreboard_df['season_id'] == '22020'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()
    elif season == '22021':
        season22021 = scoreboard_df[scoreboard_df['season_id'] == '22021'].groupby(['player_id', 'player_name', 'team_id', 'season_id']).mean()

In [14]:
# reset index of each table
season22010.reset_index(inplace = True)
season22011.reset_index(inplace = True)
season22012.reset_index(inplace = True)
season22013.reset_index(inplace = True)
season22014.reset_index(inplace = True)
season22015.reset_index(inplace = True)
season22016.reset_index(inplace = True)
season22017.reset_index(inplace = True)
season22018.reset_index(inplace = True)
season22019.reset_index(inplace = True)
season22020.reset_index(inplace = True)
season22021.reset_index(inplace = True)

In [15]:
season_df

Unnamed: 0,season_id,season_year,start_year,end_year,mvp,player_id
0,22010,2010-11,2010,2011,Derrick Rose,201565.0
1,22011,2011-12,2011,2012,LeBron James,2544.0
2,22012,2012-13,2012,2013,LeBron James,2544.0
3,22013,2013-14,2013,2014,Kevin Durant,201142.0
4,22014,2014-15,2014,2015,Stephen Curry,201939.0
5,22015,2015-16,2015,2016,Stephen Curry,201939.0
6,22016,2016-17,2016,2017,Russell Westbrook,201566.0
7,22017,2017-18,2017,2018,James Harden,201935.0
8,22018,2018-19,2018,2019,Giannis Antetokounmpo,203507.0
9,22019,2019-20,2019,2020,Giannis Antetokounmpo,203507.0


# Add mvp to each season

In [16]:
# season 22010
season22010['mvp'] = ' '
for index in season22010.index:
    if season22010['player_id'][index] == '201565':
        season22010['mvp'][index] = 0
    else:
        season22010['mvp'][index] = 1
    
# season 22011
season22011['mvp'] = ' '
for index in season22011.index:
    if season22011['player_id'][index] == '2544':
        season22011['mvp'][index] = 0
    else:
        season22011['mvp'][index] = 1

# season 22012
season22012['mvp'] = ' '
for index in season22012.index:
    if season22012['player_id'][index] == '2544':
        season22012['mvp'][index] = 0
    else:
        season22012['mvp'][index] = 1

# season 22013
season22013['mvp'] = ' '
for index in season22013.index:
    if season22013['player_id'][index] == '201142':
        season22013['mvp'][index] = 0
    else:
        season22013['mvp'][index] = 1

# season 22014
season22014['mvp'] = ' '
for index in season22014.index:
    if season22014['player_id'][index] == '201939':
        season22014['mvp'][index] = 0
    else:
        season22014['mvp'][index] = 1

# season 22015
season22015['mvp'] = ' '
for index in season22015.index:
    if season22015['player_id'][index] == '201939':
        season22015['mvp'][index] = 0
    else:
        season22015['mvp'][index] = 1

# season 22016
season22016['mvp'] = ' '
for index in season22016.index:
    if season22016['player_id'][index] == '201566':
        season22016['mvp'][index] = 0
    else:
        season22016['mvp'][index] = 1

# season 22017
season22017['mvp'] = ' '
for index in season22017.index:
    if season22017['player_id'][index] == '201935':
        season22017['mvp'][index] = 0
    else:
        season22017['mvp'][index] = 1

# season 22018
season22018['mvp'] = ' '
for index in season22018.index:
    if season22018['player_id'][index] == '203507':
        season22018['mvp'][index] = 0
    else:
        season22018['mvp'][index] = 1

# season 22019
season22019['mvp'] = ' '
for index in season22019.index:
    if season22019['player_id'][index] == '203507':
        season22019['mvp'][index] = 0
    else:
        season22019['mvp'][index] = 1
        
# season 22020
season22020['mvp'] = ' '
for index in season22020.index:
    if season22020['player_id'][index] == '203999':
        season22020['mvp'][index] = 0
    else:
        season22020['mvp'][index] = 1

# season 22021
season22021['mvp'] = ' '
for index in season22021.index:
    if season22021['player_id'][index] == '203999':
        season22021['mvp'][index] = 0
    else:
        season22021['mvp'][index] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the document

In [17]:
# Check if MVP was added correctly
season22010[season22010['player_id'] == '201565']

Unnamed: 0,player_id,player_name,team_id,season_id,time_played,fgm,fga,fg_pct,fg3m,fg3a,...,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp
185,201565,Derrick Rose,1610612741,22010,2241.382716,8.777778,19.716049,0.447617,1.580247,4.753086,...,3.074074,4.074074,7.691358,1.049383,0.62963,3.432099,1.679012,25.012346,6.17284,0


In [18]:
# Check if MVP was added correctly
season22010['mvp'].value_counts()

1    546
0      1
Name: mvp, dtype: int64

# Add team wins to each player

In [19]:
game_df

Unnamed: 0,game_id,team_id,season_id,team_name,abb,wl
0,0021000003,1610612745,22010,Houston Rockets,HOU,L
1,0021000003,1610612747,22010,Los Angeles Lakers,LAL,W
2,0021000001,1610612738,22010,Boston Celtics,BOS,W
3,0021000001,1610612748,22010,Miami Heat,MIA,L
4,0021000002,1610612757,22010,Portland Trail Blazers,POR,W
...,...,...,...,...,...,...
28391,0022101223,1610612763,22021,Memphis Grizzlies,MEM,L
28392,0022101228,1610612755,22021,Philadelphia 76ers,PHI,W
28393,0022101228,1610612765,22021,Detroit Pistons,DET,L
28394,0022101224,1610612741,22021,Chicago Bulls,CHI,W


In [20]:
# strip empty white space at end of ids
game_df['game_id'] = game_df['game_id'].str.strip()
game_df['team_id'] = game_df['team_id'].str.strip()
game_df['season_id'] = game_df['season_id'].str.strip()

In [21]:
# for loop to split each season
for season in seasons:
    if season == '22010':
        t_season22010 = pd.DataFrame(game_df[(game_df['season_id'] == '22010') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22011':
        t_season22011 = pd.DataFrame(game_df[(game_df['season_id'] == '22011') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22012':
        t_season22012 = pd.DataFrame(game_df[(game_df['season_id'] == '22012') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22013':
        t_season22013 = pd.DataFrame(game_df[(game_df['season_id'] == '22013') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22014':
        t_season22014 = pd.DataFrame(game_df[(game_df['season_id'] == '22014') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22015':
        t_season22015 = pd.DataFrame(game_df[(game_df['season_id'] == '22015') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22016':
        t_season22016 = pd.DataFrame(game_df[(game_df['season_id'] == '22016') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22017':
        t_season22017 = pd.DataFrame(game_df[(game_df['season_id'] == '22017') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22018':
        t_season22018 = pd.DataFrame(game_df[(game_df['season_id'] == '22018') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22019':
        t_season22019 = pd.DataFrame(game_df[(game_df['season_id'] == '22019') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22020':
        t_season22020 = pd.DataFrame(game_df[(game_df['season_id'] == '22020') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())
    elif season == '22021':
        t_season22021 = pd.DataFrame(game_df[(game_df['season_id'] == '22021') & (game_df['wl'] == 'W') ].groupby(['team_id', 'abb'])['wl'].count())

In [22]:
# reset index of each table
t_season22010.reset_index(inplace = True)
t_season22011.reset_index(inplace = True)
t_season22012.reset_index(inplace = True)
t_season22013.reset_index(inplace = True)
t_season22014.reset_index(inplace = True)
t_season22015.reset_index(inplace = True)
t_season22016.reset_index(inplace = True)
t_season22017.reset_index(inplace = True)
t_season22018.reset_index(inplace = True)
t_season22019.reset_index(inplace = True)
t_season22020.reset_index(inplace = True)
t_season22021.reset_index(inplace = True)

In [23]:
# Season 22010
season22010['w'] = ''
for x in t_season22010.index:
    for y in season22010.index:
        if season22010['team_id'][y] == t_season22010['team_id'][x]:
            season22010['w'][y] = t_season22010['wl'][x]
            
# Season 22011
season22011['w'] = ''
for x in t_season22011.index:
    for y in season22011.index:
        if season22011['team_id'][y] == t_season22011['team_id'][x]:
            season22011['w'][y] = t_season22011['wl'][x]
            
# Season 22012
season22012['w'] = ''
for x in t_season22012.index:
    for y in season22012.index:
        if season22012['team_id'][y] == t_season22012['team_id'][x]:
            season22012['w'][y] = t_season22012['wl'][x]

# Season 22013
season22013['w'] = ''
for x in t_season22013.index:
    for y in season22013.index:
        if season22013['team_id'][y] == t_season22013['team_id'][x]:
            season22013['w'][y] = t_season22013['wl'][x]

# Season 22014
season22014['w'] = ''
for x in t_season22014.index:
    for y in season22014.index:
        if season22014['team_id'][y] == t_season22014['team_id'][x]:
            season22014['w'][y] = t_season22014['wl'][x]

# Season 22015
season22015['w'] = ''
for x in t_season22015.index:
    for y in season22015.index:
        if season22015['team_id'][y] == t_season22015['team_id'][x]:
            season22015['w'][y] = t_season22015['wl'][x]

# Season 22016
season22016['w'] = ''
for x in t_season22016.index:
    for y in season22016.index:
        if season22016['team_id'][y] == t_season22016['team_id'][x]:
            season22016['w'][y] = t_season22016['wl'][x]

# Season 22017
season22017['w'] = ''
for x in t_season22017.index:
    for y in season22017.index:
        if season22017['team_id'][y] == t_season22017['team_id'][x]:
            season22017['w'][y] = t_season22017['wl'][x]

# Season 22018
season22018['w'] = ''
for x in t_season22018.index:
    for y in season22018.index:
        if season22018['team_id'][y] == t_season22018['team_id'][x]:
            season22018['w'][y] = t_season22018['wl'][x]
            
# Season 22019
season22019['w'] = ''
for x in t_season22019.index:
    for y in season22019.index:
        if season22019['team_id'][y] == t_season22019['team_id'][x]:
            season22019['w'][y] = t_season22019['wl'][x]
            
# Season 22020
season22020['w'] = ''
for x in t_season22020.index:
    for y in season22020.index:
        if season22020['team_id'][y] == t_season22020['team_id'][x]:
            season22020['w'][y] = t_season22020['wl'][x]
            
# Season 22021
season22021['w'] = ''
for x in t_season22021.index:
    for y in season22021.index:
        if season22021['team_id'][y] == t_season22021['team_id'][x]:
            season22021['w'][y] = t_season22021['wl'][x]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_

In [24]:
season22021[season22021['player_id'] == '201939']

Unnamed: 0,player_id,player_name,team_id,season_id,time_played,fgm,fga,fg_pct,fg3m,fg3a,...,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w
576,201939,Stephen Curry,1610612744,22021,2072.96875,8.359375,19.125,0.433672,4.453125,11.71875,...,5.234375,6.3125,1.328125,0.359375,3.21875,2.03125,25.46875,7.953125,1,53


# Add games played

In [25]:
# for loop to split each season
for season in seasons:
    if season == '22010':
        p_season22010 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22010') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22011':
        p_season22011 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22011') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22012':
        p_season22012 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22012') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22013':
        p_season22013 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22013') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22014':
        p_season22014 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22014') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22015':
        p_season22015 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22015') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22016':
        p_season22016 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22016') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22017':
        p_season22017 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22017') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22018':
        p_season22018 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22018') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22019':
        p_season22019 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22019') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22020':
        p_season22020 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22020') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())
    elif season == '22021':
        p_season22021 = pd.DataFrame(scoreboard_df[(scoreboard_df['season_id'] == '22021') & (scoreboard_df['status'] == 'PLA') ].groupby(['player_id', 'player_name'])['status'].count())

In [26]:
# reset index of each table
p_season22010.reset_index(inplace = True)
p_season22011.reset_index(inplace = True)
p_season22012.reset_index(inplace = True)
p_season22013.reset_index(inplace = True)
p_season22014.reset_index(inplace = True)
p_season22015.reset_index(inplace = True)
p_season22016.reset_index(inplace = True)
p_season22017.reset_index(inplace = True)
p_season22018.reset_index(inplace = True)
p_season22019.reset_index(inplace = True)
p_season22020.reset_index(inplace = True)
p_season22021.reset_index(inplace = True)

In [27]:
p_season22019[p_season22019['player_id'] == '203999']

Unnamed: 0,player_id,player_name,status
513,203999,Nikola Jokic,73


In [28]:
# Season 22010
season22010['played'] = ''
for x in p_season22010.index:
    for y in season22010.index:
        if season22010['player_id'][y] == p_season22010['player_id'][x]:
            season22010['played'][y] = p_season22010['status'][x]
            
# Season 22011
season22011['played'] = ''
for x in p_season22011.index:
    for y in season22011.index:
        if season22011['player_id'][y] == p_season22011['player_id'][x]:
            season22011['played'][y] = p_season22011['status'][x]
            
# Season 22012
season22012['played'] = ''
for x in p_season22012.index:
    for y in season22012.index:
        if season22012['player_id'][y] == p_season22012['player_id'][x]:
            season22012['played'][y] = p_season22012['status'][x]

# Season 22013
season22013['played'] = ''
for x in p_season22013.index:
    for y in season22013.index:
        if season22013['player_id'][y] == p_season22013['player_id'][x]:
            season22013['played'][y] = p_season22013['status'][x]

# Season 22014
season22014['played'] = ''
for x in p_season22014.index:
    for y in season22014.index:
        if season22014['player_id'][y] == p_season22014['player_id'][x]:
            season22014['played'][y] = p_season22014['status'][x]

# Season 22015
season22015['played'] = ''
for x in p_season22015.index:
    for y in season22015.index:
        if season22015['player_id'][y] == p_season22015['player_id'][x]:
            season22015['played'][y] = p_season22015['status'][x]

# Season 22016
season22016['played'] = ''
for x in p_season22016.index:
    for y in season22016.index:
        if season22016['player_id'][y] == p_season22016['player_id'][x]:
            season22016['played'][y] = p_season22016['status'][x]

# Season 22017
season22017['played'] = ''
for x in p_season22017.index:
    for y in season22017.index:
        if season22017['player_id'][y] == p_season22017['player_id'][x]:
            season22017['played'][y] = p_season22017['status'][x]

# Season 22018
season22018['played'] = ''
for x in p_season22018.index:
    for y in season22018.index:
        if season22018['player_id'][y] == p_season22018['player_id'][x]:
            season22018['played'][y] = p_season22018['status'][x]
            
# Season 22019
season22019['played'] = ''
for x in p_season22019.index:
    for y in season22019.index:
        if season22019['player_id'][y] == p_season22019['player_id'][x]:
            season22019['played'][y] = p_season22019['status'][x]
            
# Season 22020
season22020['played'] = ''
for x in p_season22020.index:
    for y in season22020.index:
        if season22020['player_id'][y] == p_season22020['player_id'][x]:
            season22020['played'][y] = p_season22020['status'][x]
            
# Season 22021
season22021['played'] = ''
for x in p_season22021.index:
    for y in season22021.index:
        if season22021['player_id'][y] == p_season22021['player_id'][x]:
            season22021['played'][y] = p_season22021['status'][x]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_

In [29]:
# Check 
season22020[season22020['player_id'] == '203999']

Unnamed: 0,player_id,player_name,team_id,season_id,time_played,fgm,fga,fg_pct,fg3m,fg3a,...,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played
618,203999,Nikola Jokic,1610612743,22020,2073.236111,10.166667,17.958333,0.576403,1.277778,3.291667,...,8.319444,1.319444,0.666667,3.083333,2.666667,26.361111,5.333333,0,47,72


In [30]:
# Check
season22015[season22015['player_id'] == '201939']

Unnamed: 0,player_id,player_name,team_id,season_id,time_played,fgm,fga,fg_pct,fg3m,fg3a,...,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played
181,201939,Stephen Curry,1610612744,22015,1975.609756,9.817073,19.487805,0.481134,4.902439,10.804878,...,6.426829,2.060976,0.182927,3.195122,1.963415,28.963415,12.463415,0,73,79


In [31]:
# append all frames into 1 large frame
dataframes = [season22010, season22011, season22012, season22013, season22014, season22015, season22016, season22017,
                season22018, season22019, season22020, season22021]

player_avg_all = pd.DataFrame()

for dataframe in dataframes:
    player_avg_all = player_avg_all.append(dataframe)

In [32]:
# Check to see if players didn't play
player_avg_all[player_avg_all['played'] == ''].head()

Unnamed: 0,player_id,player_name,team_id,season_id,time_played,fgm,fga,fg_pct,fg3m,fg3a,...,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played
134,201141,Greg Oden,1610612757,22010,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,48,
293,202077,Jerel McNeal,1610612740,22010,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,46,
328,202343,Elliot Williams,1610612757,22010,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,48,
358,202392,Marqus Blakely,1610612745,22010,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,43,
240,202067,Diamon Simpson,1610612745,22011,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,34,


In [33]:
# fill players that didn't play as 0
player_avg_all[player_avg_all['played'] ==''] = player_avg_all[player_avg_all['played'] ==''] = 0

In [36]:
# Round scores to 1 decimal
player_avg_all[['time_played', 'fgm', 'fga', 'fg3m', 'fg3a', 'ftm', 'fta', 'oreb', 'dreb',
               'reb', 'ast', 'stl', 'blk', 'turn_over', 'pf', 'pts',
               'plus_minus']] = player_avg_all[['time_played','fgm', 'fga', 'fg3m', 'fg3a', 'ftm', 'fta', 'oreb', 'dreb',
               'reb', 'ast', 'stl', 'blk', 'turn_over', 'pf', 'pts',
               'plus_minus']].round(decimals = 1)

In [37]:
# round % to 2 decminal
player_avg_all[['fg_pct', 'fg3_pct', 'ft_pct']] = player_avg_all[['fg_pct', 'fg3_pct', 'ft_pct']].round(decimals = 2)

In [38]:
player_avg_all.dtypes

player_id       object
player_name     object
team_id         object
season_id       object
time_played    float64
fgm            float64
fga            float64
fg_pct         float64
fg3m           float64
fg3a           float64
fg3_pct        float64
ftm            float64
fta            float64
ft_pct         float64
oreb           float64
dreb           float64
reb            float64
ast            float64
stl            float64
blk            float64
turn_over      float64
pf             float64
pts            float64
plus_minus     float64
mvp             object
w               object
played          object
dtype: object

In [39]:
# change datatypes
player_avg_all['mvp'] = player_avg_all['mvp'].astype(int)
player_avg_all['w'] = player_avg_all['w'].astype(int)
player_avg_all['played'] = player_avg_all['played'].astype(int)

In [40]:
player_avg_all[player_avg_all['played'] =='']

Unnamed: 0,player_id,player_name,team_id,season_id,time_played,fgm,fga,fg_pct,fg3m,fg3a,...,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played


In [41]:
# display all mvps
pd.options.display.max_columns = None
mvps = player_avg_all[player_avg_all['mvp'] == 1]
mvps

Unnamed: 0,player_id,player_name,team_id,season_id,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played
0,101106,Andrew Bogut,1610612749,22010,1837.9,4.9,9.9,0.42,0.0,0.1,0.00,1.3,2.9,0.34,2.7,6.9,9.6,1.7,0.6,2.2,1.7,2.9,11.1,0.1,1,35,65
1,101107,Marvin Williams,1610612737,22010,1575.7,3.5,7.6,0.41,0.5,1.5,0.24,2.1,2.5,0.51,1.0,3.5,4.4,1.2,0.5,0.3,0.9,1.5,9.5,-0.6,1,44,65
2,101108,Chris Paul,1610612740,22010,2133.1,5.3,11.5,0.45,0.9,2.3,0.35,4.2,4.7,0.79,0.5,3.6,4.0,9.7,2.3,0.1,2.2,2.4,15.7,3.2,1,46,80
3,101109,Raymond Felton,1610612743,22010,1808.2,4.0,9.3,0.41,1.3,2.8,0.37,1.7,2.7,0.49,0.4,3.0,3.4,6.2,1.3,0.0,2.0,1.7,11.0,7.4,1,50,75
4,101109,Raymond Felton,1610612752,22010,2304.8,6.3,14.8,0.42,1.6,5.0,0.30,2.9,3.4,0.78,0.7,2.9,3.6,9.0,1.8,0.2,3.3,2.1,17.1,-0.6,1,42,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
734,2546,Carmelo Anthony,1610612747,22021,1537.1,4.6,10.3,0.43,2.1,5.7,0.35,1.9,2.3,0.59,0.9,3.2,4.1,1.0,0.7,0.7,0.8,2.4,13.1,-1.3,1,33,69
735,2617,Udonis Haslem,1610612748,22021,65.7,0.2,0.4,0.08,0.0,0.1,0.01,0.0,0.0,0.01,0.1,0.3,0.3,0.1,0.0,0.0,0.1,0.2,0.4,-0.2,1,53,13
736,2730,Dwight Howard,1610612747,22021,756.4,1.7,2.8,0.43,0.1,0.2,0.08,1.2,1.9,0.38,1.5,3.1,4.6,0.5,0.4,0.5,0.6,1.5,4.8,-1.7,1,33,60
737,2738,Andre Iguodala,1610612744,22021,904.2,1.2,3.0,0.29,0.4,1.8,0.17,0.4,0.5,0.19,0.5,2.0,2.5,2.8,0.7,0.6,0.7,0.8,3.1,2.8,1,53,31


In [42]:
#pd.options.display.max_columns = 20

# Preprocess for ML models

In [43]:
player_avg_2010_to_2020 = player_avg_all[player_avg_all['season_id'] != '22021']

In [44]:
player_avg_all.columns

Index(['player_id', 'player_name', 'team_id', 'season_id', 'time_played',
       'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta',
       'ft_pct', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'turn_over', 'pf',
       'pts', 'plus_minus', 'mvp', 'w', 'played'],
      dtype='object')

In [45]:
# features

X = player_avg_2010_to_2020[['fgm', 'fga',
       'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb',
       'dreb', 'reb', 'ast', 'stl', 'blk', 'turn_over', 'pf', 'pts',
       'plus_minus', 'w', 'played']]

# targe

y= player_avg_2010_to_2020['mvp']

In [46]:
# split training data and target data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Counter(y_train)

Counter({1: 4676, 0: 55})

In [47]:
# Scale features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [48]:
# season 2021-2022
player_avg_22021 = player_avg_all[player_avg_all['season_id'] == '22021']
player_avg_22021.drop(columns =['team_id', 'season_id'], axis = 1, inplace = True)
player_avg_22021.set_index(['player_id', 'player_name'],inplace = True)
player_avg_22021

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
101108,Chris Paul,1915.5,5.4,11.0,0.48,0.9,3.0,0.28,2.5,3.0,0.66,0.3,3.9,4.2,10.5,1.8,0.3,2.3,2.0,14.3,6.9,1,64,65
101139,CJ Miles,58.5,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,-2.5,1,51,1
101150,Lou Williams,749.0,2.0,5.0,0.34,0.6,1.6,0.30,1.0,1.2,0.40,0.2,1.1,1.4,1.6,0.4,0.0,0.7,0.8,5.5,0.2,1,43,56
1626144,Emmanuel Mudiay,329.5,0.0,1.0,0.00,0.0,0.0,0.00,1.5,2.0,0.38,0.0,0.0,0.0,2.0,0.5,0.0,0.5,0.5,1.5,-3.5,1,30,2
1626145,Tyus Jones,1256.3,3.4,7.5,0.42,1.1,2.8,0.33,0.7,0.9,0.30,0.2,2.1,2.4,4.4,0.9,0.0,0.6,0.4,8.5,3.4,1,56,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2546,Carmelo Anthony,1537.1,4.6,10.3,0.43,2.1,5.7,0.35,1.9,2.3,0.59,0.9,3.2,4.1,1.0,0.7,0.7,0.8,2.4,13.1,-1.3,1,33,69
2617,Udonis Haslem,65.7,0.2,0.4,0.08,0.0,0.1,0.01,0.0,0.0,0.01,0.1,0.3,0.3,0.1,0.0,0.0,0.1,0.2,0.4,-0.2,1,53,13
2730,Dwight Howard,756.4,1.7,2.8,0.43,0.1,0.2,0.08,1.2,1.9,0.38,1.5,3.1,4.6,0.5,0.4,0.5,0.6,1.5,4.8,-1.7,1,33,60
2738,Andre Iguodala,904.2,1.2,3.0,0.29,0.4,1.8,0.17,0.4,0.5,0.19,0.5,2.0,2.5,2.8,0.7,0.6,0.7,0.8,3.1,2.8,1,53,31


In [49]:
player_avg_22021[player_avg_22021['mvp'] == 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74


In [50]:
# features

Xs = player_avg_22021[['fgm', 'fga',
       'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb',
       'dreb', 'reb', 'ast', 'stl', 'blk', 'turn_over', 'pf', 'pts',
       'plus_minus', 'w', 'played']]

# targe

yy= player_avg_22021['mvp']

In [51]:
# Scale features
scaled_22021 = X_scaler.transform(Xs)

# Random Oversampling

In [52]:
# Oversample
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({1: 4676, 0: 4676})

## Logistic Regression

In [53]:
# create model & fit model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [54]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,21,0
Actual NON,15,1542


Accuracy: 0.9904942965779467
                   pre       rec       spe        f1       geo       iba       sup

          0       0.58      1.00      0.99      0.74      1.00      0.99        21
          1       1.00      0.99      1.00      1.00      1.00      0.99      1557

avg / total       0.99      0.99      1.00      0.99      1.00      0.99      1578



In [55]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,6,718


Accuracy score: 0.9904942965779467
                   pre       rec       spe        f1       geo       iba       sup

          0       0.14      1.00      0.99      0.25      1.00      0.99         1
          1       1.00      0.99      1.00      1.00      1.00      0.99       724

avg / total       1.00      0.99      1.00      0.99      1.00      0.99       725



In [56]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1628369,Jayson Tatum,2156.0,9.3,20.6,0.45,3.0,8.6,0.33,5.3,6.2,0.83,1.1,6.9,8.0,4.4,1.0,0.6,2.9,2.3,26.9,8.8,1,51,76,0
1629027,Trae Young,2093.6,9.4,20.3,0.45,3.1,8.0,0.36,6.6,7.3,0.87,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2.1,1,43,76,0
201142,Kevin Durant,2233.5,10.5,20.3,0.53,2.1,5.5,0.39,6.8,7.4,0.9,0.5,6.9,7.4,6.4,0.9,0.9,3.5,2.1,29.9,4.9,1,44,55,0
201935,James Harden,2264.1,5.5,13.6,0.41,2.2,6.7,0.32,7.9,8.9,0.89,0.6,6.5,7.1,10.5,1.2,0.2,3.4,2.3,21.0,7.1,1,51,65,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203954,Joel Embiid,1968.4,9.5,19.1,0.48,1.3,3.6,0.36,9.3,11.5,0.79,2.1,9.3,11.4,4.1,1.1,1.4,3.1,2.6,29.7,5.3,1,51,68,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Support vector machine (SVM)

In [57]:
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [58]:
model.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [59]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,19,2
Actual NON,6,1551


Accuracy: 0.9949302915082383
                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.90      1.00      0.83      0.95      0.89        21
          1       1.00      1.00      0.90      1.00      0.95      0.91      1557

avg / total       1.00      0.99      0.91      1.00      0.95      0.91      1578



In [60]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,0,1
Actual NON,4,720


Accuracy score: 0.9949302915082383
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      0.99      0.00      0.00      0.00         1
          1       1.00      0.99      0.00      1.00      0.00      0.00       724

avg / total       1.00      0.99      0.00      1.00      0.00      0.00       725



In [61]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1629027,Trae Young,2093.6,9.4,20.3,0.45,3.1,8.0,0.36,6.6,7.3,0.87,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2.1,1,43,76,0
201142,Kevin Durant,2233.5,10.5,20.3,0.53,2.1,5.5,0.39,6.8,7.4,0.9,0.5,6.9,7.4,6.4,0.9,0.9,3.5,2.1,29.9,4.9,1,44,55,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203954,Joel Embiid,1968.4,9.5,19.1,0.48,1.3,3.6,0.36,9.3,11.5,0.79,2.1,9.3,11.4,4.1,1.1,1.4,3.1,2.6,29.7,5.3,1,51,68,0


## Decision Tree

In [62]:
from sklearn import tree
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_resampled, y_resampled)

In [63]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,19,2
Actual NON,4,1553


Accuracy: 0.9961977186311787
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.90      1.00      0.86      0.95      0.89        21
          1       1.00      1.00      0.90      1.00      0.95      0.91      1557

avg / total       1.00      1.00      0.91      1.00      0.95      0.91      1578



In [64]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,1,723


Accuracy score: 0.9961977186311787
                   pre       rec       spe        f1       geo       iba       sup

          0       0.50      1.00      1.00      0.67      1.00      1.00         1
          1       1.00      1.00      1.00      1.00      1.00      1.00       724

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       725



In [65]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1629029,Luka Doncic,2123.8,9.9,21.6,0.45,3.1,8.8,0.33,5.6,7.5,0.72,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,2.2,1,52,65,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Random Forest

In [66]:
from sklearn.ensemble import RandomForestClassifier
model = tree.DecisionTreeClassifier()
model = RandomForestClassifier(n_estimators=128, random_state=1) 
model = model.fit(X_resampled, y_resampled)

In [67]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,19,2
Actual NON,1,1556


Accuracy: 0.9980988593155894
                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      0.90      1.00      0.93      0.95      0.90        21
          1       1.00      1.00      0.90      1.00      0.95      0.91      1557

avg / total       1.00      1.00      0.91      1.00      0.95      0.91      1578



In [68]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,1,723


Accuracy score: 0.9980988593155894
                   pre       rec       spe        f1       geo       iba       sup

          0       0.50      1.00      1.00      0.67      1.00      1.00         1
          1       1.00      1.00      1.00      1.00      1.00      1.00       724

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       725



In [69]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Boosting

In [70]:
from sklearn.ensemble import GradientBoostingClassifier

learning_rates = [0.001, 0.01, 0.025, 0.05, 0.1, 0.25]

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
    learning_rate= learning_rate,
    max_features = 5,
    max_depth = 3,
    random_state = 0)
    classifier.fit(X_resampled, y_resampled)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.001
Accuracy score (training): 0.995
Accuracy score (validation): 0.994
Learning rate:  0.01
Accuracy score (training): 0.995
Accuracy score (validation): 0.994
Learning rate:  0.025
Accuracy score (training): 0.994
Accuracy score (validation): 0.994
Learning rate:  0.05
Accuracy score (training): 0.996
Accuracy score (validation): 0.994
Learning rate:  0.1
Accuracy score (training): 0.996
Accuracy score (validation): 0.995
Learning rate:  0.25
Accuracy score (training): 0.999
Accuracy score (validation): 0.996


In [71]:
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.01, max_features=5, max_depth=3, random_state=0)

classifier.fit(X_resampled, y_resampled)
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,19,2
Actual NON,1,1556


Accuracy: 0.9980988593155894
                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      0.90      1.00      0.93      0.95      0.90        21
          1       1.00      1.00      0.90      1.00      0.95      0.91      1557

avg / total       1.00      1.00      0.91      1.00      0.95      0.91      1578



In [72]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,1,723


Accuracy score: 0.9980988593155894
                   pre       rec       spe        f1       geo       iba       sup

          0       0.50      1.00      1.00      0.67      1.00      1.00         1
          1       1.00      1.00      1.00      1.00      1.00      1.00       724

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       725



In [73]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


# SMOTE

In [74]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({1: 4676, 0: 4676})

## Logistic Regression

In [75]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [76]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,19,2
Actual NON,16,1541


Accuracy: 0.9885931558935361
                   pre       rec       spe        f1       geo       iba       sup

          0       0.54      0.90      0.99      0.68      0.95      0.89        21
          1       1.00      0.99      0.90      0.99      0.95      0.90      1557

avg / total       0.99      0.99      0.91      0.99      0.95      0.90      1578



In [77]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,6,718


Accuracy score: 0.9885931558935361
                   pre       rec       spe        f1       geo       iba       sup

          0       0.14      1.00      0.99      0.25      1.00      0.99         1
          1       1.00      0.99      1.00      1.00      1.00      0.99       724

avg / total       1.00      0.99      1.00      0.99      1.00      0.99       725



In [78]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1628369,Jayson Tatum,2156.0,9.3,20.6,0.45,3.0,8.6,0.33,5.3,6.2,0.83,1.1,6.9,8.0,4.4,1.0,0.6,2.9,2.3,26.9,8.8,1,51,76,0
1629027,Trae Young,2093.6,9.4,20.3,0.45,3.1,8.0,0.36,6.6,7.3,0.87,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2.1,1,43,76,0
201142,Kevin Durant,2233.5,10.5,20.3,0.53,2.1,5.5,0.39,6.8,7.4,0.9,0.5,6.9,7.4,6.4,0.9,0.9,3.5,2.1,29.9,4.9,1,44,55,0
201935,James Harden,2264.1,5.5,13.6,0.41,2.2,6.7,0.32,7.9,8.9,0.89,0.6,6.5,7.1,10.5,1.2,0.2,3.4,2.3,21.0,7.1,1,51,65,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203954,Joel Embiid,1968.4,9.5,19.1,0.48,1.3,3.6,0.36,9.3,11.5,0.79,2.1,9.3,11.4,4.1,1.1,1.4,3.1,2.6,29.7,5.3,1,51,68,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Support vector machine (SVM)

In [79]:
model = SVC(kernel='linear')

In [80]:
model.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [81]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,19,2
Actual NON,6,1551


Accuracy: 0.9949302915082383
                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.90      1.00      0.83      0.95      0.89        21
          1       1.00      1.00      0.90      1.00      0.95      0.91      1557

avg / total       1.00      0.99      0.91      1.00      0.95      0.91      1578



In [82]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,0,1
Actual NON,4,720


Accuracy score: 0.9949302915082383
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      0.99      0.00      0.00      0.00         1
          1       1.00      0.99      0.00      1.00      0.00      0.00       724

avg / total       1.00      0.99      0.00      1.00      0.00      0.00       725



In [83]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1629027,Trae Young,2093.6,9.4,20.3,0.45,3.1,8.0,0.36,6.6,7.3,0.87,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2.1,1,43,76,0
201142,Kevin Durant,2233.5,10.5,20.3,0.53,2.1,5.5,0.39,6.8,7.4,0.9,0.5,6.9,7.4,6.4,0.9,0.9,3.5,2.1,29.9,4.9,1,44,55,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203954,Joel Embiid,1968.4,9.5,19.1,0.48,1.3,3.6,0.36,9.3,11.5,0.79,2.1,9.3,11.4,4.1,1.1,1.4,3.1,2.6,29.7,5.3,1,51,68,0


## Decision Tree

In [84]:
from sklearn import tree
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_resampled, y_resampled)

In [85]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,19,2
Actual NON,5,1552


Accuracy: 0.9955640050697085
                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      0.90      1.00      0.84      0.95      0.89        21
          1       1.00      1.00      0.90      1.00      0.95      0.91      1557

avg / total       1.00      1.00      0.91      1.00      0.95      0.91      1578



In [86]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,3,721


Accuracy score: 0.9955640050697085
                   pre       rec       spe        f1       geo       iba       sup

          0       0.25      1.00      1.00      0.40      1.00      1.00         1
          1       1.00      1.00      1.00      1.00      1.00      1.00       724

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       725



In [87]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1629029,Luka Doncic,2123.8,9.9,21.6,0.45,3.1,8.8,0.33,5.6,7.5,0.72,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,2.2,1,52,65,0
1629630,Ja Morant,1988.2,10.2,20.6,0.48,1.5,4.5,0.31,5.5,7.3,0.73,1.4,4.4,5.7,6.7,1.2,0.4,3.4,1.5,27.4,3.3,1,56,57,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Random Forest

In [88]:
from sklearn.ensemble import RandomForestClassifier
model = tree.DecisionTreeClassifier()
model = RandomForestClassifier(n_estimators=128, random_state=1) 
model = model.fit(X_resampled, y_resampled)

In [89]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,19,2
Actual NON,2,1555


Accuracy: 0.9974651457541192
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.90      1.00      0.90      0.95      0.90        21
          1       1.00      1.00      0.90      1.00      0.95      0.91      1557

avg / total       1.00      1.00      0.91      1.00      0.95      0.91      1578



In [90]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,2,722


Accuracy score: 0.9974651457541192
                   pre       rec       spe        f1       geo       iba       sup

          0       0.33      1.00      1.00      0.50      1.00      1.00         1
          1       1.00      1.00      1.00      1.00      1.00      1.00       724

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       725



In [91]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1629029,Luka Doncic,2123.8,9.9,21.6,0.45,3.1,8.8,0.33,5.6,7.5,0.72,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,2.2,1,52,65,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Boosting

In [92]:
from sklearn.ensemble import GradientBoostingClassifier

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
    learning_rate= learning_rate,
    max_features = 5,
    max_depth = 3,
    random_state = 0)
    classifier.fit(X_resampled, y_resampled)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.997
Accuracy score (validation): 0.994
Learning rate:  0.1
Accuracy score (training): 0.997
Accuracy score (validation): 0.994
Learning rate:  0.25
Accuracy score (training): 0.999
Accuracy score (validation): 0.996
Learning rate:  0.5
Accuracy score (training): 1.000
Accuracy score (validation): 0.997
Learning rate:  0.75
Accuracy score (training): 1.000
Accuracy score (validation): 0.997
Learning rate:  1
Accuracy score (training): 1.000
Accuracy score (validation): 0.996


In [93]:
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=1, max_features=5, max_depth=3, random_state=0)

classifier.fit(X_resampled, y_resampled)
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,19,2
Actual NON,2,1555


Accuracy: 0.9974651457541192
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.90      1.00      0.90      0.95      0.90        21
          1       1.00      1.00      0.90      1.00      0.95      0.91      1557

avg / total       1.00      1.00      0.91      1.00      0.95      0.91      1578



In [94]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,2,722


Accuracy score: 0.9974651457541192
                   pre       rec       spe        f1       geo       iba       sup

          0       0.33      1.00      1.00      0.50      1.00      1.00         1
          1       1.00      1.00      1.00      1.00      1.00      1.00       724

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       725



In [95]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1629029,Luka Doncic,2123.8,9.9,21.6,0.45,3.1,8.8,0.33,5.6,7.5,0.72,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,2.2,1,52,65,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


# SMOTEENN over+under

In [96]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

## Logistic Regression

In [97]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [98]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,21,0
Actual NON,14,1543


Accuracy: 0.991128010139417
                   pre       rec       spe        f1       geo       iba       sup

          0       0.60      1.00      0.99      0.75      1.00      0.99        21
          1       1.00      0.99      1.00      1.00      1.00      0.99      1557

avg / total       0.99      0.99      1.00      0.99      1.00      0.99      1578



In [99]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,7,717


Accuracy score: 0.991128010139417
                   pre       rec       spe        f1       geo       iba       sup

          0       0.12      1.00      0.99      0.22      1.00      0.99         1
          1       1.00      0.99      1.00      1.00      1.00      0.99       724

avg / total       1.00      0.99      1.00      0.99      1.00      0.99       725



In [100]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1628369,Jayson Tatum,2156.0,9.3,20.6,0.45,3.0,8.6,0.33,5.3,6.2,0.83,1.1,6.9,8.0,4.4,1.0,0.6,2.9,2.3,26.9,8.8,1,51,76,0
1629027,Trae Young,2093.6,9.4,20.3,0.45,3.1,8.0,0.36,6.6,7.3,0.87,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2.1,1,43,76,0
1629636,Darius Garland,2144.1,8.0,17.2,0.47,2.6,6.7,0.39,3.2,3.5,0.74,0.6,2.7,3.3,8.6,1.3,0.1,3.6,1.7,21.7,5.5,1,44,68,0
201142,Kevin Durant,2233.5,10.5,20.3,0.53,2.1,5.5,0.39,6.8,7.4,0.9,0.5,6.9,7.4,6.4,0.9,0.9,3.5,2.1,29.9,4.9,1,44,55,0
201935,James Harden,2264.1,5.5,13.6,0.41,2.2,6.7,0.32,7.9,8.9,0.89,0.6,6.5,7.1,10.5,1.2,0.2,3.4,2.3,21.0,7.1,1,51,65,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203954,Joel Embiid,1968.4,9.5,19.1,0.48,1.3,3.6,0.36,9.3,11.5,0.79,2.1,9.3,11.4,4.1,1.1,1.4,3.1,2.6,29.7,5.3,1,51,68,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Support vector machine (SVM)

In [101]:
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [102]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,21,0
Actual NON,8,1549


Accuracy: 0.9949302915082383
                   pre       rec       spe        f1       geo       iba       sup

          0       0.72      1.00      0.99      0.84      1.00      1.00        21
          1       1.00      0.99      1.00      1.00      1.00      0.99      1557

avg / total       1.00      0.99      1.00      1.00      1.00      0.99      1578



In [103]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,6,718


Accuracy score: 0.9949302915082383
                   pre       rec       spe        f1       geo       iba       sup

          0       0.14      1.00      0.99      0.25      1.00      0.99         1
          1       1.00      0.99      1.00      1.00      1.00      0.99       724

avg / total       1.00      0.99      1.00      0.99      1.00      0.99       725



In [104]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1628369,Jayson Tatum,2156.0,9.3,20.6,0.45,3.0,8.6,0.33,5.3,6.2,0.83,1.1,6.9,8.0,4.4,1.0,0.6,2.9,2.3,26.9,8.8,1,51,76,0
1629027,Trae Young,2093.6,9.4,20.3,0.45,3.1,8.0,0.36,6.6,7.3,0.87,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2.1,1,43,76,0
201142,Kevin Durant,2233.5,10.5,20.3,0.53,2.1,5.5,0.39,6.8,7.4,0.9,0.5,6.9,7.4,6.4,0.9,0.9,3.5,2.1,29.9,4.9,1,44,55,0
201935,James Harden,2264.1,5.5,13.6,0.41,2.2,6.7,0.32,7.9,8.9,0.89,0.6,6.5,7.1,10.5,1.2,0.2,3.4,2.3,21.0,7.1,1,51,65,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203954,Joel Embiid,1968.4,9.5,19.1,0.48,1.3,3.6,0.36,9.3,11.5,0.79,2.1,9.3,11.4,4.1,1.1,1.4,3.1,2.6,29.7,5.3,1,51,68,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Decision Tree

In [105]:
from sklearn import tree
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_resampled, y_resampled)

In [106]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,20,1
Actual NON,4,1553


Accuracy: 0.9968314321926489
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.95      1.00      0.89      0.97      0.95        21
          1       1.00      1.00      0.95      1.00      0.97      0.95      1557

avg / total       1.00      1.00      0.95      1.00      0.97      0.95      1578



In [107]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,6,718


Accuracy score: 0.9968314321926489
                   pre       rec       spe        f1       geo       iba       sup

          0       0.14      1.00      0.99      0.25      1.00      0.99         1
          1       1.00      0.99      1.00      1.00      1.00      0.99       724

avg / total       1.00      0.99      1.00      0.99      1.00      0.99       725



In [108]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1628369,Jayson Tatum,2156.0,9.3,20.6,0.45,3.0,8.6,0.33,5.3,6.2,0.83,1.1,6.9,8.0,4.4,1.0,0.6,2.9,2.3,26.9,8.8,1,51,76,0
1629029,Luka Doncic,2123.8,9.9,21.6,0.45,3.1,8.8,0.33,5.6,7.5,0.72,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,2.2,1,52,65,0
1629630,Ja Morant,1988.2,10.2,20.6,0.48,1.5,4.5,0.31,5.5,7.3,0.73,1.4,4.4,5.7,6.7,1.2,0.4,3.4,1.5,27.4,3.3,1,56,57,0
201939,Stephen Curry,2073.0,8.4,19.1,0.43,4.5,11.7,0.37,4.3,4.7,0.79,0.5,4.7,5.2,6.3,1.3,0.4,3.2,2.0,25.5,8.0,1,53,64,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203954,Joel Embiid,1968.4,9.5,19.1,0.48,1.3,3.6,0.36,9.3,11.5,0.79,2.1,9.3,11.4,4.1,1.1,1.4,3.1,2.6,29.7,5.3,1,51,68,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Random Forest

In [109]:
from sklearn.ensemble import RandomForestClassifier
model = tree.DecisionTreeClassifier()
model = RandomForestClassifier(n_estimators=128, random_state=1) 
model = model.fit(X_resampled, y_resampled)

In [110]:
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,20,1
Actual NON,4,1553


Accuracy: 0.9968314321926489
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.95      1.00      0.89      0.97      0.95        21
          1       1.00      1.00      0.95      1.00      0.97      0.95      1557

avg / total       1.00      1.00      0.95      1.00      0.97      0.95      1578



In [111]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,5,719


Accuracy score: 0.9968314321926489
                   pre       rec       spe        f1       geo       iba       sup

          0       0.17      1.00      0.99      0.29      1.00      0.99         1
          1       1.00      0.99      1.00      1.00      1.00      0.99       724

avg / total       1.00      0.99      1.00      1.00      1.00      0.99       725



In [112]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1628369,Jayson Tatum,2156.0,9.3,20.6,0.45,3.0,8.6,0.33,5.3,6.2,0.83,1.1,6.9,8.0,4.4,1.0,0.6,2.9,2.3,26.9,8.8,1,51,76,0
1629029,Luka Doncic,2123.8,9.9,21.6,0.45,3.1,8.8,0.33,5.6,7.5,0.72,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,2.2,1,52,65,0
1629630,Ja Morant,1988.2,10.2,20.6,0.48,1.5,4.5,0.31,5.5,7.3,0.73,1.4,4.4,5.7,6.7,1.2,0.4,3.4,1.5,27.4,3.3,1,56,57,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203954,Joel Embiid,1968.4,9.5,19.1,0.48,1.3,3.6,0.36,9.3,11.5,0.79,2.1,9.3,11.4,4.1,1.1,1.4,3.1,2.6,29.7,5.3,1,51,68,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0


## Boosting

In [113]:
from sklearn.ensemble import GradientBoostingClassifier

learning_rates = [0.001, 0.01, 0.025, 0.05, 0.1, 0.25]

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
    learning_rate= learning_rate,
    max_features = 5,
    max_depth = 3,
    random_state = 0)
    classifier.fit(X_resampled, y_resampled)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.001
Accuracy score (training): 0.993
Accuracy score (validation): 0.993
Learning rate:  0.01
Accuracy score (training): 0.995
Accuracy score (validation): 0.995
Learning rate:  0.025
Accuracy score (training): 0.994
Accuracy score (validation): 0.994
Learning rate:  0.05
Accuracy score (training): 0.995
Accuracy score (validation): 0.996
Learning rate:  0.1
Accuracy score (training): 0.995
Accuracy score (validation): 0.996
Learning rate:  0.25
Accuracy score (training): 0.997
Accuracy score (validation): 0.996


In [114]:
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.001, max_features=5, max_depth=3, random_state=0)

classifier.fit(X_resampled, y_resampled)
# predict and generate report
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)
print(f'Confusion Matrix')
display(cm_df)
print( f'Accuracy: {acc}')
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,20,1
Actual NON,4,1553


Accuracy: 0.9968314321926489
                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.95      1.00      0.89      0.97      0.95        21
          1       1.00      1.00      0.95      1.00      0.97      0.95      1557

avg / total       1.00      1.00      0.95      1.00      0.97      0.95      1578



In [115]:
# predict 2021-2022 MVP
y_pred_22021 = model.predict(scaled_22021)
cm = confusion_matrix(yy, y_pred_22021)
cm_df = pd.DataFrame(data = cm, index= ["Actual MVP", "Actual NON"], columns= ["Predict MVP", "Predict NON"])
acc = accuracy_score(y_test, y_pred)

print(f'Confusion matrix')
display(cm_df)
print(f'Accuracy score: {acc}')
print(classification_report_imbalanced(yy, y_pred_22021))

Confusion matrix


Unnamed: 0,Predict MVP,Predict NON
Actual MVP,1,0
Actual NON,5,719


Accuracy score: 0.9968314321926489
                   pre       rec       spe        f1       geo       iba       sup

          0       0.17      1.00      0.99      0.29      1.00      0.99         1
          1       1.00      0.99      1.00      1.00      1.00      0.99       724

avg / total       1.00      0.99      1.00      1.00      1.00      0.99       725



In [116]:
# display MVP for 2021-2022
player_avg_22021['predict'] = model.predict(scaled_22021)
player_avg_22021[player_avg_22021['predict'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,time_played,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,turn_over,pf,pts,plus_minus,mvp,w,played,predict
player_id,player_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1628369,Jayson Tatum,2156.0,9.3,20.6,0.45,3.0,8.6,0.33,5.3,6.2,0.83,1.1,6.9,8.0,4.4,1.0,0.6,2.9,2.3,26.9,8.8,1,51,76,0
1629029,Luka Doncic,2123.8,9.9,21.6,0.45,3.1,8.8,0.33,5.6,7.5,0.72,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,2.2,1,52,65,0
1629630,Ja Morant,1988.2,10.2,20.6,0.48,1.5,4.5,0.31,5.5,7.3,0.73,1.4,4.4,5.7,6.7,1.2,0.4,3.4,1.5,27.4,3.3,1,56,57,0
203507,Giannis Antetokounmpo,1944.9,10.1,18.3,0.54,1.0,3.6,0.27,8.1,11.3,0.72,2.0,9.5,11.4,5.7,1.1,1.3,3.2,3.1,29.4,5.8,1,51,67,0
203954,Joel Embiid,1968.4,9.5,19.1,0.48,1.3,3.6,0.36,9.3,11.5,0.79,2.1,9.3,11.4,4.1,1.1,1.4,3.1,2.6,29.7,5.3,1,51,68,0
203999,Nikola Jokic,1980.5,10.2,17.5,0.59,1.3,3.8,0.32,5.1,6.2,0.8,2.7,10.8,13.6,7.8,1.5,0.8,3.7,2.5,26.7,5.9,0,48,74,0
