# Import Packages and Data

In [33]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import missingno

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [34]:
hof_file = 'Data/hof.csv'
nba_file = 'Data/ALL NBA Players.csv'
active_file = 'Data/Active Players.csv'


# Create the Dataframes


In [35]:
hof_df = pd.read_csv(hof_file)
hof_df = hof_df.rename(columns={'Name': 'PLAYER'})
hof_df

Unnamed: 0,PLAYER
0,Kareem Abdul-Jabbar
1,Ray Allen
2,Nate Archibald
3,Paul Arizin
4,Seimone Augustus
...,...
233,Dominique Wilkins
234,Lynette Woodard
235,John Wooden
236,James Worthy


In [36]:
nba_df = pd.read_csv(nba_file)
nba_df

Unnamed: 0,PLAYER,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFG%,TS%
0,Michael Jordan,1072,38.3,30.1,11.4,22.9,49.7,0.5,1.7,32.7,...,83.5,1.6,4.7,6.2,5.3,2.3,0.8,2.7,50.9,56.9
1,Wilt Chamberlain,1045,45.8,30.1,12.1,22.5,54.0,-,-,-,...,51.1,-,-,22.9,4.4,-,-,-,54.0,54.7
2,Luka Doncic,400,34.9,28.7,9.7,20.7,47.0,3,8.6,34.7,...,74.7,1,7.7,8.7,8.3,1.2,0.5,4,54.2,58.8
3,Joel Embiid,433,31.9,27.9,9.2,18.2,50.4,1.2,3.4,34.1,...,82.6,2.2,8.9,11.2,3.6,0.9,1.7,3.4,53.6,61.5
4,Elgin Baylor,846,40.0,27.4,10.3,23.8,43.1,-,-,-,...,78.0,-,-,13.5,4.3,-,-,-,43.1,49.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,Joel Anthony,490,14.4,2.2,0.8,1.6,51.3,0,0,0,...,66.2,1.1,1.6,2.8,0.2,0.3,1.1,0.5,51.3,55.8
1369,Mark Madsen,453,11.8,2.2,0.8,1.8,45.7,0,0,6.3,...,52.7,1.2,1.3,2.6,0.4,0.3,0.2,0.5,45.7,48.2
1370,Greg Dreiling,474,8.9,2.1,0.8,1.7,46.7,0,0,33.3,...,64.9,0.6,1.5,2.1,0.4,0.2,0.3,0.5,46.9,51.7
1371,DeSagana Diop,601,14.0,2.0,0.8,2.0,42.7,0,0,16.7,...,46.7,1.4,2.3,3.7,0.4,0.4,1,0.6,42.8,44.1


In [37]:
active_df = pd.read_csv(active_file)
active_df

Unnamed: 0,PLAYER,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFG%,TS%
0,Luka Doncic,400,34.9,28.7,9.7,20.7,47.0,3.0,8.6,34.7,...,74.7,1.0,7.7,8.7,8.3,1.2,0.5,4.0,54.2,58.8
1,Joel Embiid,433,31.9,27.9,9.2,18.2,50.4,1.2,3.4,34.1,...,82.6,2.2,8.9,11.2,3.6,0.9,1.7,3.4,53.6,61.5
2,Kevin Durant,1061,36.7,27.3,9.4,18.7,50.1,1.9,4.9,38.7,...,88.4,0.7,6.3,7.0,4.4,1.1,1.1,3.2,55.2,61.9
3,LeBron James,1492,37.9,27.1,9.9,19.6,50.6,1.6,4.6,34.8,...,73.6,1.2,6.3,7.5,7.4,1.5,0.7,3.5,54.7,58.9
4,Trae Young,407,34.1,25.5,8.1,18.6,43.6,2.6,7.3,35.5,...,87.3,0.6,2.9,3.6,9.5,1.0,0.2,4.2,50.6,58.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Pat Connaughton,554,19.6,6.1,2.2,5.1,43.6,1.2,3.3,35.8,...,77.3,0.7,2.8,3.5,1.4,0.5,0.3,0.6,55.2,56.8
153,Torrey Craig,432,19.9,6.0,2.3,5.1,45.3,0.9,2.6,35.3,...,70.3,1.3,2.7,4.0,1.1,0.5,0.6,0.7,54.4,55.9
154,Mike Muscala,548,15.0,5.9,2.1,4.6,45.1,0.9,2.5,37.3,...,83.0,0.8,2.3,3.1,0.8,0.3,0.5,0.6,55.3,58.3
155,Bismack Biyombo,839,19.5,5.1,2.0,3.7,53.5,0.0,0.0,0.0,...,55.3,1.9,4.0,5.9,0.7,0.3,1.3,0.9,53.5,55.3


# Clean and Process the Data


In [38]:
# Remove the values from nba df that exist in the active player df
nba_df = nba_df[~nba_df['PLAYER'].isin(active_df['PLAYER'])]

# Function for formatting names of players
def keep_alphanumeric(name):
    return re.sub(r'[^a-zA-Z0-9]', '', name)

# Apply the function to the 'PLAYER' column
nba_df['PLAYER'] = nba_df['PLAYER'].apply(keep_alphanumeric)
hof_df['PLAYER'] = hof_df['PLAYER'].apply(keep_alphanumeric)

nba_df['PLAYER'] = nba_df['PLAYER'].str.strip()
hof_df['PLAYER'] = hof_df['PLAYER'].str.strip()

# make all words lowercase
nba_df['PLAYER'] = nba_df['PLAYER'].str.lower()
hof_df['PLAYER'] = hof_df['PLAYER'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_df['PLAYER'] = nba_df['PLAYER'].apply(keep_alphanumeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_df['PLAYER'] = nba_df['PLAYER'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_df['PLAYER'] = nba_df['PLAYER'].str.lower()


In [39]:
# Dashes indicate the stat was not recorded, these are not blank values
nba_df = nba_df.replace('-', None)
nba_df

Unnamed: 0,PLAYER,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFG%,TS%
0,michaeljordan,1072,38.3,30.1,11.4,22.9,49.7,0.5,1.7,32.7,...,83.5,1.6,4.7,6.2,5.3,2.3,0.8,2.7,50.9,56.9
1,wiltchamberlain,1045,45.8,30.1,12.1,22.5,54.0,,,,...,51.1,,,22.9,4.4,,,,54.0,54.7
4,elginbaylor,846,40.0,27.4,10.3,23.8,43.1,,,,...,78.0,,,13.5,4.3,,,,43.1,49.4
7,jerrywest,932,39.2,27.0,9.7,20.4,47.4,,,,...,81.4,1,2.8,5.8,6.7,2.6,0.7,,47.4,55.0
8,alleniverson,914,41.1,26.7,9.3,21.8,42.5,1.2,3.7,31.3,...,78.0,0.8,2.9,3.7,6.2,2.2,0.2,3.6,45.2,51.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,joelanthony,490,14.4,2.2,0.8,1.6,51.3,0,0,0,...,66.2,1.1,1.6,2.8,0.2,0.3,1.1,0.5,51.3,55.8
1369,markmadsen,453,11.8,2.2,0.8,1.8,45.7,0,0,6.3,...,52.7,1.2,1.3,2.6,0.4,0.3,0.2,0.5,45.7,48.2
1370,gregdreiling,474,8.9,2.1,0.8,1.7,46.7,0,0,33.3,...,64.9,0.6,1.5,2.1,0.4,0.2,0.3,0.5,46.9,51.7
1371,desaganadiop,601,14.0,2.0,0.8,2.0,42.7,0,0,16.7,...,46.7,1.4,2.3,3.7,0.4,0.4,1,0.6,42.8,44.1


In [40]:
nba_df.isna().sum()

PLAYER      0
GP          0
MIN         0
PTS         0
FGM         0
FGA         0
FG%         0
3PM       212
3PA       212
3P%       212
FTM         0
FTA         0
FT%         0
OREB      119
DREB      119
REB         0
AST         0
STL       119
BLK       119
TOV       187
EFG%        0
TS%         0
dtype: int64

In [41]:
# You have to convert dtypes to int or float prior to applying mean to missing values 
nba_df['GP'] = nba_df['GP'].str.replace(',', '')

# Apply the mean of the column to the missing values
for column in nba_df.columns[1:]:
    nba_df[column] = nba_df[column].astype(float)
    nba_df[column] = nba_df[column].fillna(nba_df[column].mean().round(2), axis=0)

    # Convert percentages to decimals
percentages = ['FG%', '3P%', 'FT%','EFG%', 'TS%']
nba_df[percentages] = nba_df[percentages]/100
nba_df[percentages] = nba_df[percentages].round(2)

nba_df


Unnamed: 0,PLAYER,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFG%,TS%
0,michaeljordan,1072.0,38.3,30.1,11.4,22.9,0.50,0.50,1.7,0.33,...,0.84,1.6,4.70,6.2,5.3,2.30,0.8,2.70,0.51,0.57
1,wiltchamberlain,1045.0,45.8,30.1,12.1,22.5,0.54,0.42,1.2,0.25,...,0.51,1.3,3.09,22.9,4.4,0.84,0.5,1.56,0.54,0.55
4,elginbaylor,846.0,40.0,27.4,10.3,23.8,0.43,0.42,1.2,0.25,...,0.78,1.3,3.09,13.5,4.3,0.84,0.5,1.56,0.43,0.49
7,jerrywest,932.0,39.2,27.0,9.7,20.4,0.47,0.42,1.2,0.25,...,0.81,1.0,2.80,5.8,6.7,2.60,0.7,1.56,0.47,0.55
8,alleniverson,914.0,41.1,26.7,9.3,21.8,0.42,1.20,3.7,0.31,...,0.78,0.8,2.90,3.7,6.2,2.20,0.2,3.60,0.45,0.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,joelanthony,490.0,14.4,2.2,0.8,1.6,0.51,0.00,0.0,0.00,...,0.66,1.1,1.60,2.8,0.2,0.30,1.1,0.50,0.51,0.56
1369,markmadsen,453.0,11.8,2.2,0.8,1.8,0.46,0.00,0.0,0.06,...,0.53,1.2,1.30,2.6,0.4,0.30,0.2,0.50,0.46,0.48
1370,gregdreiling,474.0,8.9,2.1,0.8,1.7,0.47,0.00,0.0,0.33,...,0.65,0.6,1.50,2.1,0.4,0.20,0.3,0.50,0.47,0.52
1371,desaganadiop,601.0,14.0,2.0,0.8,2.0,0.43,0.00,0.0,0.17,...,0.47,1.4,2.30,3.7,0.4,0.40,1.0,0.60,0.43,0.44


In [42]:
hof_df = hof_df.rename(columns={'Name': 'PLAYER'})
hof_df['Hof'] = 1
hof_df['Hof'].value_counts()

Hof
1    238
Name: count, dtype: int64

In [43]:
hof_df

Unnamed: 0,PLAYER,Hof
0,kareemabduljabbar,1
1,rayallen,1
2,natearchibald,1
3,paularizin,1
4,seimoneaugustus,1
...,...,...
233,dominiquewilkins,1
234,lynettewoodard,1
235,johnwooden,1
236,jamesworthy,1


In [44]:
# remove any whitespace
nba_df['PLAYER'] = nba_df['PLAYER'].str.strip()
hof_df['PLAYER'] = hof_df['PLAYER'].str.strip()

# make all words lowercase
nba_df['PLAYER'] = nba_df['PLAYER'].str.lower()
hof_df['PLAYER'] = hof_df['PLAYER'].str.lower()

def keep_alphanumeric(name):
    return re.sub(r'[^a-zA-Z0-9]', '', name)

# Apply the function to the 'PLAYER' column
nba_df['PLAYER'] = nba_df['PLAYER'].apply(keep_alphanumeric)
hof_df['PLAYER'] = hof_df['PLAYER'].apply(keep_alphanumeric)

In [45]:
nba_df = nba_df.drop_duplicates(subset='PLAYER', keep='first')
nba_df['PLAYER'].value_counts()

PLAYER
michaeljordan    1
billymckinney    1
terrydehere      1
albianchi        1
kurtthomas       1
                ..
arnierisen       1
sergeibaka       1
natewilliams     1
vinniejohnson    1
michaelruffin    1
Name: count, Length: 1212, dtype: int64

In [46]:
hof_df = hof_df.drop_duplicates(subset='PLAYER', keep='first')
hof_df['Hof'].value_counts()

Hof
1    238
Name: count, dtype: int64

In [47]:
nba_df

Unnamed: 0,PLAYER,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFG%,TS%
0,michaeljordan,1072.0,38.3,30.1,11.4,22.9,0.50,0.50,1.7,0.33,...,0.84,1.6,4.70,6.2,5.3,2.30,0.8,2.70,0.51,0.57
1,wiltchamberlain,1045.0,45.8,30.1,12.1,22.5,0.54,0.42,1.2,0.25,...,0.51,1.3,3.09,22.9,4.4,0.84,0.5,1.56,0.54,0.55
4,elginbaylor,846.0,40.0,27.4,10.3,23.8,0.43,0.42,1.2,0.25,...,0.78,1.3,3.09,13.5,4.3,0.84,0.5,1.56,0.43,0.49
7,jerrywest,932.0,39.2,27.0,9.7,20.4,0.47,0.42,1.2,0.25,...,0.81,1.0,2.80,5.8,6.7,2.60,0.7,1.56,0.47,0.55
8,alleniverson,914.0,41.1,26.7,9.3,21.8,0.42,1.20,3.7,0.31,...,0.78,0.8,2.90,3.7,6.2,2.20,0.2,3.60,0.45,0.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,joelanthony,490.0,14.4,2.2,0.8,1.6,0.51,0.00,0.0,0.00,...,0.66,1.1,1.60,2.8,0.2,0.30,1.1,0.50,0.51,0.56
1369,markmadsen,453.0,11.8,2.2,0.8,1.8,0.46,0.00,0.0,0.06,...,0.53,1.2,1.30,2.6,0.4,0.30,0.2,0.50,0.46,0.48
1370,gregdreiling,474.0,8.9,2.1,0.8,1.7,0.47,0.00,0.0,0.33,...,0.65,0.6,1.50,2.1,0.4,0.20,0.3,0.50,0.47,0.52
1371,desaganadiop,601.0,14.0,2.0,0.8,2.0,0.43,0.00,0.0,0.17,...,0.47,1.4,2.30,3.7,0.4,0.40,1.0,0.60,0.43,0.44


In [48]:
merge_df = pd.merge(nba_df, hof_df, on='PLAYER', how='left')
merge_df = merge_df.fillna(0)

In [49]:
merge_df['Hof'].value_counts()

Hof
0.0    1072
1.0     140
Name: count, dtype: int64

In [50]:
merge_df.columns

Index(['PLAYER', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%',
       'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',
       'EFG%', 'TS%', 'Hof'],
      dtype='object')

In [51]:
merge_df

Unnamed: 0,PLAYER,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,OREB,DREB,REB,AST,STL,BLK,TOV,EFG%,TS%,Hof
0,michaeljordan,1072.0,38.3,30.1,11.4,22.9,0.50,0.50,1.7,0.33,...,1.6,4.70,6.2,5.3,2.30,0.8,2.70,0.51,0.57,1.0
1,wiltchamberlain,1045.0,45.8,30.1,12.1,22.5,0.54,0.42,1.2,0.25,...,1.3,3.09,22.9,4.4,0.84,0.5,1.56,0.54,0.55,1.0
2,elginbaylor,846.0,40.0,27.4,10.3,23.8,0.43,0.42,1.2,0.25,...,1.3,3.09,13.5,4.3,0.84,0.5,1.56,0.43,0.49,1.0
3,jerrywest,932.0,39.2,27.0,9.7,20.4,0.47,0.42,1.2,0.25,...,1.0,2.80,5.8,6.7,2.60,0.7,1.56,0.47,0.55,1.0
4,alleniverson,914.0,41.1,26.7,9.3,21.8,0.42,1.20,3.7,0.31,...,0.8,2.90,3.7,6.2,2.20,0.2,3.60,0.45,0.52,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1207,joelanthony,490.0,14.4,2.2,0.8,1.6,0.51,0.00,0.0,0.00,...,1.1,1.60,2.8,0.2,0.30,1.1,0.50,0.51,0.56,0.0
1208,markmadsen,453.0,11.8,2.2,0.8,1.8,0.46,0.00,0.0,0.06,...,1.2,1.30,2.6,0.4,0.30,0.2,0.50,0.46,0.48,0.0
1209,gregdreiling,474.0,8.9,2.1,0.8,1.7,0.47,0.00,0.0,0.33,...,0.6,1.50,2.1,0.4,0.20,0.3,0.50,0.47,0.52,0.0
1210,desaganadiop,601.0,14.0,2.0,0.8,2.0,0.43,0.00,0.0,0.17,...,1.4,2.30,3.7,0.4,0.40,1.0,0.60,0.43,0.44,0.0


In [52]:
y = merge_df['Hof']
X = merge_df.copy()
X = X.drop('Hof', axis=1)
X = X.drop(['PLAYER', 'GP'], axis = 1)


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, \
                                                    stratify=y, random_state=42)

In [54]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Get the predictions on test data
y_preds = gnb.predict(X_test)

In [55]:
print('Accuracy:', '%.3f' % accuracy_score(y_test, y_preds))
print('Precision:', '%.3f' % precision_score(y_test, y_preds))
print('Recall:', '%.3f' % recall_score(y_test, y_preds))
print('F1 Score:', '%.3f' % f1_score(y_test, y_preds))

Accuracy: 0.898
Precision: 0.534
Recall: 0.886
F1 Score: 0.667


In [56]:
active_df

Unnamed: 0,PLAYER,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFG%,TS%
0,Luka Doncic,400,34.9,28.7,9.7,20.7,47.0,3.0,8.6,34.7,...,74.7,1.0,7.7,8.7,8.3,1.2,0.5,4.0,54.2,58.8
1,Joel Embiid,433,31.9,27.9,9.2,18.2,50.4,1.2,3.4,34.1,...,82.6,2.2,8.9,11.2,3.6,0.9,1.7,3.4,53.6,61.5
2,Kevin Durant,1061,36.7,27.3,9.4,18.7,50.1,1.9,4.9,38.7,...,88.4,0.7,6.3,7.0,4.4,1.1,1.1,3.2,55.2,61.9
3,LeBron James,1492,37.9,27.1,9.9,19.6,50.6,1.6,4.6,34.8,...,73.6,1.2,6.3,7.5,7.4,1.5,0.7,3.5,54.7,58.9
4,Trae Young,407,34.1,25.5,8.1,18.6,43.6,2.6,7.3,35.5,...,87.3,0.6,2.9,3.6,9.5,1.0,0.2,4.2,50.6,58.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Pat Connaughton,554,19.6,6.1,2.2,5.1,43.6,1.2,3.3,35.8,...,77.3,0.7,2.8,3.5,1.4,0.5,0.3,0.6,55.2,56.8
153,Torrey Craig,432,19.9,6.0,2.3,5.1,45.3,0.9,2.6,35.3,...,70.3,1.3,2.7,4.0,1.1,0.5,0.6,0.7,54.4,55.9
154,Mike Muscala,548,15.0,5.9,2.1,4.6,45.1,0.9,2.5,37.3,...,83.0,0.8,2.3,3.1,0.8,0.3,0.5,0.6,55.3,58.3
155,Bismack Biyombo,839,19.5,5.1,2.0,3.7,53.5,0.0,0.0,0.0,...,55.3,1.9,4.0,5.9,0.7,0.3,1.3,0.9,53.5,55.3


In [57]:
# You have to convert dtypes to int or float prior to applying mean to missing values 
active_df['GP'] = active_df['GP'].str.replace(',', '')

# Apply the mean of the column to the missing values
for column in nba_df.columns[1:]:
    active_df[column] = active_df[column].astype(float)
    active_df[column] = active_df[column].fillna(nba_df[column].mean().round(2), axis=0)

active_df = active_df.replace('-', None)

# Convert percentages to decimals
percentages = ['FG%', '3P%', 'FT%','EFG%', 'TS%']
active_df[percentages] = active_df[percentages]/100
active_df[percentages] = active_df[percentages].round(2)

# remove any whitespace
active_df['PLAYER'] = active_df['PLAYER'].str.strip()

# make all words lowercase
active_df['PLAYER'] = active_df['PLAYER'].str.lower()


In [58]:
active_df

Unnamed: 0,PLAYER,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,EFG%,TS%
0,luka doncic,400.0,34.9,28.7,9.7,20.7,0.47,3.0,8.6,0.35,...,0.75,1.0,7.7,8.7,8.3,1.2,0.5,4.0,0.54,0.59
1,joel embiid,433.0,31.9,27.9,9.2,18.2,0.50,1.2,3.4,0.34,...,0.83,2.2,8.9,11.2,3.6,0.9,1.7,3.4,0.54,0.62
2,kevin durant,1061.0,36.7,27.3,9.4,18.7,0.50,1.9,4.9,0.39,...,0.88,0.7,6.3,7.0,4.4,1.1,1.1,3.2,0.55,0.62
3,lebron james,1492.0,37.9,27.1,9.9,19.6,0.51,1.6,4.6,0.35,...,0.74,1.2,6.3,7.5,7.4,1.5,0.7,3.5,0.55,0.59
4,trae young,407.0,34.1,25.5,8.1,18.6,0.44,2.6,7.3,0.36,...,0.87,0.6,2.9,3.6,9.5,1.0,0.2,4.2,0.51,0.58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,pat connaughton,554.0,19.6,6.1,2.2,5.1,0.44,1.2,3.3,0.36,...,0.77,0.7,2.8,3.5,1.4,0.5,0.3,0.6,0.55,0.57
153,torrey craig,432.0,19.9,6.0,2.3,5.1,0.45,0.9,2.6,0.35,...,0.70,1.3,2.7,4.0,1.1,0.5,0.6,0.7,0.54,0.56
154,mike muscala,548.0,15.0,5.9,2.1,4.6,0.45,0.9,2.5,0.37,...,0.83,0.8,2.3,3.1,0.8,0.3,0.5,0.6,0.55,0.58
155,bismack biyombo,839.0,19.5,5.1,2.0,3.7,0.54,0.0,0.0,0.00,...,0.55,1.9,4.0,5.9,0.7,0.3,1.3,0.9,0.54,0.55


In [59]:
# Make a copy of active_df for predictions
predict_df = active_df.copy()

# Drop unnecessary columns
predict_df.drop(columns=['PLAYER', 'GP'], inplace=True)

# Predicting probabilities for active players
prediction = gnb.predict(predict_df)
probability = gnb.predict_proba(predict_df)

# Create a final DataFrame and add relevant columns
final_df = active_df[['PLAYER', 'GP']].copy()  # Start with PLAYER and GP
final_df['Hof Prediction'] = prediction  # Add predictions
final_df['Probability'] = (probability[:, 1]* 100).round(2)  # Add the probability of making the Hall of Fame

# Keep only the last three columns
final_df = final_df.iloc[:, -4:]

# Display the final DataFrame
final_df = final_df.sort_values(by=['Probability','PLAYER'], ascending =False)
final_df



Unnamed: 0,PLAYER,GP,Hof Prediction,Probability
4,trae young,407.0,1.0,100.0
16,russell westbrook,1162.0,1.0,100.0
79,rudy gobert,757.0,1.0,100.0
39,nikola vucevic,899.0,1.0,100.0
19,nikola jokic,675.0,1.0,100.0
...,...,...,...,...
59,bogdan bogdanovic,449.0,0.0,0.0
90,bobby portis,605.0,0.0,0.0
155,bismack biyombo,839.0,0.0,0.0
142,alex len,644.0,0.0,0.0


In [61]:
final_df.to_html('Exports/hof_predictions.html')
final_df.to_csv('Exports/hof_predictions.csv')