This is the main notebook that will generate predictions for new players with no targets. The process is as follows:
- Import formatted data for new draftees
- Import formatted data for old draftees for training
- Train a model (or multiple models) on the old data
- Generate predictions (or multiple predictions) for the new players
- Summarize these predictions in an easy to understand graphic / table

In [34]:
# Imports
import pandas as pd
import numpy as np
import pickle

import tensorflow as tf
from sklearn.metrics import pairwise

import preds_helper as helper

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
# Import old dataset for training
X, Y = helper.load_training_data()

# Import NBA players histories
NBAdata = helper.getNBAdata()

# Import new player data
draftees = helper.load_draftee_data()

In [3]:
# Define training variables
clusteringCols = ['FT%', '3P%', 'eFG%', 'ORB%', 'DRB%', 'AST%', 'TOV%', 'STL%', 'BLK%', 'USG%', 'OWS', 'DWS', 
                  'FTA', '3PA', 'PTS', 'PF', 'MP_per_PF', 'FTA_per_FGA', 'MP_per_3PA', 'PTS_per_FGA', 
                  'C', 'F', 'G', 'PPM', 'PPG', 'HEIGHT', 'WEIGHT']

x_cols = ['gamesPlayed', 'minutes', 'FT%', '3P%', 'SOS', 'PER', 'eFG%', 'ORB%', 'DRB%', 'AST%', 'TOV%', 
          'STL%', 'BLK%', 'USG%','OWS', 'DWS', 'FTA', 'FGA', 'MP', '3PA', 'PTS', 'PF', 'MP_per_PF', 'PPG', 
          'PPM','FTA_per_FGA', 'MP_per_3PA', 'PTS_per_FGA', "AST_per_TOV", 'ORtg', 'DRtg','awards','RSCI', 
          'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'SHUTTLE_RUN','THREE_QUARTER_SPRINT', 'STANDING_VERTICAL', 
          'MAX_VERTICAL','BENCH_PRESS', 'BODY_FAT', 'HAND_LENGTH', 'HAND_WIDTH', "didCombine", 
          'HEIGHT_W_SHOES', 'REACH', 'WEIGHT', 'WINGSPAN', 'C', 'F', 'G']
target = "WM"
allCols = list(dict.fromkeys(clusteringCols + x_cols)) # removes duplicates
draftOnlyCols = [col for col in allCols if col not in clusteringCols]

with open('../Model/cluster_scaler.pkl', 'rb') as f:
    c_scaler = pickle.load(f)
with open('../Model/draft_scaler.pkl', 'rb') as f:
    d_scaler = pickle.load(f)

scaledDraft, scaledNBA = draftees.copy(), NBAdata.copy()
scaledDraft[clusteringCols] = c_scaler.transform(scaledDraft[clusteringCols])
scaledDraft[draftOnlyCols] = d_scaler.transform(scaledDraft[draftOnlyCols])
scaledNBA[clusteringCols] = c_scaler.transform(scaledNBA[clusteringCols])

In [4]:
### Do the clustering
n_clusters =  3
with open('../Model/cluster_model.pkl', 'rb') as f:
    fittedCluster = pickle.load(f)
scaledNBA['label'] = fittedCluster.predict(scaledNBA[clusteringCols].values, sample_weight=None)
scaledDraft['label'] = fittedCluster.predict(scaledDraft[clusteringCols].values, sample_weight=None)

In [5]:
# Create a function that will get features for all draftees for each team
metricCols, metric, n_players = clusteringCols, "manhattan", 7
metric_function = pairwise.distance_metrics()[metric]

nba_features = ['dist_avg', "dist_std", "dist_dot_min", "dist_dot_WS", "min_dist", "label_count"]
# Closest we have to current raptors lineup
raptors = ['Pascal Siakam', 'Kyle Lowry', 'Norman Powell', 'Aron Baynes', "DeAndre' Bembry", "Stanley Johnson", 
        "Patrick McCaw"]
teamSize = len(raptors)

teamData = scaledNBA[scaledNBA.Player.isin(raptors)].groupby("Player").mean()
#teamData.loc["Kyle Lowry"]["label"] = 0
teamData['label'] = teamData['label'].astype(int)

def getTeamFeatures(draftee):
    
    preds = pd.DataFrame(columns=nba_features, index=[draftee['Player']])

    distances = [metric_function(draftee[metricCols].to_numpy().reshape(1,-1), 
                                 teamData.loc[player][metricCols].to_numpy().reshape(1,-1)).item()
                 for player in raptors]
    # Turn them into a feature vector
    preds["dist_avg"] = np.mean(distances)
    preds["dist_std"] = np.std(distances)
    preds["dist_dot_min"] = np.dot(distances, teamData["MP"].values)
    preds["dist_dot_WS"] = np.dot(distances, teamData["MP"].values)
    preds["min_dist"] = np.min(distances)
    labelPlayerCount = len(teamData[teamData['label']==draftee["label"]])
    preds["label_count"] = labelPlayerCount/teamSize

    return preds
        
# Create a dictionary containing {draftee name:team features df}
teamFeatures = []
for i in range(len(scaledDraft)):
    draftee = scaledDraft.iloc[i]
    drafteeName = draftee["Player"]
    teamFeatures.append(getTeamFeatures(draftee))
    #print("{}/{} completed!".format(i+1, len(scaledDraft)))

    
# Scale these features
teamFeatures = pd.concat(teamFeatures)
with open('../Model/team_features_scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

#df = pd.DataFrame(scaler.transform(teamFeatures[nba_features]), index=teamFeatures.index, columns=nba_features)
teamFeatures[nba_features] = scaler.transform(teamFeatures[nba_features])

In [6]:
# Change what our training subset looks like 
allCols = list(dict.fromkeys(allCols)) # removes duplicate
trainingCols = allCols + nba_features

# Add the actual features (for training the model) to the overall dataset
for col in nba_features: scaledDraft[col] = np.nan
for i in range(len(scaledDraft)):
    player = scaledDraft.iloc[i]["Player"]
    for c in nba_features:
        scaledDraft.iloc[i, scaledDraft.columns.get_loc(c)] = teamFeatures.loc[player][c]

In [57]:
# Do the actual loop to get predicted values
preds = pd.DataFrame(index=scaledDraft.Player)

itters = 100

# Train and test it multiple times to average results
for i in range(itters):
    model = helper.create_NN()
    X, Y = X.sample(frac=1), Y.sample(frac=1)

    # Convert the input to tensors
    #X_train = tf.convert_to_tensor(X[trainingCols], dtype=tf.float32)
    #Y_train = tf.convert_to_tensor(Y[target], dtype=tf.float32)

    #X_raptors = tf.convert_to_tensor(scaledDraft[trainingCols])

    # Train the model
    #model.fit(X_train, Y_train)
    model.fit(X.to_numpy(), Y.to_numpy())

    # Generate predictions
#     preds[f"{i}"] = model.predict(X_raptors)
    preds[f"{i}"] = model.predict(scaledDraft[trainingCols])
    print(f"{i}/{itters} Completed")

0/100 Completed
1/100 Completed
2/100 Completed
3/100 Completed
4/100 Completed
5/100 Completed
6/100 Completed
7/100 Completed
8/100 Completed
9/100 Completed
10/100 Completed
11/100 Completed
12/100 Completed
13/100 Completed
14/100 Completed
15/100 Completed
16/100 Completed
17/100 Completed
18/100 Completed
19/100 Completed
20/100 Completed
21/100 Completed
22/100 Completed
23/100 Completed
24/100 Completed
25/100 Completed
26/100 Completed
27/100 Completed
28/100 Completed
29/100 Completed
30/100 Completed
31/100 Completed
32/100 Completed
33/100 Completed
34/100 Completed
35/100 Completed
36/100 Completed
37/100 Completed
38/100 Completed
39/100 Completed
40/100 Completed
41/100 Completed
42/100 Completed
43/100 Completed
44/100 Completed
45/100 Completed
46/100 Completed
47/100 Completed
48/100 Completed
49/100 Completed
50/100 Completed
51/100 Completed
52/100 Completed
53/100 Completed
54/100 Completed
55/100 Completed
56/100 Completed
57/100 Completed
58/100 Completed
59/100 

In [58]:
results = pd.DataFrame(index=preds.index)
results['avg_pred'] = [np.mean(preds.loc[player]) for player in preds.index]

In [59]:
results.sort_values(by="avg_pred", ascending=False)

Unnamed: 0_level_0,avg_pred
Player,Unnamed: 1_level_1
matthewmayer,1.191207
joewieskamp,1.186065
ochaiagbaji,1.185991
yvespons,1.183238
tremann,1.183115
jerichosims,1.181601
kessleredwards,1.181369
aaronwiggins,1.173553
justinchampagnie,1.172272
davionmitchell,1.168521
