In [1]:
import numpy as np
import pandas as pd
import torch

# Data exploration

In [2]:
df = pd.read_csv("data/CollegeBasketballPlayers2009-2021.csv", low_memory=False)
df

Unnamed: 0,player_name,team,conf,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,...,dgbpm,oreb,dreb,treb,ast,stl,blk,pts,Unnamed: 64,Unnamed: 65
0,DeAndrae Ross,South Alabama,SB,26,29.5,97.3,16.6,42.5,44.43,1.6,...,-1.941150,0.1923,0.6154,0.8077,1.1923,0.3462,0.0385,3.8846,,6.22026
1,Pooh Williams,Utah St.,WAC,34,60.9,108.3,14.9,52.4,54.48,3.8,...,-0.247934,0.6765,1.2647,1.9412,1.8235,0.4118,0.2353,5.9412,,3.94375
2,Jesus Verdejo,South Florida,BE,27,72.0,96.2,21.8,45.7,47.98,2.1,...,-0.883163,0.6296,2.3333,2.9630,1.9630,0.4815,0.0000,12.1852,,10.92680
3,Mike Hornbuckle,Pepperdine,WCC,30,44.5,97.7,16.0,53.6,53.69,4.1,...,-0.393459,0.7000,1.4333,2.1333,1.1000,0.5667,0.1333,4.9333,,6.77427
4,Anthony Brown,Pacific,BW,33,56.2,96.5,22.0,52.8,54.31,8.3,...,-0.668318,1.4242,3.3030,4.7273,0.8485,0.4545,0.3333,7.5758,,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61056,Trey Patterson,Villanova,BE,2,0.3,60.5,19.3,0.0,0.00,0.0,...,16.018200,0.0000,0.0000,0.0000,0.5000,0.5000,0.0000,0.0000,Pure PG,0.00000
61057,Stavros Polatoglou,Northwestern St.,Slnd,4,1.3,28.3,7.1,0.0,0.00,7.0,...,-4.993820,0.2500,0.0000,0.2500,0.0000,0.0000,0.0000,0.0000,C,0.00000
61058,Sandy Ryan,Tulane,Amer,1,0.1,0.0,0.0,0.0,0.00,0.0,...,-1.126810,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,PF/C,0.00000
61059,Ty Larson,Texas Tech,B12,1,0.1,0.0,0.0,0.0,0.00,0.0,...,-2.380750,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,PF/C,0.00000


In [3]:
df.columns

Index(['player_name', 'team', 'conf', 'GP', 'Min_per', 'Ortg', 'usg', 'eFG',
       'TS_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FTM', 'FTA',
       'FT_per', 'twoPM', 'twoPA', 'twoP_per', 'TPM', 'TPA', 'TP_per',
       'blk_per', 'stl_per', 'ftr', 'yr', 'ht', 'num', 'porpag', 'adjoe',
       'pfr', 'year', 'pid', 'type', 'Rec Rank', 'ast/tov', 'rimmade',
       'rimmade+rimmiss', 'midmade', 'midmade+midmiss',
       'rimmade/(rimmade+rimmiss)', 'midmade/(midmade+midmiss)', 'dunksmade',
       'dunksmiss+dunksmade', 'dunksmade/(dunksmade+dunksmiss)', 'pick',
       'drtg', 'adrtg', 'dporpag', 'stops', 'bpm', 'obpm', 'dbpm', 'gbpm',
       'mp', 'ogbpm', 'dgbpm', 'oreb', 'dreb', 'treb', 'ast', 'stl', 'blk',
       'pts', 'Unnamed: 64', 'Unnamed: 65'],
      dtype='object')

### Biranje atributa

U datasetu postoje 66 atributa za svakog igrača, što je previše. Redukcija prostora atributa će biti vršena u tri koraka:

1. Izbacivanje redundantnih atributa.  _Primer_: Odnos asistencija i izgubljenih lopti (`ast/tov`) je suvišan kada već postoje atributi za broj asistencije i broj izgubljenih lopti
2. Izbacivanje atributa na osnovu iskustvenog zananja. _Primer_: Nepotrebno je da pored škole za koju igra student koristimo kao atribut i konferenciju u kojoj je ta škola
3. Nad ostalim atributima će biti izračunata korelacija sa ciljanim izlazima modela, i biće izabrani najboljih 10

Ovim postupkom će model biti manji, i samim tim će mu biti smanjena mogućnost overfittovanja.

Cilj mreže je predviđanje da li će neki pik biti izabran na draftu. Svi odbirci koji imaju vrednost u `pick` koloni biće označeni kao pozitivni, dok će ostali biti označeni kao negativni

In [19]:
# Choose valuable features
clean_df = df[["GP", "Min_per", "usg", "eFG", "TS_per", "ORB_per", "DRB_per", "AST_per", 
               "TO_per", "FT_per", "twoP_per", "TP_per", "blk_per", "stl_per", "porpag", "adjoe", "adrtg", "bpm"]].copy()

# Calculate target based on if the player was picked in the draft
clean_df["Target"] = df["pick"].copy().fillna(0) > 0

In [22]:
# Describe features
clean_df.describe()

Unnamed: 0,GP,Min_per,usg,eFG,TS_per,ORB_per,DRB_per,AST_per,TO_per,FT_per,twoP_per,TP_per,blk_per,stl_per,porpag,adjoe,adrtg,bpm
count,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61061.0,61016.0,61016.0
mean,22.79776,37.12839,18.126341,44.500768,47.584212,5.54225,12.704242,10.808699,20.225856,0.577909,0.426696,0.227514,1.866607,1.71482,0.772884,89.137664,103.418621,-2.539234
std,10.166805,28.05805,6.253742,18.431761,17.640613,9.30561,10.755123,9.362704,12.318765,0.271861,0.195703,0.190639,5.703945,2.134472,1.333199,31.339883,10.957062,7.585037
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.79241,-39.0213,-1292.02,-102.186
25%,15.0,9.3,14.5,40.0,43.8,1.8,8.4,4.6,14.7,0.5,0.374,0.0,0.0,0.9,-0.17516,79.2384,98.013,-5.395683
50%,27.0,35.6,18.1,47.7,50.85,4.3,11.9,9.0,19.1,0.66,0.461,0.274,0.8,1.5,0.390887,94.6019,103.756,-1.823935
75%,31.0,62.0,21.8,53.3,56.11,8.1,16.1,15.0,24.4,0.756,0.526,0.357,2.4,2.2,1.48672,106.628,109.25725,1.407212
max,41.0,98.0,50.0,150.0,150.0,1576.6,1385.0,100.0,100.0,1.0,1.0,1.0,1072.0,128.3,8.65819,678.942,144.685,685.313


In [9]:
# Calculate percentage of True datapoints
value_cnts = clean_df["Target"].value_counts()
print(f"Dataset sadrži {len(clean_df)} odbiraka, od čega su {value_cnts[True] / len(clean_df) * 100:0.2f}% pozitivni.")

Dataset sadrži 61061 odbiraka, od čega su 2.35% pozitivni.


## Podela na trening i test

In [14]:
from sklearn.model_selection import train_test_split

# Do train/test split
train_df, test_df = train_test_split(clean_df, train_size=.7)

In [17]:
# We can normalize dataset here

# Definisanje modela

Model će se sastojati od duboke neuronske mreže, sa potpuno povezanim slojevima.
Ulaz u mrežu će biti prethodno izabrani atributi, a izlaz će sadžati jedan neuron čiji izlaz predstavlja uverenost modela da će ulazni igrač biti NBA pik.

#### Dataset klasa

Prvi korak predstavlja definisanje klase koja će dohvatati podatke u _batch_-evima tokom treniranja.

In [111]:
from torch.utils.data import Dataset, DataLoader

class NBA_Dataset(Dataset):
    """Class for holding data of NCAA players and if there were picked for the NBA."""

    def __init__(self, df: pd.DataFrame) -> None:
        """Initialize torch Dataset based on pandas Dataframe."""
        super().__init__()

        # Take all except the last column as inputs
        self.inputs = df.values[:, :-1].astype(float)

        # Cast Targets column to numpy
        self.targets = df["Target"].values.astype(int)

        # Save dataframe as part of the class
        self.df = df

        # Define dataset length
        self.len = len(df)

    def __getitem__(self, index) -> dict:
        """Return dict with information of datapoint at `index`"""
        # Get input
        input_row = torch.Tensor(self.inputs[index, :])

        # Get output
        target = torch.Tensor([self.targets[index]])

        return {
            "Inputs": input_row,
            "Targets": target,
        }

    def __len__(self):
        """Get dataset length."""
        return self.len

#### Definicija modela

**OVDE SE PISE O TOME STA SVE IMA U MODELU**

In [112]:
import torch.nn as nn

class NBA_Draft_Predictor(nn.Module):
    """Class for prediction NCAA player chances of being piked in NBA draft."""

    def __init__(self, layer_sizes: list, activation_function: nn.Module = None) -> None:
        """Initialize fully-connected model based on input parameters."""
        super().__init__()

        assert len(layer_sizes)

        # Check if activation function is not defined
        if activation_function is None:
            activation_function = nn.ReLU()

        # Define activation function
        self.activation = activation_function

        # Layers init
        self.fc_layers = []

        # Define layer sizes
        for in_size, out_size in zip(layer_sizes[:-1], layer_sizes[1:]):
            self.fc_layers.append(nn.Linear(in_size, out_size))

        # Define output layer
        self.fc_layers.append(nn.Linear(layer_sizes[-1], 1))

    def forward(self, X) -> torch.Tensor:
        """Define network behavior when called on data."""
        # Pass data trough fully-connected layers
        for layer in self.fc_layers:
            X = self.activation(layer(X))
        
        # Calculate output probability as sigmoid of output
        output_probability = torch.sigmoid(X)

        return output_probability
        

# Trening

In [113]:
dataset = NBA_Dataset(train_df)

train_parameters = {
    "batch_size": 32,
    "shuffle": True,
    "num_workers": 1,
}

training_loader = DataLoader(dataset, **train_parameters)

In [114]:
model = NBA_Draft_Predictor([18, 5])

model(dataset[0]["Inputs"])

# for batch in training_loader:
#     X = batch["Inputs"]
#     Y = batch["Targets"]
#     print(X)
#     print(model(X))
#     break

tensor([0.5000], grad_fn=<SigmoidBackward0>)