### 1. Import Packages

In [3]:
# Install personal custom functions
!pip install -i https://test.pypi.org/simple/ my_krml_ratana

Looking in indexes: https://test.pypi.org/simple/
Collecting my_krml_ratana
  Using cached https://test-files.pythonhosted.org/packages/65/fe/81a5ac8dd010e1e813fc75d811c8a9d58ca7277460e4ac27c68a1aba824b/my_krml_ratana-2024.0.1.7-py3-none-any.whl.metadata (1.1 kB)
Using cached https://test-files.pythonhosted.org/packages/65/fe/81a5ac8dd010e1e813fc75d811c8a9d58ca7277460e4ac27c68a1aba824b/my_krml_ratana-2024.0.1.7-py3-none-any.whl (11 kB)
Installing collected packages: my_krml_ratana
Successfully installed my_krml_ratana-2024.0.1.7


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from joblib import dump

### 2. Load Dataset

In [22]:
df_train = pd.read_csv('../data/interim/train_cleaned_1.csv')

In [23]:
df_test = pd.read_csv('../data/interim/test_cleaned_1.csv')

In [24]:
from my_krml_ratana.data.sets import pop_target

In [25]:
# Select the target variable
df_train, target = pop_target(df_train, 'drafted')

In [26]:
target

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
46216    0.0
46217    0.0
46218    0.0
46219    0.0
46220    0.0
Name: drafted, Length: 46221, dtype: float64

In [27]:
target.sum()

np.float64(444.0)

In [28]:
df_train

Unnamed: 0,team,conf,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,DRB_per,...,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts,player_id
0,South Alabama,SB,26,29.5,97.3,16.6,42.5,44.43,1.6,4.6,...,-2.781990,-1.941150,0.1923,0.6154,0.8077,1.1923,0.3462,0.0385,3.8846,b2716b2d-3422-4959-9fe7-fe496414a4e8
1,Utah St.,WAC,34,60.9,108.3,14.9,52.4,54.48,3.8,6.3,...,-0.052263,-0.247934,0.6765,1.2647,1.9412,1.8235,0.4118,0.2353,5.9412,bb387960-e3a0-4ffd-96ae-184bad07cb8a
2,South Florida,BE,27,72.0,96.2,21.8,45.7,47.98,2.1,8.0,...,1.548230,-0.883163,0.6296,2.3333,2.9630,1.9630,0.4815,0.0000,12.1852,89bbdf11-eadd-4de4-95e5-03f7e2874aa1
3,Pepperdine,WCC,30,44.5,97.7,16.0,53.6,53.69,4.1,9.4,...,-0.342775,-0.393459,0.7000,1.4333,2.1333,1.1000,0.5667,0.1333,4.9333,0c87ada8-8446-4ea2-a05b-bd07aed5f37a
4,Pacific,BW,33,56.2,96.5,22.0,52.8,54.31,8.3,18.6,...,-1.684860,-0.668318,1.4242,3.3030,4.7273,0.8485,0.4545,0.3333,7.5758,84661e00-ad20-4308-817e-efa8da52a86c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46216,Southern Illinois,MVC,1,0.1,0.0,20.0,0.0,0.00,0.0,0.0,...,-31.182600,-21.537200,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,7da0b8f8-3582-496b-9a93-a4e76e073650
46217,Texas Southern,SWAC,3,0.3,20.3,36.1,0.0,0.00,0.0,26.3,...,-51.274400,-13.655700,0.0000,0.3333,0.3333,0.3333,0.3333,0.0000,0.0000,523630b3-3155-47a5-b214-299e8117913d
46218,Western Illinois,Sum,1,0.3,0.0,0.0,0.0,0.00,0.0,0.0,...,-5.526260,-6.035070,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,9818a0c6-2ceb-4e55-9483-76c177322f3c
46219,Northern Illinois,MAC,1,0.1,54.0,20.0,0.0,0.00,0.0,0.0,...,2.591320,-12.849200,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,3237ee2e-9cf8-4d6b-a348-b5619804afd7


### 3. Data pre-precessing

In [29]:
# Remove player_id and team
df_train = df_train.drop(['player_id', 'team', 'type'], axis =1)


In [30]:
# Select numerical columns and categorical columns
num_col = list(df_train.select_dtypes('number').columns)
cat_col = list(set(df_train.columns) - set(num_col))

In [31]:
df_train[num_col]

Unnamed: 0,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,DRB_per,AST_per,TO_per,...,mp,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts
0,26,29.5,97.3,16.6,42.5,44.43,1.6,4.6,15.8,16.3,...,14.5769,-2.781990,-1.941150,0.1923,0.6154,0.8077,1.1923,0.3462,0.0385,3.8846
1,34,60.9,108.3,14.9,52.4,54.48,3.8,6.3,13.6,19.8,...,24.5294,-0.052263,-0.247934,0.6765,1.2647,1.9412,1.8235,0.4118,0.2353,5.9412
2,27,72.0,96.2,21.8,45.7,47.98,2.1,8.0,14.7,15.9,...,33.1852,1.548230,-0.883163,0.6296,2.3333,2.9630,1.9630,0.4815,0.0000,12.1852
3,30,44.5,97.7,16.0,53.6,53.69,4.1,9.4,13.7,23.8,...,17.9667,-0.342775,-0.393459,0.7000,1.4333,2.1333,1.1000,0.5667,0.1333,4.9333
4,33,56.2,96.5,22.0,52.8,54.31,8.3,18.6,8.2,22.7,...,22.9091,-1.684860,-0.668318,1.4242,3.3030,4.7273,0.8485,0.4545,0.3333,7.5758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46216,1,0.1,0.0,20.0,0.0,0.00,0.0,0.0,0.0,0.0,...,1.0000,-31.182600,-21.537200,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
46217,3,0.3,20.3,36.1,0.0,0.00,0.0,26.3,39.2,38.2,...,1.3333,-51.274400,-13.655700,0.0000,0.3333,0.3333,0.3333,0.3333,0.0000,0.0000
46218,1,0.3,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,3.0000,-5.526260,-6.035070,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
46219,1,0.1,54.0,20.0,0.0,0.00,0.0,0.0,100.0,0.0,...,1.0000,2.591320,-12.849200,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000


In [32]:
df_train[cat_col]

Unnamed: 0,yr,conf
0,So,SB
1,So,WAC
2,Sr,BE
3,Sr,WCC
4,Sr,BW
...,...,...
46216,Sr,MVC
46217,So,SWAC
46218,Jr,Sum
46219,So,MAC


#### [3.1] One-Hot encode Categorical variablies  

In [33]:
# One hot encode
ohe = OneHotEncoder(sparse_output=False, drop='first')

In [34]:
features_train = ohe.fit_transform(df_train[cat_col])

In [35]:
features_train = pd.DataFrame(features_train, columns=ohe.get_feature_names_out())

#### [3.2] Scaling Numerical variables

In [36]:
# Initiate StandardScaler
scaler = StandardScaler()

In [37]:
features_train[num_col] = scaler.fit_transform(df_train[num_col])

In [38]:
# Check the features
features_train

Unnamed: 0,yr_Fr,yr_Jr,yr_So,yr_Sr,conf_ACC,conf_AE,conf_ASun,conf_Amer,conf_B10,conf_B12,...,mp,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.233776,-0.121603,-0.461089,-0.793304,-0.863773,-0.885819,0.100742,-0.393791,-0.583150,-0.381906
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.705198,0.335939,0.061273,-0.143705,-0.420041,-0.344611,0.639428,-0.254064,-0.118649,0.034399
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.521834,0.604205,-0.134697,-0.206625,0.310241,0.143264,0.758482,-0.105604,-0.674020,1.298333
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.086037,0.287245,0.016378,-0.112177,-0.304819,-0.252889,0.021971,0.075871,-0.359396,-0.169624
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.552330,0.062292,-0.068417,0.859405,0.972935,0.985658,-0.192667,-0.163113,0.112657,0.365281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46216,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.514696,-4.881959,-6.506523,-1.051293,-1.284338,-1.271468,-0.916804,-1.131191,-0.674020,-1.168241
46217,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.483251,-8.249638,-4.075059,-1.051293,-1.056560,-1.112329,-0.632355,-0.421267,-0.674020,-1.168241
46218,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.326005,-0.581583,-1.724074,-1.051293,-1.284338,-1.271468,-0.916804,-1.131191,-0.674020,-1.168241
46219,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.514696,0.779042,-3.826252,-1.051293,-1.284338,-1.271468,-0.063373,-1.131191,-0.674020,-1.168241


In [39]:
# Assign X_train
X_train = features_train

In [40]:
# Assign y_train
y_train = target

In [41]:
y_train.shape

(46221,)

In [42]:
# Check the length of df_test
len(df_test)

4970

In [43]:
# Apply transformation to testing dataset
X_test = ohe.fit_transform(df_test[cat_col])

In [44]:
X_test = pd.DataFrame(X_test, columns=ohe.get_feature_names_out())

In [45]:
X_test[num_col] = scaler.fit_transform(df_test[num_col])

In [46]:
X_test

Unnamed: 0,yr_Fr,yr_Jr,yr_So,yr_Sr,conf_ACC,conf_AE,conf_ASun,conf_Amer,conf_B10,conf_B12,...,mp,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.622727,-0.047246,-0.446385,-1.019688,-1.057036,-1.117648,-0.927909,-1.114426,-0.684429,-0.973761
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.476229,-0.792039,-0.991545,-0.620751,-0.325595,-0.445316,-0.533705,-0.731520,-0.684429,-0.694412
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.641133,-0.114986,0.572453,0.930816,1.624937,1.508257,-0.060574,0.289774,4.280789,0.642372
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.353963,0.204599,0.379729,-1.019688,-1.088089,-1.140606,-0.927909,-1.114426,-0.684429,-1.032290
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.536830,0.747086,-0.125785,0.506863,0.597579,0.608856,0.090255,0.625501,-0.455738,1.751779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4965,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.428439,0.635920,4.220760,-1.019688,-1.274472,-1.278400,-0.494242,-0.061329,-0.684429,-1.178623
4966,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.215163,-0.754630,-1.139845,-0.653959,-1.274472,-1.157824,-0.927909,-1.114426,-0.684429,-1.178623
4967,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.475834,-0.203862,-0.153290,-1.019688,-1.274472,-1.278400,-0.927909,-1.114426,-0.684429,-1.178623
4968,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-1.475834,0.040420,-0.473196,-1.019688,-1.274472,-1.278400,-0.927909,-1.114426,-0.684429,-1.178623


In [47]:
X_train.shape

(46221, 95)

In [48]:
X_test.shape

(4970, 90)

In [49]:
X_train

Unnamed: 0,yr_Fr,yr_Jr,yr_So,yr_Sr,conf_ACC,conf_AE,conf_ASun,conf_Amer,conf_B10,conf_B12,...,mp,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.233776,-0.121603,-0.461089,-0.793304,-0.863773,-0.885819,0.100742,-0.393791,-0.583150,-0.381906
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.705198,0.335939,0.061273,-0.143705,-0.420041,-0.344611,0.639428,-0.254064,-0.118649,0.034399
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.521834,0.604205,-0.134697,-0.206625,0.310241,0.143264,0.758482,-0.105604,-0.674020,1.298333
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.086037,0.287245,0.016378,-0.112177,-0.304819,-0.252889,0.021971,0.075871,-0.359396,-0.169624
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.552330,0.062292,-0.068417,0.859405,0.972935,0.985658,-0.192667,-0.163113,0.112657,0.365281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46216,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.514696,-4.881959,-6.506523,-1.051293,-1.284338,-1.271468,-0.916804,-1.131191,-0.674020,-1.168241
46217,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.483251,-8.249638,-4.075059,-1.051293,-1.056560,-1.112329,-0.632355,-0.421267,-0.674020,-1.168241
46218,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.326005,-0.581583,-1.724074,-1.051293,-1.284338,-1.271468,-0.916804,-1.131191,-0.674020,-1.168241
46219,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.514696,0.779042,-3.826252,-1.051293,-1.284338,-1.271468,-0.063373,-1.131191,-0.674020,-1.168241


#### [3.3] Ensure same dimension for for testing and training sets

In [53]:
missing_in_test = set(features_train) - set(X_test.columns)
missing_in_train = set(X_test.columns) - set(features_train)

In [54]:
print("Missing in test set:", missing_in_test)

Missing in test set: {'conf_GWC', 'conf_Ind', 'conf_ind', 'conf_P10', 'conf_Ivy'}


In [55]:
print("Extra in test set:", missing_in_train)

Extra in test set: set()


In [56]:
# We need to ensure the same feature dimension for train and test set
#Fill in missing feature from train with 0
for feature in features_train:
    if feature not in X_test.columns:
        X_test[feature] = 0

In [57]:
# Drop these extra columns from the test set
extra_in_test = set(X_test.columns) - set(features_train)
X_test = X_test.drop(columns=extra_in_test)


In [58]:
missing_in_test = set(features_train) - set(X_test.columns)
missing_in_train = set(X_test.columns) - set(features_train)
print("Missing in test set:", missing_in_test)
print("Extra in test set:", missing_in_train)

Missing in test set: set()
Extra in test set: set()


In [59]:
X_train.shape

(46221, 95)

In [60]:
X_test.shape

(4970, 95)

In [61]:
X_train

Unnamed: 0,yr_Fr,yr_Jr,yr_So,yr_Sr,conf_ACC,conf_AE,conf_ASun,conf_Amer,conf_B10,conf_B12,...,mp,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.233776,-0.121603,-0.461089,-0.793304,-0.863773,-0.885819,0.100742,-0.393791,-0.583150,-0.381906
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.705198,0.335939,0.061273,-0.143705,-0.420041,-0.344611,0.639428,-0.254064,-0.118649,0.034399
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.521834,0.604205,-0.134697,-0.206625,0.310241,0.143264,0.758482,-0.105604,-0.674020,1.298333
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.086037,0.287245,0.016378,-0.112177,-0.304819,-0.252889,0.021971,0.075871,-0.359396,-0.169624
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.552330,0.062292,-0.068417,0.859405,0.972935,0.985658,-0.192667,-0.163113,0.112657,0.365281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46216,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.514696,-4.881959,-6.506523,-1.051293,-1.284338,-1.271468,-0.916804,-1.131191,-0.674020,-1.168241
46217,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.483251,-8.249638,-4.075059,-1.051293,-1.056560,-1.112329,-0.632355,-0.421267,-0.674020,-1.168241
46218,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.326005,-0.581583,-1.724074,-1.051293,-1.284338,-1.271468,-0.916804,-1.131191,-0.674020,-1.168241
46219,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.514696,0.779042,-3.826252,-1.051293,-1.284338,-1.271468,-0.063373,-1.131191,-0.674020,-1.168241


In [62]:
X_test

Unnamed: 0,yr_Fr,yr_Jr,yr_So,yr_Sr,conf_ACC,conf_AE,conf_ASun,conf_Amer,conf_B10,conf_B12,...,treb,ast,stl,blk,pts,conf_GWC,conf_Ind,conf_Ivy,conf_P10,conf_ind
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.117648,-0.927909,-1.114426,-0.684429,-0.973761,0,0,0,0,0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.445316,-0.533705,-0.731520,-0.684429,-0.694412,0,0,0,0,0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.508257,-0.060574,0.289774,4.280789,0.642372,0,0,0,0,0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.140606,-0.927909,-1.114426,-0.684429,-1.032290,0,0,0,0,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.608856,0.090255,0.625501,-0.455738,1.751779,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4965,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.278400,-0.494242,-0.061329,-0.684429,-1.178623,0,0,0,0,0
4966,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.157824,-0.927909,-1.114426,-0.684429,-1.178623,0,0,0,0,0
4967,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.278400,-0.927909,-1.114426,-0.684429,-1.178623,0,0,0,0,0
4968,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-1.278400,-0.927909,-1.114426,-0.684429,-1.178623,0,0,0,0,0


In [63]:
# Ensure the columns in the test set are in the same order as in the training set
X_test = X_test[features_train.columns]

#### [3.4] Split data into train and validation

In [69]:
# Split training data for validation
# Perform stratified train-test split
X_train, X_val, y_train, y_val = train_test_split(features_train, target, test_size=0.3, stratify=target, random_state=42)

### 4. Train Model

In [65]:
# Load model
logistic_1 = LogisticRegression(random_state = 42, max_iter = 10000)

In [70]:
# Fit model
logistic_1.fit(X_train, y_train)

In [71]:
# Store model using dump
from joblib import dump
dump(logistic_1,  '../models/logistic_1.joblib')

['../models/logistic_1.joblib']

#### [4.1] Model Assessment

In [72]:
#Predict the probabilities on the validation set
y_val_probs = logistic_1.predict_proba(X_val)[:, 1]  # Get probabilities for the positive class (class 1)


In [73]:
#Predict the probabilities on the training set
y_train_prob = logistic_1.predict_proba(X_train)[:, 1]

In [74]:
#Calculate the AUC score for training set
auc_score = roc_auc_score(y_train, y_train_prob)
print(f"AUC Score: {auc_score:.4f}")

AUC Score: 0.9925


In [75]:
#Calculate the AUC score for validation set
auc_score = roc_auc_score(y_val, y_val_probs)
print(f"AUC Score: {auc_score:.4f}")

AUC Score: 0.9932


In [76]:
# Predict the probabilities for the test set
y_pred_prob = logistic_1.predict_proba(X_test)[:, 1]

In [77]:
# Create the output DataFrame
output_df = df_test[['player_id']].copy()  # Ensure 'player_id' is in the test dataset
output_df['drafted'] = y_pred_prob

In [78]:
output_df['drafted'].describe()

count    4.970000e+03
mean     1.042257e-02
std      6.688159e-02
min      2.526112e-16
25%      1.923742e-06
50%      1.972736e-05
75%      2.733420e-04
max      9.991831e-01
Name: drafted, dtype: float64

In [79]:
# Format as required by Kaggle Competition
output_df

Unnamed: 0,player_id,drafted
0,23549e01-c1b3-4ca0-a0fd-de9b5d76276b,2.608326e-08
1,52a518bb-b34a-4b43-adee-5e996cb853fa,5.466351e-08
2,ad3d9117-b6bf-4675-ab97-3497acf3e555,7.017369e-05
3,eaf66a5c-6f4c-4070-bc70-a99d731b3740,1.846426e-06
4,55d07491-5bd1-447f-844e-9cb36eaa442e,5.587810e-03
...,...,...
4965,28222513-8a1f-4a48-8fde-16888e9e11ce,6.295669e-07
4966,c32a466d-7a66-47eb-805c-a94e328261bc,1.217894e-05
4967,55f0ddef-9f29-47ae-87b5-da43c687d25c,4.286406e-13
4968,460d6a42-5dbc-48f0-bc94-3650da83f345,2.301287e-12


In [80]:
output_df.to_csv('../data/external/logistic_2.csv', index=False)

In [81]:
# Save training and test data for future modelling
X_train.to_csv('../data/processed/X_train_1.csv', index=False)
X_test.to_csv('../data/processed/X_test_1.csv', index=False)
y_train.to_csv('../data/processed/y_train_1.csv', index=False)


Summary of features used:
- Numeric Columns: GP, Min_per, Ortg, usg, eFG, TS_per, ORB_per, DRB_per, AST_per, TO_per, FTM, FTA, FT_per, twoPM, twoPA, twoP_per, TPM, TPA, TP_per, blk_per, stl_per, ftr, ht, porpag, adjoe, pfr, Rec_Rank, ast_tov, rimmade, rimmade_rimmiss, midmade, midmade_midmiss, rim_ratio, mid_ratio, dunksmade, dunksmiss_dunksmade, dunks

- Categorical column: yr, conf

- Features dropped: team, type

Outliers are not removed from these features yet for the first iteration of the experiment.
