In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('all_seasons.csv')
df = df[['player_name', 'college', 'country', 'age', 'player_height', 'player_weight', 'pts', 'reb']]
df.columns = ['Name', 'Club', 'Country', 'Age', 'Height', 'Weight', 'Avg Points', 'Avg Rebounds']
df.head()

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Avg Points,Avg Rebounds
0,Chris Robinson,Western Kentucky,USA,23.0,195.58,90.7184,4.6,1.7
1,Matt Fish,North Carolina-Wilmington,USA,27.0,210.82,106.59412,0.3,0.8
2,Matt Bullard,Iowa,USA,30.0,208.28,106.59412,4.5,1.6
3,Marty Conlon,Providence,USA,29.0,210.82,111.13004,7.8,4.4
4,Martin Muursepp,,USA,22.0,205.74,106.59412,3.7,1.6


In [3]:
df['Diterima'] = 0
df.loc[(df['Age'] <= 25) & 
       (df['Height'] >= 180) &
       (df['Weight'] <= 90) &
       (df['Avg Points'] >= 6) &
       (df['Avg Rebounds'] >= 3), 
       'Diterima'] = 1
df

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Avg Points,Avg Rebounds,Diterima
0,Chris Robinson,Western Kentucky,USA,23.0,195.58,90.71840,4.6,1.7,0
1,Matt Fish,North Carolina-Wilmington,USA,27.0,210.82,106.59412,0.3,0.8,0
2,Matt Bullard,Iowa,USA,30.0,208.28,106.59412,4.5,1.6,0
3,Marty Conlon,Providence,USA,29.0,210.82,111.13004,7.8,4.4,0
4,Martin Muursepp,,USA,22.0,205.74,106.59412,3.7,1.6,0
...,...,...,...,...,...,...,...,...,...
9556,Elijah Millsap,Alabama-Birmingham,USA,29.0,200.66,97.52228,1.5,3.0,0
9557,Elfrid Payton,Louisiana-Lafayette,USA,23.0,193.04,83.91452,12.8,4.7,1
9558,Edy Tavares,,Cabo Verde,25.0,220.98,120.20188,4.0,5.5,0
9559,Willy Hernangomez,,Spain,23.0,210.82,108.86208,8.2,7.0,0


In [4]:
df.isnull().sum()

Name            0
Club            0
Country         0
Age             0
Height          0
Weight          0
Avg Points      0
Avg Rebounds    0
Diterima        0
dtype: int64

In [5]:
scaler = StandardScaler()
dfscl = pd.DataFrame(scaler.fit_transform(df[['Age', 'Height', 'Weight', 'Avg Points', 'Avg Rebounds']]),
                     columns=['Age', 'Height', 'Weight', 'Avg Points', 'Avg Rebounds'])
dfscl

Unnamed: 0,Age,Height,Weight,Avg Points,Avg Rebounds
0,-0.996742,-0.578993,-0.802604,-0.589390,-0.747850
1,-0.077272,1.063130,0.439607,-1.319614,-1.108712
2,0.612331,0.789443,0.439607,-0.606372,-0.787946
3,0.382463,1.063130,0.794524,-0.045967,0.334735
4,-1.226609,0.515755,0.439607,-0.742228,-0.787946
...,...,...,...,...,...
9556,0.382463,-0.031619,-0.270228,-1.115831,-0.226605
9557,-0.996742,-0.852680,-1.334980,0.803131,0.455022
9558,-0.537007,2.157878,1.504359,-0.691282,0.775788
9559,-0.996742,1.063130,0.617065,0.021961,1.377224


In [6]:
xtr, xts, ytr, yts = train_test_split(
    dfscl, df['Diterima'], train_size=0.82
)
len(xts)

1721

### Hyper Parameter Tuning

In [7]:
# param LogReg
penalty = ['l1', 'l2', 'elasticnet', 'none']
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
max_iter = [1, 10, 100, 1000, 10000]

param1 = {
    'penalty': penalty, 'solver': solver, 'max_iter': max_iter
}
param1

{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
 'max_iter': [1, 10, 100, 1000, 10000]}

In [8]:
# param DecTree
criterion = ['gini', 'entropy']
splitter = ['best', 'random']

param2 = {
    'criterion': criterion, 'splitter': splitter
}
param2

{'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}

In [9]:
# param KNN
n_neighbors = [1,2,3,4,5,6,7,8,9,10]
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']

param3 = {
    'n_neighbors': n_neighbors, 'weights': weights, 'algorithm': algorithm
}

param3

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 'weights': ['uniform', 'distance'],
 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

In [10]:
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = KNeighborsClassifier()

In [11]:
modelgs1 = GridSearchCV(
    model1, param1, cv= 5
)
modelgs2 = GridSearchCV(
    model2, param2, cv= 5
)
modelgs3 = GridSearchCV(
    model3, param3, cv= 5
)

In [12]:
modelgs1.fit(xtr, ytr)
print(modelgs1.best_params_)

modelgs2.fit(xtr, ytr)
print(modelgs2.best_params_)

modelgs3.fit(xtr, ytr)
print(modelgs3.best_params_)

ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

In [13]:
modelBaru1 = LogisticRegression(
    solver='newton-cg', penalty='none', max_iter=10
)
modelBaru2 = DecisionTreeClassifier(
    criterion='gini', splitter='best'
)
modelBaru3 = KNeighborsClassifier(
    algorithm='auto', n_neighbors=9, weights='uniform'
)

modelBaru1.fit(xtr, ytr)
modelBaru2.fit(xtr, ytr)
modelBaru3.fit(xtr, ytr)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

### Evaluation Metrics
Balanced Accuracy

In [14]:
print('Balanced Accuracy LogReg=', balanced_accuracy_score(yts, modelBaru1.predict(xts)))
print('Balanced Accuracy DecTree=', balanced_accuracy_score(yts, modelBaru2.predict(xts)))
print('Balanced Accuracy KNN=', balanced_accuracy_score(yts, modelBaru3.predict(xts)))

Balanced Accuracy LogReg= 0.6999985819223461
Balanced Accuracy DecTree= 1.0
Balanced Accuracy KNN= 0.8556538756062282


In [15]:
print('Precision LogReg=', precision_score(yts, modelBaru1.predict(xts)))
print('Precision DecTree=', precision_score(yts, modelBaru2.predict(xts)))
print('Precision KNN=', precision_score(yts, modelBaru3.predict(xts)))

Precision LogReg= 0.68
Precision DecTree= 1.0
Precision KNN= 0.8571428571428571


In [16]:
print('Recall LogReg=', recall_score(yts, modelBaru1.predict(xts)))
print('Recall DecTree=', recall_score(yts, modelBaru2.predict(xts)))
print('Recall KNN=', recall_score(yts, modelBaru3.predict(xts)))

Recall LogReg= 0.40476190476190477
Recall DecTree= 1.0
Recall KNN= 0.7142857142857143


In [17]:
print('F1 Score LogReg=', f1_score(yts, modelBaru1.predict(xts)))
print('F1 Score DecTree=', f1_score(yts, modelBaru2.predict(xts)))
print('F1 Score KNN=', f1_score(yts, modelBaru3.predict(xts)))

F1 Score LogReg= 0.5074626865671642
F1 Score DecTree= 1.0
F1 Score KNN= 0.7792207792207793


In [18]:
print('ROC AUC Score LogReg=', roc_auc_score(yts, modelBaru1.predict(xts)))
print('ROC AUC Score DecTree=', roc_auc_score(yts, modelBaru2.predict(xts)))
print('ROC AUC Score KNN=', roc_auc_score(yts, modelBaru3.predict(xts)))

ROC AUC Score LogReg= 0.6999985819223461
ROC AUC Score DecTree= 1.0
ROC AUC Score KNN= 0.8556538756062282


### Menggunakan Decision Tree sebagai model klasifikasi¶

In [20]:
dfINA = pd.read_excel('indo.xlsx')
dfINA

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Avg Points,Avg Rebounds
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,Indonesia,23,178,83,10,3
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,Indonesia,26,185,75,11,4
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,Indonesia,20,183,80,5,2
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,Indonesia,21,191,85,4,10
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5
7,Vincent Rivaldi Kosasih,Stapac,Indonesia,23,179,87,1,2
8,Hardian Wicaksono,Pacific Caesar Surabaya,Indonesia,21,177,80,9,8
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5


In [21]:
dfINA['Diterima'] = modelBaru2.predict(dfINA[['Age', 'Height', 'Weight', 'Avg Points', 'Avg Rebounds']])
dfINA.loc[dfINA['Diterima'] == 0, 'Diterima'] = 'Tidak Diterima'
dfINA.loc[dfINA['Diterima'] == 1, 'Diterima'] = 'Diterima'
dfINA

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Avg Points,Avg Rebounds,Diterima
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6,Tidak Diterima
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3,Tidak Diterima
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,Indonesia,23,178,83,10,3,Tidak Diterima
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,Indonesia,26,185,75,11,4,Tidak Diterima
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,Indonesia,20,183,80,5,2,Tidak Diterima
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,Indonesia,21,191,85,4,10,Tidak Diterima
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5,Tidak Diterima
7,Vincent Rivaldi Kosasih,Stapac,Indonesia,23,179,87,1,2,Tidak Diterima
8,Hardian Wicaksono,Pacific Caesar Surabaya,Indonesia,21,177,80,9,8,Tidak Diterima
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5,Tidak Diterima


#### Berdasarkan hasil yang didapatkan apat kami disimpulkan bahwa dari data pemain muda Indonesia yang diberikan ini semuanya belum layak untuk direkrut
In [ ]:
