## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
y = yourData['Label']
X = yourData.drop(columns=['Label'])

# Stratified hold-out split; change test_size to adjust the train/test ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Helper Functions

**Scoring Metrics**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def _positive_class_scores(model, features):
    """Return a continuous positive-class score for each row of ``features``.

    ROC AUC ranks samples by score, so it needs probabilities or decision
    values rather than hard 0/1 predictions. Falls back to ``predict`` only
    when the model exposes neither ``predict_proba`` nor ``decision_function``.
    """
    if hasattr(model, 'predict_proba'):
        # Binary problem: column 1 holds the probability of the second class,
        # which is the one treated as positive by the metrics below.
        return model.predict_proba(features)[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(features)
    return model.predict(features)


def evaluasi(model, x_train, y_train, x_test, y_test):
    """Print accuracy, precision, recall and ROC AUC for train and test data.

    Parameters
    ----------
    model : fitted binary classifier (or fitted search object) with ``predict``.
    x_train, y_train : training features and labels.
    x_test, y_test : held-out features and labels.

    Returns
    -------
    None. The metrics are printed.

    Notes
    -----
    Precision and recall are called with their default settings, so this
    helper is meant for binary targets. ROC AUC is computed from class
    scores (see ``_positive_class_scores``) instead of hard predictions.
    """
    prediksi_train = model.predict(x_train)
    prediksi_test = model.predict(x_test)
    skor_train = _positive_class_scores(model, x_train)
    skor_test = _positive_class_scores(model, x_test)

    print('Akurasi data latih = {}'.format(accuracy_score(y_train, prediksi_train)))
    print('Akurasi presisi data latih = {}'.format(precision_score(y_train, prediksi_train)))
    print('Akurasi recall data latih = {}'.format(recall_score(y_train, prediksi_train)))
    print("================================================")
    print('Akurasi data uji = {}'.format(accuracy_score(y_test, prediksi_test)))
    print('Akurasi presisi data uji = {}'.format(precision_score(y_test, prediksi_test)))
    print('Akurasi recall data uji = {}'.format(recall_score(y_test, prediksi_test)))
    print("================================================")
    # ROC AUC uses the continuous scores, not the thresholded predictions.
    print('Akurasi ROC AUC data latih = {}'.format(roc_auc_score(y_train, skor_train)))
    print('Akurasi ROC AUC data uji = {}'.format(roc_auc_score(y_test, skor_test)))

**Best parameters when using hyperparameter tuning**

In [None]:
def show_best_hyperparameter(model, hyperparameters):
    """Print the value the model holds for every searched hyperparameter.

    Parameters
    ----------
    model : object exposing ``get_params()`` that returns a mapping of
        parameter name to value (e.g. the ``best_estimator_`` of a search).
    hyperparameters : mapping whose keys are the parameter names to report.
        Only the keys are used; the candidate values are ignored.

    Raises
    ------
    KeyError
        If a key of ``hyperparameters`` is not present in ``get_params()``.
    """
    # Fetch the parameters once instead of once per reported key.
    chosen = model.get_params()
    for key in hyperparameters:
        print('Best '+key+':', chosen[key])

**Feature importance of a classification model, for evaluation**

In [None]:
import matplotlib.pyplot as plt

def show_feature_importance(model, feature_names=None, top_n=25):
    """Plot the largest feature importances of a fitted model as a bar chart.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : optional sequence of labels, one per importance value.
        When omitted, the names stored on the model (``feature_names_in_``)
        are used if present, otherwise the columns of the notebook-level ``X``.
    top_n : int, default 25
        Number of highest-scoring features to display.

    Returns
    -------
    None. The chart is drawn on a new matplotlib figure.
    """
    # Imported here because no pandas import appears in the cells shown.
    import pandas as pd

    if feature_names is None:
        # Prefer names stored on the model so the plot does not depend on
        # whichever ``X`` currently lives in the kernel.
        feature_names = getattr(model, 'feature_names_in_', None)
        if feature_names is None:
            feature_names = X.columns

    feat_importances = pd.Series(model.feature_importances_, index=feature_names)

    # Explicit figure/axes so repeated calls never draw over an earlier plot.
    fig, ax = plt.subplots(figsize=(10, 8))
    feat_importances.nlargest(top_n).plot(kind='barh', ax=ax)
    ax.invert_yaxis()  # largest importance at the top

    ax.set_xlabel('Score')
    ax.set_ylabel('Feature')
    ax.set_title('Feature Importance')

## KNN

If you don't want to use hyperparameter tuning, or want to compare the tuned and untuned models, you can delete or modify the tuning code below.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV # You can use GridSearch too

# Build the hyperparameter search space.
# During cross-validation each candidate is fitted on (cv_folds - 1) / cv_folds
# of X_train, and KNN cannot look up more neighbours than it was fitted on.
# Cap n_neighbors at that size (and at the original upper limit of 999).
cv_folds = 5
max_neighbors = max(1, min(999, len(X_train) * (cv_folds - 1) // cv_folds))

hyperparameters = dict(
    n_neighbors = list(range(1, max_neighbors + 1)),
    weights = ['uniform', 'distance'],
    algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'],
    p = [1, 2],
)

# Build the model: randomized search over the space above, optimising recall.
knn_model = RandomizedSearchCV(KNeighborsClassifier(), hyperparameters, random_state=42, cv=cv_folds, scoring='recall')

# Train
knn_model.fit(X_train, y_train)

# Evaluate the model
evaluasi(knn_model, X_train, y_train, X_test, y_test)
print('\n')
show_best_hyperparameter(knn_model.best_estimator_, hyperparameters)

## Decision Tree

If you don't want to use hyperparameter tuning, or want to compare the tuned and untuned models, you can delete or modify the tuning code below.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV # You can use grid search too

# Build the hyperparameter search space.
# 'auto' is no longer accepted for max_features by recent scikit-learn
# releases; for classifiers it was an alias of 'sqrt', so it is dropped here.
hyperparameters = dict(
    criterion = ['gini', 'entropy'],
    splitter = ['best', 'random'],
    max_depth = list(range(1, 2000)),
    max_features = ['sqrt', 'log2'],
)

# Build the model: randomized search over the space above, optimising recall.
decision_model = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), hyperparameters, random_state=42, cv=5, scoring='recall')

# Train
decision_model.fit(X_train, y_train)

# Evaluate the model
evaluasi(decision_model, X_train, y_train, X_test, y_test)
print('\n')
show_best_hyperparameter(decision_model.best_estimator_, hyperparameters)

## Random Forest

If you don't want to use hyperparameter tuning, or want to compare the tuned and untuned models, you can delete or modify the tuning code below.

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Build the hyperparameter search space.
# 'auto' is no longer accepted for max_features by recent scikit-learn
# releases; for classifiers it was an alias of 'sqrt', so it is dropped here.
hyperparameters = dict(
    n_estimators = [int(x) for x in np.linspace(1, 1000, num=20)],
    criterion = ['gini', 'entropy'],
    max_depth = [int(x) for x in np.linspace(1, 1000, num=20)],
    min_samples_split = [int(x) for x in np.linspace(10, 500, num=20)],
    min_samples_leaf = [int(x) for x in np.linspace(10, 500, num=20)],
    max_features = ['sqrt', 'log2'],
    bootstrap = [True, False],
    class_weight = ['balanced', 'balanced_subsample'],
)

# Build the model: randomized search over the space above, optimising recall.
randomForest_model = RandomizedSearchCV(RandomForestClassifier(random_state=42), hyperparameters, cv=5, random_state=42, scoring='recall')

# Train
randomForest_model.fit(X_train, y_train)

# Evaluate the model
evaluasi(randomForest_model, X_train, y_train, X_test, y_test)
print('\n')
# Report the selected hyperparameters, as the other tuned models do.
show_best_hyperparameter(randomForest_model.best_estimator_, hyperparameters)
print('\n')
show_feature_importance(randomForest_model.best_estimator_)

## AdaBoost

In [None]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter search space.
# The `algorithm` option is intentionally not searched: 'SAMME.R' and the
# parameter itself are deprecated/removed in recent scikit-learn releases,
# so the estimator's own default is used instead.
hyperparameters = dict(
    n_estimators = [int(x) for x in np.linspace(1, 1000, num=20)],
    learning_rate = [float(x) for x in np.linspace(0.001, 0.1, num=20)],
)

# Build the model: randomized search over the space above, optimising recall.
adaboost_model = RandomizedSearchCV(AdaBoostClassifier(random_state=42), hyperparameters, cv=5, random_state=42, scoring='recall')

# Train
adaboost_model.fit(X_train, y_train)

# Evaluate the model
evaluasi(adaboost_model, X_train, y_train, X_test, y_test)
print('\n')
show_best_hyperparameter(adaboost_model.best_estimator_, hyperparameters)
print('\n')
show_feature_importance(adaboost_model.best_estimator_)

## XGBoost

If you don't want to use hyperparameter tuning, or want to compare the tuned and untuned models, you can delete or modify the tuning code below.

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Hyperparameter search space.
# colsample_bytree must be strictly greater than 0, and eta = 0 would stop the
# booster from learning anything, so both ranges start above zero.
hyperparameters = {
    'max_depth' : [int(x) for x in np.linspace(1, 1000, num=10)],
    'min_child_weight' : [int(x) for x in np.linspace(1, 50, num=10)],
    'gamma' : [float(x) for x in np.linspace(0, 1, num=10)],
    'tree_method' : ['auto', 'exact', 'approx', 'hist'],
    'colsample_bytree' : [float(x) for x in np.linspace(0.1, 1, num=10)],
    'eta' : [float(x) for x in np.linspace(0.01, 1, num=100)],
    'lambda' : [float(x) for x in np.linspace(0, 1, num=10)],
    'alpha' : [float(x) for x in np.linspace(0, 1, num=10)]
}

# Build the model: randomized search over the space above, optimising recall.
# random_state is fixed on the estimator, like the other models in this
# notebook, so column subsampling is reproducible.
# NOTE(review): 'mlogloss' is the multiclass log-loss metric; for a binary
# target 'logloss' is presumably what is wanted — confirm.
xgboost_model = RandomizedSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42), hyperparameters, cv=5, random_state=42, scoring='recall')

# Train
xgboost_model.fit(X_train, y_train)

# Evaluate the model
evaluasi(xgboost_model, X_train, y_train, X_test, y_test)
print('\n')
show_best_hyperparameter(xgboost_model.best_estimator_, hyperparameters)