In [None]:
# def fit_metrics_print(model, X_train, y_train, X_val, y_val):
#     from sklearn.metrics import accuracy_score, classification_report
#     model.fit(X_train, y_train)
    
#     pred_train = model.predict(X_train)
#     pred_val = model.predict(X_val)
#     print(model)
#     print()
#     print('train accuracy :', accuracy_score(y_train, pred_train))
#     print('validation accuracy :', accuracy_score(y_val, pred_val))
#     print('='*60)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# Load the dataset into `df`.
# NOTE(review): read_csv() is called with no arguments; pandas requires the
# filepath_or_buffer argument, so this cell raises TypeError as written.
# The dataset path appears to have been removed — restore it before running.
df = pd.read_csv()

In [8]:
# Separate the label (`target`) from the feature matrix; the identifier
# column is removed from the features together with the label.
drop_cols = ['enrollee_id', 'target']
X, y = df.drop(drop_cols, axis=1), df['target']

In [9]:
# Hold out a validation split. Stratifying on the label keeps the class
# proportions the same in both splits; the fixed seed makes it repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

# Decision Tree

In [None]:
# Training: fit a baseline decision tree with default hyper-parameters.
from sklearn.tree import DecisionTreeClassifier

# Seeded like the other models in this notebook: the tree builder permutes
# features while searching for splits, so without a seed the fitted tree
# (and the scores / importances below) can change between kernel restarts.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X_train, y_train)

# Impurity-based importance of each feature, in X_train column order.
tree.feature_importances_

In [None]:
# Validation: compare accuracy on the training split with the held-out split.
from sklearn.metrics import accuracy_score, classification_report

pred_train, pred_val = (tree.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

## gridsearch

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree.
# max_leaf_nodes starts at 2 because scikit-learn rejects smaller values:
# a value of 1 would make every candidate that uses it fail to fit and be
# scored as NaN, wasting a tenth of the search.
param_grid = dict(
    max_depth=range(1, 11),
    max_leaf_nodes=range(2, 11),
    random_state=[1]
)

# Exhaustive search with 3-fold cross-validation, ranked by accuracy.
grid_tree = GridSearchCV(tree,
                         param_grid=param_grid,
                         scoring='accuracy',
                         cv=3,
                         n_jobs=-1)
grid_tree.fit(X_train, y_train)

In [None]:
# Display the decision tree selected by the grid search.
grid_tree.best_estimator_

In [None]:
# Score the tuned decision tree (GridSearchCV predicts with its best estimator).
pred_train, pred_val = (grid_tree.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

-----------------------------------
# RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with hand-picked hyper-parameters, seeded for reproducibility.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    max_features=12,
    random_state=1,
)
rf.fit(X=X_train, y=y_train)

In [None]:
# Score the random forest on the training and validation splits.
pred_train, pred_val = (rf.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

## gridsearch

In [None]:
# Search space for the random forest: forest size is fixed at 200 trees while
# max_features (10-12) and max_depth (1-10) are varied; the seed is pinned.
param_grid = dict(
    n_estimators=[200],
    max_features=range(10, 13),
    max_depth=range(1, 11),
    random_state=[1]
)

# A fresh estimator is passed inline instead of being assigned to `rf`, so
# the forest fitted earlier stays bound to `rf`. Rebinding `rf` to an
# unfitted default forest would break re-running the evaluation cell above
# and would hand an untuned forest to anything that uses `rf` later.
grid_rf = GridSearchCV(RandomForestClassifier(),
                       param_grid=param_grid,
                       scoring='accuracy',
                       cv=3,
                       n_jobs=-1)
grid_rf.fit(X_train, y_train)

In [None]:
# Display the random forest selected by the grid search.
grid_rf.best_estimator_

In [None]:
# Score the tuned random forest on the training and validation splits.
pred_train, pred_val = (grid_rf.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

-----------------------------------
# KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier with library-default settings.
knn = KNeighborsClassifier()
knn.fit(X=X_train, y=y_train)

In [None]:
# Score the k-NN baseline on the training and validation splits.
pred_train, pred_val = (knn.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

## gridsearch

In [None]:
# Tune the number of neighbours (1 through 9) with 2-fold cross-validation.
grid_knn = GridSearchCV(
    estimator=knn,
    param_grid={'n_neighbors': range(1, 10)},
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
)
grid_knn.fit(X=X_train, y=y_train)

In [None]:
# Display the k-NN classifier selected by the grid search.
grid_knn.best_estimator_

In [None]:
# Score the tuned k-NN classifier on the training and validation splits.
pred_train, pred_val = (grid_knn.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

-----------------------------------
# SVC

In [None]:
from sklearn.svm import SVC

# Baseline support-vector classifier with library-default settings.
svc = SVC()
svc.fit(X=X_train, y=y_train)

In [None]:
# Score the SVC baseline on the training and validation splits.
pred_train, pred_val = (svc.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

## RandomizedSearch

In [None]:
# Candidate SVC settings: kernel type, regularisation strength C, and the
# kernel coefficient gamma.
param = dict(
    kernel=['rbf', 'linear'],
    C=[0.01,  0.1, 1, 10],
    gamma=[0.01, 0.1,  1]
)

# Samples 10 of the 24 possible combinations. random_state is pinned (as in
# the gradient-boosting search) so the same candidates are drawn on every
# run; without it the selected model changes between kernel restarts.
random_svc = RandomizedSearchCV(svc,
                                param_distributions=param,
                                n_iter=10,
                                scoring='accuracy',
                                n_jobs=-1,
                                cv=2,
                                random_state=1)
random_svc.fit(X_train, y_train)

In [None]:
# Display the SVC selected by the randomized search.
random_svc.best_estimator_

In [None]:
# Score the tuned SVC on the training and validation splits.
pred_train, pred_val = (random_svc.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

-----------------------------------
# voting (hard)

In [None]:
from sklearn.ensemble import VotingClassifier

# Ensemble members as (name, estimator) pairs.
estimators = [
    ('tree', tree),
    ('knn', knn),
    ('random forest', rf),
    ('svm', svc),
]

# Hard voting: the ensemble predicts the majority class label of its members.
voting = VotingClassifier(estimators=estimators, voting='hard')
voting.fit(X=X_train, y=y_train)

In [None]:
# Score the hard-voting ensemble on the training and validation splits.
pred_train, pred_val = (voting.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

# voting (soft)

In [None]:
from sklearn.base import clone

# Soft voting averages predict_proba over the members. SVC only provides
# predict_proba when built with probability=True, so each SVC member is
# swapped for an unfitted copy with that flag enabled; the original
# estimators (and the hard-voting ensemble built from them) are untouched.
soft_estimators = [
    (name, clone(est).set_params(probability=True) if isinstance(est, SVC) else est)
    for name, est in estimators
]

voting_soft = VotingClassifier(soft_estimators, voting='soft')
voting_soft.fit(X_train, y_train)

In [None]:
# Evaluate the soft-voting ensemble (`voting_soft`), not the hard-voting one.
# Soft voting calls predict_proba on every member, so each estimator in the
# ensemble must support it (for SVC that requires probability=True).
pred_train = voting_soft.predict(X_train)
pred_val = voting_soft.predict(X_val)

print('train accuracy :', accuracy_score(y_train, pred_train))
print('validation accuracy :', accuracy_score(y_val, pred_val))

-----------------------------------
# GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient-boosting classifier; only the seed is set explicitly.
gb = GradientBoostingClassifier(random_state=1)
gb.fit(X=X_train, y=y_train)

In [None]:
# Score the gradient-boosting baseline on the training and validation splits.
pred_train, pred_val = (gb.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

## RandomizedSearch

In [None]:
# Candidate values sampled by the randomized search over gradient boosting.
param = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
    'max_depth': range(1, 5),
    'subsample': [0.5, 0.7, 1],
}

# 10 sampled candidates, 3-fold cross-validation, ranked by accuracy; the
# seed fixes which candidates are drawn.
random_gb = RandomizedSearchCV(
    estimator=gb,
    param_distributions=param,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=1,
)
random_gb.fit(X=X_train, y=y_train)

In [None]:
# Display the gradient-boosting model selected by the randomized search.
random_gb.best_estimator_

In [None]:
# Score the tuned gradient-boosting model on the training and validation splits.
pred_train, pred_val = (random_gb.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))

-------------------------------------
# LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=1 and a raised iteration cap of 300.
lr = LogisticRegression(C=1, max_iter=300, random_state=1)
lr.fit(X=X_train, y=y_train)

In [None]:
# Score the logistic-regression model on the training and validation splits.
pred_train, pred_val = (lr.predict(split) for split in (X_train, X_val))

print('train accuracy :', accuracy_score(y_true=y_train, y_pred=pred_train))
print('validation accuracy :', accuracy_score(y_true=y_val, y_pred=pred_val))