In [1]:
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the cleaned train and test CSV files into DataFrames in one statement.
train_df, test_df = (
    pd.read_csv('data/clean_train.csv'),
    pd.read_csv('data/clean_test.csv'),
)

In [3]:
# Features: the training frame minus the listed columns (the target 'Survived' is one of them).
# DataFrame.drop already returns a new frame, so train_df itself is left untouched.
X = train_df.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])
# Target: an independent copy of the 'Survived' column.
y = train_df['Survived'].copy()

# Test features: same column removal, except that no 'Survived' column is dropped here.
X_test = test_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [4]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score

# Stratified 3-fold splitter; this single object is passed to every hyper-parameter search.
validation = StratifiedKFold(n_splits=3)

In [11]:
def predicted_y(X, y, validation, params, algorithm):
    """Cross-validated class probabilities for one hyper-parameter setting.

    Args:
        X: feature matrix handed to ``cross_val_predict``.
        y: target values handed to ``cross_val_predict``.
        validation: cross-validation splitter (passed as ``cv``).
        params: dict of keyword arguments used to instantiate ``algorithm``.
        algorithm: estimator class; it is called as ``algorithm(**params)``.

    Returns:
        A list holding, for every sample, the entry at index 1 of its
        ``predict_proba`` row (the probability of the second class).
    """
    fold_probabilities = cross_val_predict(
        algorithm(**params), X, y, cv=validation, method='predict_proba'
    )
    # Keep only column 1 of each per-sample probability row.
    return [row[1] for row in fold_probabilities]

def model_function(X, y, validation, algorithm):
    """Build a hyperopt objective for ``algorithm`` evaluated on ``X`` / ``y``.

    Args:
        X: feature matrix forwarded to ``predicted_y``.
        y: true labels; also used as the reference for ``roc_auc_score``.
        validation: cross-validation splitter forwarded to ``predicted_y``.
        algorithm: estimator class forwarded to ``predicted_y``.

    Returns:
        A one-argument function ``objective_function(params)`` that returns a
        dict with keys ``'loss'`` (negated AUC), ``'auc'``, ``'parameters'``
        (the ``params`` it was called with) and ``'status'``.
    """
    def objective_function(params):
        scores = predicted_y(X, y, validation, params, algorithm)
        auc = roc_auc_score(y, scores)
        # The loss is the negated AUC: the optimiser minimises, we want a high AUC.
        return {
            'loss': -auc,
            'auc': auc,
            'parameters': params,
            'status': STATUS_OK,
        }

    return objective_function

In [12]:
def best_parameters(algorithm, X, y, validation, hyper_space, n_evals):
    """Search ``hyper_space`` with TPE and return the best estimator parameters.

    Args:
        algorithm: estimator class to tune.
        X: feature matrix.
        y: target values.
        validation: cross-validation splitter used for every evaluation.
        hyper_space: hyperopt search space (dict of ``hp.*`` expressions and
            fixed values).
        n_evals: number of evaluations given to ``fmin`` as ``max_evals``.

    Returns:
        A dict keyed like ``hyper_space`` holding the concrete values of the
        best trial, i.e. something that can be passed as ``algorithm(**result)``.
    """
    objective = model_function(X, y, validation, algorithm)
    trials = Trials()
    best = fmin(objective, space=hyper_space, algo=tpe.suggest, max_evals=n_evals, trials=trials)
    # fmin returns raw label -> value assignments: hp.choice entries are list
    # indices, fixed entries of the space are absent and arithmetic such as
    # `hp.randint(...) + 30` is not applied. space_eval resolves all of that
    # into the actual parameter values.
    return space_eval(hyper_space, best)


# trials_list = []

# logistic_objective = model_function(X, y, validation, LogisticRegression, trials_list)

# hyper_space = {
#     'C': hp.uniform('C', 0, 1),
#     'penalty': hp.choice('penalty', ['l2', 'l1'])
# }
# objective = model_function(X, y, validation, RandomForestClassifier, trials_list)
# trials = Trials()
# best = fmin(objective, space=rf_space, algo=tpe.suggest, max_evals=100, trials=trials)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Search space for LogisticRegression: penalty type, regularisation strength C,
# solver, and iteration cap.
logreg_space = dict(
    penalty=hp.choice('penalty', ['l1', 'l2']),
    C=hp.uniform('C', 0, 5),
    solver=hp.choice('solver', ['liblinear', 'saga']),
    # The randint draw is shifted by 100 so that max_iter is never below 100.
    max_iter=hp.randint('max_iter', 500) + 100,
)

# Search space for RandomForestClassifier. The number of trees is fixed at 100;
# every other entry is sampled by hyperopt.
rf_space = dict(
    n_estimators=100,
    criterion=hp.choice('criterion', ['gini', 'entropy']),
    # The size-related settings below are all sampled as fractions.
    max_features=hp.uniform('max_features', 0, 1),
    min_samples_split=hp.uniform('min_samples_split', 0, 1),
    min_samples_leaf=hp.uniform('min_samples_leaf', 0, 0.5),
    min_weight_fraction_leaf=hp.uniform('min_weight_fraction_leaf', 0, 0.5),
    bootstrap=hp.choice('bootstrap', [True, False]),
    class_weight=hp.choice('class_weight', ['balanced_subsample', 'balanced']),
)

# Search space for SVC: regularisation strength C and kernel type.
svc_space = {
    'C': hp.uniform('C', 0, 5),
    # Each kernel is listed once so that all of them start with the same prior weight.
    'kernel': hp.choice('kernel', ['rbf', 'linear', 'poly', 'sigmoid']),
    # predicted_y asks cross_val_predict for 'predict_proba', which SVC only
    # provides when probability estimates are switched on.
    'probability': True,
}

def _knn_space(n_neighbors):
    """Build the KNeighborsClassifier search space for a fixed neighbour count.

    Args:
        n_neighbors: value stored unchanged under ``'n_neighbors'``; it is not
            searched.

    Returns:
        A dict mapping KNeighborsClassifier argument names to hyperopt
        expressions (plus the fixed ``'n_neighbors'``).
    """
    return {
        'n_neighbors': n_neighbors,
        'weights': hp.choice('weights', ['uniform', 'distance']),
        'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree']),
        # The randint draws are shifted so leaf_size starts at 30 and p at 2.
        'leaf_size': hp.randint('leaf_size', 120) + 30,
        'p': hp.randint('p', 2) + 2,
    }


# One search space per neighbour count; they differ only in 'n_neighbors'.
knn2_space = _knn_space(2)
knn4_space = _knn_space(4)
knn8_space = _knn_space(8)
knn16_space = _knn_space(16)

# Estimator classes paired with their hyperopt search spaces. Every entry must
# use the keys 'algo' and 'space', because the search loop reads algo['algo']
# and algo['space'].
algorithms_space = [
#     {'algo': LogisticRegression, 'space': logreg_space},
    {'algo': RandomForestClassifier, 'space': rf_space},
    {'algo': SVC, 'space': svc_space},
    {'algo': KNeighborsClassifier, 'space': knn2_space},
    {'algo': KNeighborsClassifier, 'space': knn4_space},
    {'algo': KNeighborsClassifier, 'space': knn8_space},
    {'algo': KNeighborsClassifier, 'space': knn16_space}
]

In [17]:
# Run one hyper-parameter search per entry of algorithms_space, printing each
# result as soon as it is available and collecting all of them in order.
parameter_list = []

for algo in algorithms_space:
    best = best_parameters(
        algorithm=algo['algo'],
        X=X,
        y=y,
        validation=validation,
        hyper_space=algo['space'],
        n_evals=500,
    )
    print(best)
    parameter_list.append(best)

KeyboardInterrupt: 

In [None]:
# Display the results collected for each entry of algorithms_space.
parameter_list