In [17]:
import os
from math import exp

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from category_encoders import MEstimateEncoder
from sklearn.linear_model import ElasticNet
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

ImportError: cannot import name 'fmax'

In [3]:

def make_mi_scores(X, y):
    X = X.copy()
    # All discrete features should now have integer dtypes
    mi_scores = mutual_info_regression(X, y, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

In [4]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    # Reading File
    df = pd.read_csv(file)

    # Reducing Size by Optimizing Dtypes of columns
    df = reduce_mem_usage(df)

    # Converting Bool cols into integer
    bool_cols = []
    for i, col in enumerate(df.columns):
        if df[col].dtypes == bool:
            bool_cols.append(i)
    df.iloc[:, bool_cols] = df.iloc[:, bool_cols].astype(int)

    return df


In [5]:
train_data_path = "train.csv"
df_train = import_data(train_data_path)


Memory usage of dataframe is 2189.64 MB
Memory usage after optimization is: 505.45 MB
Decreased by 76.9%


In [166]:
#df_smaller = df_train.sample(random_state=1, n=10000, axis=0)
X_train, X_test, y_train, y_test = train_test_split(df_train.loc[:,"f0":"f284"], df_train["target"], test_size=0.2, random_state=0)

In [167]:
mi_scores = make_mi_scores(X_train, y_train)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [None]:
X_train = X_train.loc[:, mi_scores > 0]
X_test = X_test.loc[:, mi_scores > 0]

In [65]:
xgb_params = dict(
    #objective = "binary",
    eval_metric = roc_auc_score,
    max_depth=2,           # maximum depth of each tree - try 2 to 10
    learning_rate=0.01,    # effect of each tree - try 0.0001 to 0.1
    n_estimators=1000,     # number of trees (that is, boosting rounds) - try 1000 to 8000
    min_child_weight=2,    # minimum number of houses in a leaf - try 1 to 10
    colsample_bytree=0.2,  # fraction of features (columns) per tree - try 0.2 to 1.0
    subsample=0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
    reg_alpha=3,         # L1 regularization (like LASSO) - try 0.0 to 10.0
    reg_lambda=2.0,        # L2 regularization (like Ridge) - try 0.0 to 10.0
    num_parallel_tree=1,   # set > 1 for boosted random forests
    use_label_encoder=False,
)

In [None]:
space={
    "learning_rate": hp.loguniform("learning_rate", log(0.001), log(0.03)), # effect of each tree - try 0.0001 to 0.1
    "max_depth": hp.quniform("max_depth", 2, 10, 1), # maximum depth of each tree - try 2 to 10
    "gamma": hp.uniform ("gamma", 1,9),
    "reg_alpha" : hp.uniform("reg_alpha", 2,10), # L1 regularization (like LASSO) - try 0.0 to 10.0
    "reg_lambda" : hp.uniform("reg_lambda", 2,100), # L2 regularization (like Ridge) - try 0.0 to 10.0
    "colsample_bytree" : hp.uniform("colsample_bytree", 0.1,1), # fraction of features (columns) per tree - try 0.2 to 1.0
    "min_child_weight" : hp.quniform("min_child_weight", 0, 100, 1), # minimum number of houses in a leaf - try 1 to 10
    "n_estimators": 1000, #  number of trees (that is, boosting rounds) - try 1000 to 8000
    "subsample":0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
    "seed": 0,
    "use_label_encoder":False,
    }

def objective(space):
    model=XGBRegressor(
        objective="binary:logistic",
        eval_metric = "auc",
        max_depth=int(space['max_depth']),           # maximum depth of each tree - try 2 to 10
        learning_rate=0.02,  # effect of each tree - try 0.0001 to 0.1
        n_estimators=space['n_estimators'],     # number of trees (that is, boosting rounds) - try 1000 to 8000
        min_child_weight=int(space['min_child_weight']),    # minimum number of houses in a leaf - try 1 to 10
        colsample_bytree=int(space['colsample_bytree']),  # fraction of features (columns) per tree - try 0.2 to 1.0
        subsample=0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
        reg_alpha=int(space['reg_alpha']),         # L1 regularization (like LASSO) - try 0.0 to 10.0
        reg_lambda=int(space['reg_lambda']),        # L2 regularization (like Ridge) - try 0.0 to 10.0
        gamma = space['gamma'],
        num_parallel_tree=1,   # set > 1 for boosted random forests
        use_label_encoder=False
    )
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    model.fit(
        X_train, y_train,
        #eval_set=evaluation, 
        #eval_metric="auc",
        #early_stopping_rounds=10,
        verbose=False
    )
    
    accuracy = roc_auc_score(y_test, model.predict(X_test))
    print (f"SCORE:{accuracy}")
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
model = XGBRegressor()
model.fit(X_train, y_train)
# X_test = df_test.loc[:,"f0":"f284"]
# print(X_test.loc[:100, mi_scores > 0.0])
predictions = model.predict(X_test)
roc_auc_score(y_test, predictions)

0.7856667856667856

In [None]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

print("\nThe best hyperparameters are : ")
print(best_hyperparams)

SCORE:0.8096133096133097
SCORE:0.8124123124123125
SCORE:0.8033238033238034
SCORE:0.8036613036613037
SCORE:0.817903817903818
SCORE:0.8156918156918158
SCORE:0.7994772994772994
SCORE:0.8186838186838187
SCORE:0.8115598115598116
SCORE:0.8181118181118181
SCORE:0.8023108023108023
SCORE:0.8183998183998183
SCORE:0.797025297025297
SCORE:0.8159118159118159
SCORE:0.7928972928972928
SCORE:0.8170578170578171
SCORE:0.8152178152178151
SCORE:0.8185718185718186
SCORE:0.8115358115358116
SCORE:0.8184483184483182
SCORE:0.8176548176548176
SCORE:0.8175328175328174
SCORE:0.8173888173888173
SCORE:0.818057818057818
SCORE:0.8186843186843187
SCORE:0.8176448176448177
SCORE:0.8175698175698176
SCORE:0.8134543134543135
SCORE:0.8179458179458179
SCORE:0.8168908168908169
SCORE:0.8184013184013184
SCORE:0.8153648153648154
SCORE:0.8178728178728178
SCORE:0.8182198182198182
SCORE:0.816958816958817
SCORE:0.8102133102133102
SCORE:0.8176493176493176
SCORE:0.817012317012317
SCORE:0.8178378178378177
SCORE:0.8178438178438179
SCORE