In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from category_encoders import MEstimateEncoder
from sklearn.linear_model import ElasticNet, LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.mixture import GaussianMixture
from xgboost import XGBRegressor

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [2]:

def make_mi_scores(X, y):
    X = X.copy()
    # All discrete features should now have integer dtypes
    mi_scores = mutual_info_regression(X, y, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    # Reading File
    df = pd.read_csv(file)

    # Reducing Size by Optimizing Dtypes of columns
    df = reduce_mem_usage(df)

    # Converting Bool cols into integer
    bool_cols = []
    for i, col in enumerate(df.columns):
        if df[col].dtypes == bool:
            bool_cols.append(i)
    df.iloc[:, bool_cols] = df.iloc[:, bool_cols].astype(int)

    return df


In [4]:
train_data_path = "train.csv"
df_train = import_data(train_data_path)


Memory usage of dataframe is 2189.64 MB
Memory usage after optimization is: 505.45 MB
Decreased by 76.9%


In [5]:
#df_smaller = df_train.sample(random_state=1, n=10000, axis=0)
X_train, X_test, y_train, y_test = train_test_split(df_train.loc[:,"f0":"f284"], df_train["target"], test_size=0.2, random_state=0)

In [6]:
mi_scores = make_mi_scores(X_train.iloc[:50000], y_train.iloc[:50000])

In [7]:
X_train = X_train.loc[:, mi_scores > 0]
X_test = X_test.loc[:, mi_scores > 0]

In [32]:
xgb_params = dict(
    objective = "binary:logistic",
    eval_metric = "auc",
    max_depth=3,           # maximum depth of each tree - try 2 to 10
    learning_rate=0.015,    # effect of each tree - try 0.0001 to 0.1
    n_estimators=5000,     # number of trees (that is, boosting rounds) - try 1000 to 8000
    min_child_weight=75,    # minimum number of houses in a leaf - try 1 to 10
    colsample_bytree=0.5,  # fraction of features (columns) per tree - try 0.2 to 1.0
    subsample=0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
    gamma=8,
    reg_alpha=8,         # L1 regularization (like LASSO) - try 0.0 to 10.0
    reg_lambda=80,        # L2 regularization (like Ridge) - try 0.0 to 10.0
    num_parallel_tree=1,   # set > 1 for boosted random forests
    use_label_encoder=False,
)


In [27]:
space={
    "learning_rate": hp.loguniform("learning_rate", log(0.01), log(0.03)), # effect of each tree - try 0.0001 to 0.1
    "max_depth": hp.quniform("max_depth", 2, 5, 1), # maximum depth of each tree - try 2 to 10
    "gamma": hp.uniform ("gamma", 7, 9),
    "reg_alpha" : hp.uniform("reg_alpha", 5,7), # L1 regularization (like LASSO) - try 0.0 to 10.0
    "reg_lambda" : hp.uniform("reg_lambda", 70,100), # L2 regularization (like Ridge) - try 0.0 to 10.0
    "colsample_bytree" : hp.uniform("colsample_bytree", 0.2,0.6), # fraction of features (columns) per tree - try 0.2 to 1.0
    "min_child_weight" : hp.quniform("min_child_weight", 75, 90, 1), # minimum number of houses in a leaf - try 1 to 10
    "n_estimators": hp.uniform("n_estimators", 4500, 5500), #  number of trees (that is, boosting rounds) - try 1000 to 8000
    "subsample": hp.uniform("n_estimators", 0.2, 1),         # fraction of instances (rows) per tree - try 0.2 to 1.0
    "seed": 0,
    "use_label_encoder":False,
    }

def objective(space):
    model=XGBRegressor(
        objective="binary:logistic",
        eval_metric = "auc",
        max_depth=int(space["max_depth"]),           # maximum depth of each tree - try 2 to 10
        learning_rate=space["learning_rate"],  # effect of each tree - try 0.0001 to 0.1
        n_estimators=int(space["n_estimators"]),     # number of trees (that is, boosting rounds) - try 1000 to 8000
        min_child_weight=int(space["min_child_weight"]),    # minimum number of houses in a leaf - try 1 to 10
        colsample_bytree=int(space["colsample_bytree"]),  # fraction of features (columns) per tree - try 0.2 to 1.0
        subsample=0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
        reg_alpha=int(space["reg_alpha"]),         # L1 regularization (like LASSO) - try 0.0 to 10.0
        reg_lambda=int(space["reg_lambda"]),        # L2 regularization (like Ridge) - try 0.0 to 10.0
        gamma = space["gamma"],
        num_parallel_tree=1,   # set > 1 for boosted random forests
        use_label_encoder=False
    )
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    model.fit(
        X_train, y_train,
        eval_set=evaluation, 
        #eval_metric="auc",
        #early_stopping_rounds=10,
        verbose=False
    )
    
    accuracy = roc_auc_score(y_test, model.predict(X_test))
    print (f"SCORE:{accuracy}")
    return {'loss': -accuracy, 'status': STATUS_OK }

In [33]:
model = XGBRegressor(**xgb_params)
model.fit(X_train.iloc[:10000, :], y_train[:10000])
# X_test = df_test.loc[:,"f0":"f284"]
# print(X_test.loc[:100, mi_scores > 0.0])
predictions = model.predict(X_test)
roc_auc_score(y_test, predictions)

0.8479999683931316

In [70]:
Xx_train = X_train.copy()
Xx_test = X_test.copy()
features_to_log = [
    "f2",    "f9",    "f18",    "f29",    "f31",    "f32",    "f37",    "f42",    "f47",    "f50",    "f53",    "f55",    "f58",    "f63",
    "f64",    "f69",    "f71",    "f87",     "f109",    "f112",    "f118",    "f123",    "f128",    "f175",    "f176",    "f183",    "f185",
    "f200",    "f202",    "f211",    "f212",    "f231",    "f236",    "f238",    "f240",
]
for feature in set(Xx_train.columns).intersection(features_to_log):
    print(feature)
    Xx_train[feature] = X_train[feature].map(lambda x: log(x + 0.0001))
for feature in set(Xx_train.columns).intersection(features_to_log):
    Xx_test[feature] = X_test[feature].map(lambda x: log(x + 0.0001))

f32
f71
f231
f18
f37
f42
f202
f240
f185
f69
f58
f211
f9
f53
f212
f238


In [77]:
lmodel = LinearRegression()  # coef_, feature_names_in_
lmodel.fit(Xx_train, y_train)
predictions = lmodel.predict(Xx_test.loc[:,mi_scores > 0])
print(roc_auc_score(y_test, predictions))

logmodel = LogisticRegression(max_iter=10000)
logmodel.fit(X_train, y_train)
predictions = logmodel.predict(X_test.loc[:,mi_scores > 0])
print(roc_auc_score(y_test, predictions))

0.8335109435337977


In [127]:
X_train.shape
X_test.shape

(200000, 148)

In [145]:
#predictions1 = model.predict(X_test)
#predictions2 = lmodel.predict(X_test)
# X_test['a'] = predictions1
# X_test['b'] = predictions2

model3 = XGBRegressor(
    objective="binary:logistic",
    eval_metric = "auc",
)
model3.fit(X_test.loc[:,['a','b']].iloc[:170000,:], y_test[:170000])
print(roc_auc_score(y_test[170000:], model3.predict(X_test.loc[:,['a','b']].iloc[170000:,:])))


0.8465985399233031


In [104]:
predictions2 = map(lambda x: min(round(x), 1), predictions)
print(roc_auc_score(y_test, list(map(lambda x: min(round(x), 1), predictions))))

0.7651669199991975


In [95]:
print([i for i in map(lambda x: x**2, [1,2,3])])

[1, 4, 9]


In [76]:
a = sorted(((coef, i)for i, coef in enumerate(model.feature_importances_)), reverse=True)
print(a)

[(0.58445877, 16), (0.06848967, 94), (0.03419731, 40), (0.02287083, 35), (0.018578827, 72), (0.0151742315, 82), (0.011422357, 46), (0.007808696, 5), (0.007405527, 52), (0.0073559163, 127), (0.007032689, 9), (0.0060448926, 45), (0.0059362054, 71), (0.005806316, 2), (0.005741549, 101), (0.005358314, 14), (0.005191039, 15), (0.0050935843, 43), (0.0050609796, 42), (0.0050144284, 107), (0.0047325264, 34), (0.004626298, 111), (0.0044322186, 54), (0.0043406705, 133), (0.004033894, 1), (0.0038640331, 65), (0.0037717612, 135), (0.0035309475, 56), (0.003530528, 13), (0.003343811, 31), (0.0033052503, 59), (0.0032556343, 103), (0.0030365034, 80), (0.0029186457, 3), (0.0027143513, 11), (0.0025116766, 75), (0.0024716018, 51), (0.0024282, 44), (0.0024144377, 74), (0.0023313363, 78), (0.0023146966, 113), (0.002263072, 67), (0.0022607925, 81), (0.0020896245, 55), (0.0019593267, 32), (0.0018448781, 121), (0.0016720839, 29), (0.0016541507, 10), (0.0016391496, 76), (0.0016262864, 4), (0.0015932797, 41), (

In [67]:
a = sorted(((coef, i)for i, coef in enumerate(lmodel.coef_)), reverse=True)
print(a)

[(0.4895005, 72), (0.28519273, 2), (0.23762928, 15), (0.23277172, 14), (0.23074074, 45), (0.20273374, 65), (0.17734738, 56), (0.17332348, 82), (0.16113313, 13), (0.15722449, 59), (0.1415964, 44), (0.13130072, 29), (0.1020079, 64), (0.09302723, 10), (0.091217436, 62), (0.08320858, 54), (0.08135526, 32), (0.06443855, 106), (0.06384713, 87), (0.06078648, 121), (0.06039912, 124), (0.056001708, 51), (0.046479136, 126), (0.043971904, 55), (0.04039809, 63), (0.03936003, 8), (0.037410572, 18), (0.03605589, 33), (0.03318128, 96), (0.031029081, 120), (0.02978688, 86), (0.028920736, 0), (0.028785594, 66), (0.028783023, 30), (0.027883528, 122), (0.027050273, 93), (0.025730625, 12), (0.023955341, 25), (0.02150826, 50), (0.021487061, 23), (0.02039779, 74), (0.01853353, 53), (0.01827091, 98), (0.01723714, 60), (0.01657787, 116), (0.015267471, 22), (0.014667569, 73), (0.009432383, 83), (0.009048467, 95), (0.008044459, 28), (0.007877145, 38), (0.0076247957, 88), (0.0067775557, 17), (0.0057829116, 108),

In [36]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 20,
                        trials = trials)

print("\nThe best hyperparameters are : ")
print(best_hyperparams)

SCORE:0.8060374201662612
SCORE:0.8057820007143253
SCORE:0.8059321807888494
SCORE:0.8059949513744904
SCORE:0.8059178112077983
SCORE:0.8060443899313222
SCORE:0.8062959749249723
SCORE:0.805849422410017
SCORE:0.8058248517569222
SCORE:0.8062248943713741
SCORE:0.8002816464286098
SCORE:0.8061633181383141
SCORE:0.8053581452484161
SCORE:0.8055882694456915
SCORE:0.8064665466935608
SCORE:0.8057249404410238
SCORE:0.8058866901405487
SCORE:0.8049977880697202
 90%|█████████ | 18/20 [17:59<01:59, 59.95s/trial, best loss: -0.8064665466935608]


KeyboardInterrupt: 