In [5]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from mlxtend.classifier import StackingCVClassifier

# sklearn
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, scale
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import roc_auc_score

In [2]:
train_file = 'train_set.csv'
test_file = 'test_set.csv'

data = pd.read_csv(train_file)
train_x, train_y = data.drop(['ID', 'y'], axis=1), data['y']
test_x = pd.read_csv(test_file)

In [3]:
def label_encoding(df_data, is_scale=False):
    try:
        data = df_data.copy()
        cat_cols = data.select_dtypes(['category']).columns
        if len(cat_cols) == 0:
            cat_cols = data.select_dtypes(exclude=['number']).columns
            data[cat_cols] = data[cat_cols].astype('category')
        data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
        if is_scale:
            # standarizing 
            data = scale(data)
        return data
    except:
        raise Exception('Label encoding error')

def onehot_encoding(df_data):
    data = df_data.copy()
    return pd.get_dummies(data)

In [4]:
# setup model types

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=1000,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

lightgbm = LGBMRegressor(max_bin=512,
                         learning_rate=0.01,
                         n_estimators=6000,
                         boosting_type="gbdt",
                         objective="binary",
                         metric="binary_logloss",
                         num_leaves=10,
                         verbose= -1,
                         min_data=100,
                         bagging_fraction=0.8,
                         bagging_freq=4, 
                         bagging_seed=8,
                         feature_fraction=0.2,
                         feature_fraction_seed=8,
                         boost_from_average=True,
                         random_state=42)


# Stack up all the models above, optimized using xgboost
stack_gen = StackingCVRegressor(regressors=(rf, lightgbm),
                                meta_regressor=rf)

In [6]:
start_t = time.perf_counter()
# training/testing split
df_train_x = label_encoding(train_x, is_scale=True)
x_train, x_test, y_train, y_test = train_test_split(df_train_x, train_y, test_size=0.75, random_state=42)
end_t = time.perf_counter()
print('running time: {} ms'.format((end_t-start_t)*1000) )

running time: 68.05323204025626 ms


  # This is added back by InteractiveShellApp.init_path()


In [7]:
start_t = time.perf_counter()
# fit model
rf_model = rf.fit(x_train, y_train)
lightgbm_model = lightgbm.fit(x_train, y_train)
stack_gen_model = stack_gen.fit(x_train, y_train)
end_t = time.perf_counter()
print('running time: {} ms'.format((end_t-start_t)*1000) )

running time: 78230.85947101936 ms


In [8]:
start_t = time.perf_counter()
# auc score on each model
rf_score = roc_auc_score(y_test, rf_model.predict(x_test))
lightgbm_score = roc_auc_score(y_test, lightgbm_model.predict(x_test))
stack_score = roc_auc_score(y_test, stack_gen_model.predict(x_test))
end_t = time.perf_counter()
print('running time: {} ms'.format((end_t-start_t)*1000) )

running time: 3809.59408194758 ms


In [9]:
start_t = time.perf_counter()
# cross validation score
kf = KFold(n_splits=10, random_state=42, shuffle=True)
rf_cross_score = np.mean(cross_val_score(rf, x_train, y_train, cv=kf, scoring='roc_auc'))
lightgbm_cross_score = np.mean(cross_val_score(lightgbm, x_train, y_train, cv=kf, scoring='roc_auc'))
stack_cross_score = np.mean(cross_val_score(stack_gen, x_train, y_train, cv=kf, scoring='roc_auc'))
end_t = time.perf_counter()
print('running time: {} ms'.format((end_t-start_t)*1000) )

running time: 690803.9143639617 ms


In [11]:
df_score = pd.DataFrame({
                'rf': rf_score,
                'lightgbm': lightgbm_score, 
                'stack': stack_score}, 
                index=['auc_score']).transpose()

df_score_weight = df_score.copy()
df_score_weight['auc_weight'] = df_score['auc_score'] / np.sum(df_score['auc_score'])
cv_scores = np.array([rf_cross_score, lightgbm_cross_score, stack_cross_score])
df_score_weight['cv_weight'] = cv_scores / np.sum(cv_scores)
df_score_weight['total_weight'] = 0.5 * (df_score_weight['auc_weight'] + df_score_weight['cv_weight'])
df_score_weight

Unnamed: 0,auc_score,auc_weight,cv_weight,total_weight
rf,0.919789,0.337173,0.335848,0.336511
lightgbm,0.902198,0.330725,0.333103,0.331914
stack,0.905956,0.332102,0.331049,0.331576


In [12]:
# Blend models in order to make the final predictions more robust to overfitting
def blended_predictions(weights, X):
    return ((weights[0] * rf_model.predict(X)) + \
            (weights[1] * lightgbm_model.predict(X)) + \
            (weights[2] * stack_gen_model.predict(X)) 
           )

In [18]:
X = label_encoding(test_x.drop('ID', axis=1), is_scale=True)
y_prediction = blended_predictions(df_score_weight['total_weight'].values, X)

  # This is added back by InteractiveShellApp.init_path()


In [19]:
id_value = test_x['ID']
data_tuples = list(zip(id_value, y_prediction))
df_output = pd.DataFrame(data_tuples, columns=['ID', 'pred'])

In [23]:
df_output.head()

Unnamed: 0,ID,pred
0,25318,0.009446
1,25319,0.008684
2,25320,0.0064
3,25321,0.641019
4,25322,0.016958


In [22]:
df_output.to_csv('result_2.csv')