In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import scale

from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVR
import lightgbm as lgb
from lightgbm import LGBMRegressor

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier

# Misc
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import RobustScaler

In [3]:
# Input CSV locations, relative to the notebook's working directory.
train_file = 'train_set.csv'
test_file = 'test_set.csv'

# Training frame: 'y' is the target; 'ID' and 'y' are removed from the features.
data = pd.read_csv(train_file)
train_x = data.drop(columns=['ID', 'y'])
train_y = data['y']

# The test frame keeps its 'ID' column; it is needed later for the output file.
test_x = pd.read_csv(test_file)

In [4]:
def label_encoding(df_data):
    """Return a copy of ``df_data`` with categorical columns replaced by integer codes.

    Column selection:
      * if the frame has any ``category``-dtype columns, only those are encoded;
      * otherwise every non-numeric column is converted to ``category`` and encoded.

    Codes come from pandas' ``cat.codes``: categories are derived from the
    values present in *this* frame, and missing values are encoded as ``-1``.
    Because categories are inferred per call, encoding two frames separately
    can assign different codes to the same label.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the selected columns integer-encoded.

    Raises
    ------
    Exception
        With message ``'Label encoding error'`` if anything fails; the
        underlying error is attached as ``__cause__``.
    """
    try:
        data = df_data.copy()
        cat_cols = data.select_dtypes(['category']).columns
        if len(cat_cols) == 0:
            cat_cols = data.select_dtypes(exclude=['number']).columns
        # Encode column by column: this is a no-op for an empty selection and
        # always replaces the column with its integer codes.
        for col in cat_cols:
            data[col] = data[col].astype('category').cat.codes
        return data
    except Exception as err:
        # Keep the original exception type/message for callers, but chain the
        # root cause and do not trap KeyboardInterrupt/SystemExit.
        raise Exception('Label encoding error') from err

def onehot_encoding(df_data):
    """Return a one-hot encoded version of ``df_data`` via ``pd.get_dummies``.

    The input frame is copied first, so the caller's frame is left untouched.
    """
    return pd.get_dummies(df_data.copy())

In [5]:
# Integer-encode the categorical training features.
df_train_x = label_encoding(train_x)

# Hold-out split with a fixed seed.
# NOTE(review): test_size=0.75 leaves only 25% of the rows for fitting and
# evaluates on the other 75% — confirm this is intended (0.25 is more usual).
x_train, x_test, y_train, y_test = train_test_split(
    df_train_x,
    train_y,
    test_size=0.75,
    random_state=42,
)

In [7]:
# Support Vector Regressor
# svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=500,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

# LightGBM regressor configured with a binary objective / log-loss metric.
# NOTE(review): with objective="binary" the predictions look like
# probabilities in [0, 1] (see the sample output further down) — confirm.
lightgbm = LGBMRegressor(max_bin=512,
                         learning_rate=0.01,
                         n_estimators=6000,
                         boosting_type="gbdt",
                         objective="binary",
                         metric="binary_logloss",
                         num_leaves=10,
                         verbose= -1,
                         min_data=100,
                         bagging_fraction=0.8,
                         bagging_freq=4, 
                         bagging_seed=8,
                         feature_fraction=0.2,
                         feature_fraction_seed=8,
                         boost_from_average=True,
                         random_state=42)


# Stack the models above; the LightGBM model doubles as the meta-learner.
# random_state seeds the cross-validation shuffling used to build the
# meta-features, so the stack is reproducible like the seeded base models.
stack_gen = StackingCVClassifier(classifiers=(rf, lightgbm),
                                meta_classifier=lightgbm,
                                random_state=42)

In [8]:
# svr_model = svr.fit(x_train, y_train)
# Fit each estimator on the training part of the split; every `fit` return
# value is kept under a `*_model` name for scoring and prediction.
rf_model = rf.fit(X=x_train, y=y_train)
lightgbm_model = lightgbm.fit(X=x_train, y=y_train)
stack_gen_model = stack_gen.fit(X=x_train, y=y_train)

In [9]:
# svr_score = roc_auc_score(y_test, svr_model.predict(x_test))
# ROC AUC of each fitted model on the held-out part of the split.
rf_score = roc_auc_score(y_true=y_test, y_score=rf_model.predict(x_test))
lightgbm_score = roc_auc_score(y_true=y_test, y_score=lightgbm_model.predict(x_test))
stack_score = roc_auc_score(y_true=y_test, y_score=stack_gen_model.predict(x_test))

In [10]:
# One row per model, single 'auc_score' column (row order: rf, lightgbm, stack).
df_score = pd.DataFrame(
    {
        # 'svr': svr_score,
        'rf': rf_score,
        'lightgbm': lightgbm_score,
        'stack': stack_score,
    },
    index=['auc_score'],
).T

# Blend weight of each model = its AUC divided by the sum of all AUCs.
df_score_weight = df_score.assign(
    weight=df_score['auc_score'] / df_score['auc_score'].sum()
)
df_score_weight

Unnamed: 0,auc_score,weight
rf,0.91917,0.336839
lightgbm,0.901612,0.330405
stack,0.908027,0.332756


In [11]:
# Blend models in order to make the final predictions more robust to overfitting
def blended_predictions(weights, X):
    """Return the weighted sum of the three fitted models' predictions on ``X``.

    ``weights`` is indexed positionally: ``weights[0]`` scales ``rf_model``,
    ``weights[1]`` scales ``lightgbm_model`` and ``weights[2]`` scales
    ``stack_gen_model``. The models are read from module scope.
    """
    rf_part = weights[0] * rf_model.predict(X)
    lightgbm_part = weights[1] * lightgbm_model.predict(X)
    stack_part = weights[2] * stack_gen_model.predict(X)
    return rf_part + lightgbm_part + stack_part

In [12]:
# Encode the test features with the categories of the *training* frame.
# Label-encoding the test frame on its own derives codes from the labels
# present in the test file, so the same label could get a different code than
# the models were fitted on whenever the two files differ in their label sets.
test_features = test_x.drop('ID', axis=1)
X = test_features.copy()
for col in train_x.select_dtypes(exclude=['number']).columns:
    train_categories = train_x[col].astype('category').cat.categories
    # Labels unseen in training (and missing values) are encoded as -1.
    X[col] = pd.Categorical(test_features[col], categories=train_categories).codes

# Present the columns in the same order as the training features.
X = X[train_x.columns]

# Weights are ordered rf, lightgbm, stack — the row order of df_score_weight.
y_prediction = blended_predictions(df_score_weight['weight'].values, X)

In [13]:
# Pair every test ID with its blended prediction and tabulate the result.
id_value = test_x['ID']
data_tuples = [(row_id, pred) for row_id, pred in zip(id_value, y_prediction)]
df_output = pd.DataFrame(data=data_tuples, columns=['ID', 'pred'])

In [14]:
# Preview the first five rows of the output table.
df_output.head(n=5)

Unnamed: 0,ID,pred
0,25318,0.060961
1,25319,0.006117
2,25320,0.005795
3,25321,0.724066
4,25322,0.029099


In [16]:
# Write only the 'ID' and 'pred' columns; the frame's positional row index is
# redundant next to 'ID' and would otherwise appear as an unnamed first column.
df_output.to_csv('result.csv', index=False)