## Submission Generation

In [1]:
import numpy as np
import pandas as pd

In [2]:
class paths:
    """Locations of the input CSV files, relative to the working directory."""

    TRAIN, TEST = './data/train.csv', './data/test.csv'

In [3]:
from engineering import full_feature_engineering

# Read the training and test tables (in that order) from the paths in `paths`.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (paths.TRAIN, paths.TEST)
)

In [4]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Per-fold blend weight given to the random forest (from EnsembleOptimizing);
# one entry per cross-validation fold.
rf_weights = [0.44, 0.20, 0.14, 0.24, 0.36]

gkf = GroupKFold(n_splits=5)

# Columns that appear at the head of both models' drop lists.
shared_drop_cols = ['Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum']

# Random forest: columns handed to the feature-engineering helper as
# `drop_cols` (presumably removed from its output -- confirm in `engineering`).
rf_cols = [*shared_drop_cols, 'pca_37', 'pca_7']
rf_df = full_feature_engineering(train_df, drop_cols=rf_cols)
# Same helper with `test_df` supplied; the result is used later as the
# random forest's test-time input.
rf_test = full_feature_engineering(train_df, test_df=test_df, drop_cols=rf_cols)

# XGBoost: a much longer drop list, same two calls as above.
xgb_cols = [
    *shared_drop_cols,
    'Log_Outside_X_Index', 'Log_X_Perimeter', 'Log_Y_Perimeter', 'Log_Width',
    'Log_Lum', 'Log_Height', 'Height',
    'pca_3', 'pca_28', 'pca_25', 'pca_30', 'pca_33', 'pca_36', 'pca_31',
    'pca_37', 'pca_35', 'pca_26', 'pca_23', 'pca_20', 'pca_19', 'pca_17',
    'pca_16', 'pca_9', 'pca_2', 'pca_1',
    'LogOfAreas', 'Edges_X_Index',
    'pca_13', 'pca_29', 'pca_22', 'pca_14', 'pca_32', 'pca_34', 'pca_18',
    'pca_27', 'pca_15', 'pca_11',
    'Outside_Global_Index',
    'pca_21', 'pca_10', 'pca_8', 'pca_7',
    'Log_X_Index', 'pca_24', 'Log_Y_Index',
]
xgb_df = full_feature_engineering(train_df, drop_cols=xgb_cols)
xgb_test = full_feature_engineering(train_df, test_df=test_df, drop_cols=xgb_cols)

# `real_cols` is every target except 'No Defect'; `y_cols` is the full target
# list with 'No Defect' in first position.
real_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
             'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y_cols = ['No Defect', *real_cols]

# Targets come from the random-forest frame; the same `y` is used for both models.
y = rf_df[y_cols]
# Row ids, passed to the splitter as group labels.
ids = train_df['id']

# Everything that is neither a target nor the id is a model input.
non_feature_cols = [*y_cols, 'id']
X_rf = rf_df.drop(columns=non_feature_cols)
X_xgb = xgb_df.drop(columns=non_feature_cols)

# Keyword arguments unpacked into RandomForestClassifier(**rf_params).
rf_params = dict(
    n_estimators=5000,
    max_samples=0.20,
    max_features=15,
    n_jobs=-1,
    random_state=0,
    class_weight='balanced',
)

# Keyword arguments unpacked into XGBClassifier(**xgb_params). Kept as a
# literal because 'lambda' is a Python keyword and cannot be spelled as a
# dict(...) keyword argument.
xgb_params = {
    'n_estimators': 1500,
    'n_jobs': -1,
    'early_stopping_rounds': 9,  # needs an eval_set at fit time
    'lambda': 1e-7,
    'learning_rate': 0.0123,
    'alpha': 3.059,
    'subsample': 0.712,
}

def _to_logits(probs, eps=0.00001):
    """Convert probabilities to log-odds, padded by `eps` so 0 and 1 stay finite."""
    return np.log((eps + probs) / (1 + eps - probs))


# The test feature matrices do not change between folds, so build them once.
rf_test_X = rf_test.drop(columns='id')
xgb_test_X = xgb_test.drop(columns='id')

# Running sum over folds of the blended test-set log-odds; averaged after the loop.
total_logits = None
for fold, (train_index, valid_index) in enumerate(gkf.split(X_rf, y, ids)):
    # The splitter yields positional row indices, so rows are selected with
    # .iloc; label-based .loc is only equivalent on a default RangeIndex.
    train_X_rf = X_rf.iloc[train_index]

    train_X_xgb = X_xgb.iloc[train_index]
    valid_X_xgb = X_xgb.iloc[valid_index]

    train_y = y.iloc[train_index]
    valid_y = y.iloc[valid_index]

    # RF
    rf_model = RandomForestClassifier(**rf_params)
    rf_model.fit(train_X_rf, train_y)

    # For multi-output targets predict_proba returns one array per target;
    # stacking them and taking index 1 on the last axis, then transposing,
    # gives one row per test sample and one column per target.
    # NOTE(review): assumes every target has both classes in each training fold.
    rf_test_preds = np.array(rf_model.predict_proba(rf_test_X))[:, :, 1].T
    rf_logits = _to_logits(rf_test_preds)

    # XGB
    # The held-out fold is passed as eval_set, which the early-stopping
    # setting in xgb_params requires.
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(train_X_xgb, train_y, eval_set=[(valid_X_xgb, valid_y)], verbose=0)

    xgb_test_preds = np.array(xgb_model.predict_proba(xgb_test_X))
    xgb_logits = _to_logits(xgb_test_preds)

    # Averaging logits according to weight
    rf_weight = rf_weights[fold]
    logits = rf_weight * rf_logits + (1 - rf_weight) * xgb_logits
    if total_logits is None:
        total_logits = logits
    else:
        total_logits += logits

# Average the summed log-odds over the number of folds the splitter produced
# (read from the splitter so it cannot drift from n_splits), then map the
# result back to probabilities with the logistic function.
n_folds = gkf.get_n_splits()
exp_logits = np.exp(total_logits / n_folds)
predictions = exp_logits / (1 + exp_logits)
# Rescale each row so that the scores over all target columns sum to 1.
predictions /= predictions.sum(axis=1, keepdims=True)

# Column 0 corresponds to the first entry of y_cols and is left out of the
# submission; the remaining columns are written under their target names.
# NOTE(review): assumes prediction rows are in the same order as test_df.
submission_df = pd.DataFrame(test_df['id'], columns=['id'])
submission_df[y_cols[1:]] = predictions[:, 1:]
submission_df.to_csv('./data/submission.csv', index=False)