# Install libraries

# Import modules

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import metrics
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer
import gc
import warnings
# Ignore every FutureWarning raised from here on (presumably to keep cell output readable).
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.model_selection import *


# Configurations

In [None]:
# Central experiment settings read by the cells below.
CONFIG = dict(
    n_folds=4,  # number of stratified splits
    fold=2,  # fold id held out for validation
    seed=69,  # random_state for the splitter
    drop_cols=[],  # columns removed from train/valid/test frames
    target="class",  # label column name
    presets="optimize_for_deployment",  # passed to TabularPredictor.fit
    binary_threshold=0.5,
)

# Load data

In [None]:
# Directory holding the competition CSV files.
DATA_PATH = '/home/nischay/auto4/data'

# Build the three CSV locations from their base names in one pass.
train_path, test_path, sub_path = (
    f'{DATA_PATH}/{stem}.csv'
    for stem in ('train', 'test', 'sample_submission')
)

In [None]:
# Read the training CSV into a DataFrame.
df_train = pd.read_csv(filepath_or_buffer=train_path)
# Last expression of the cell: the notebook shows (n_rows, n_columns).
df_train.shape

# Preprocessing

In [None]:
# Map the raw class letters found in the target column to integer labels.
dict_class2label = {"e": 0, "p": 1}
# Inverse mapping (integer label -> class letter). It is derived from the
# forward map so the two can never disagree; the hand-written version had its
# second entry swapped ({"p": 1} instead of {1: "p"}).
dict_label2class = {label: cls for cls, label in dict_class2label.items()}
# Replace each class letter in the target column with its integer label.
# An unknown letter raises KeyError because the lookup is a plain dict index.
target_col = CONFIG["target"]
df_train[target_col] = df_train[target_col].apply(
    lambda cls: dict_class2label[cls]
)

# Split data with StratifiedKFold

In [None]:
# Stratified splitter: each fold keeps the target's class proportions.
kf = StratifiedKFold(
    n_splits=CONFIG["n_folds"],
    shuffle=True,
    random_state=CONFIG["seed"],
)

# Record, for every row, the id of the fold in which it is a validation row.
target_values = df_train[CONFIG["target"]]
for fold, (_, val_) in enumerate(kf.split(df_train, target_values)):
    df_train.loc[val_, "kfold"] = fold

In [None]:
def prepare_datasets(df, fold, drop_columns=None):
    """Split ``df`` into AutoGluon train/validation datasets for one fold.

    Rows whose ``kfold`` value equals ``fold`` form the validation set; every
    other row goes to the training set. Both parts get a fresh 0..n-1 index.

    Args:
        df: DataFrame carrying a ``kfold`` column with fold assignments.
        fold: Fold id to hold out for validation.
        drop_columns: Column name, or iterable of column names, to remove from
            both splits. ``None`` (the default) removes nothing extra.

    Returns:
        ``(train_data, valid_data)`` as ``TabularDataset`` objects. Neither
        contains the ``kfold`` helper column.
    """
    # None replaces the former mutable ``[]`` default; normalise to a list.
    if drop_columns is None:
        extra_columns = []
    elif isinstance(drop_columns, str):
        extra_columns = [drop_columns]
    else:
        extra_columns = list(drop_columns)

    # ``kfold`` is split bookkeeping, not a predictor. Left in place it would
    # be handed to the model as a feature (AutoGluon is expected to treat every
    # non-label column as one), and the validation rows would all carry a fold
    # id never seen in training. dict.fromkeys de-duplicates while keeping
    # order, so a caller that already lists "kfold" does not trigger a KeyError.
    columns_to_drop = list(dict.fromkeys([*extra_columns, "kfold"]))

    in_valid_fold = df["kfold"] == fold
    _df_train = df[~in_valid_fold].reset_index(drop=True)
    _df_valid = df[in_valid_fold].reset_index(drop=True)

    _df_train = _df_train.drop(columns=columns_to_drop)
    _df_valid = _df_valid.drop(columns=columns_to_drop)

    train_data = TabularDataset(_df_train)
    valid_data = TabularDataset(_df_valid)
    return train_data, valid_data

In [None]:
# Build the train/validation datasets for the configured hold-out fold.
train_data, valid_data = prepare_datasets(
    df_train,
    fold=CONFIG["fold"],
    drop_columns=CONFIG["drop_cols"],
)

# Custom metrics (MCC)

In [None]:
# Wrap scikit-learn's Matthews correlation coefficient as an AutoGluon scorer.
# The scorer is declared with a best value of 1 and "higher is better".
ag_mcc_scorer = make_scorer(
    name='mcc',
    score_func=metrics.matthews_corrcoef,
    optimum=1,
    greater_is_better=True,
)

In [None]:
# Per-model-family settings handed to TabularPredictor.fit.
# Keys are AutoGluon model codes; an empty dict leaves that family's settings unset.
hyperparameters = {
    "NN_TORCH": {},
    "GBM": {
        "extra_trees": True,
        "ag_args": {"name_suffix": "XT"},
    },
    "CAT": {"max_depth": 12},
    "XGB": {
        "grow_policy": "depthwise",
        "learning_rate": 0.0626750932713805277,
        "n_estimators": 240,
        "max_depth": 20,
        "reg_lambda": 0.27,
        "min_child_weight": 4.952710346179816,
        "subsample": 0.953,
        "colsample_bytree": 0.7539110455959812,
        "tree_method": "hist",
        "enable_categorical": True,
        "gamma": 0,
    },
    "FASTAI": {},
    "RF": {
        "criterion": "gini",
        "ag_args": {
            "name_suffix": "Gini",
            "problem_types": ["binary", "multiclass"],
        },
    },
}


# Training model

In [None]:
# Binary-classification predictor; its artifacts are saved under the given path.
predictor = TabularPredictor(label=CONFIG["target"], problem_type="binary", path="exp1_f3/predictor")

In [None]:
%%time
# Collect the fit options first so the call itself stays short.
fit_options = {
    "tuning_data": valid_data,  # held-out fold passed as tuning data
    "save_space": True,
    "hyperparameters": hyperparameters,
    "presets": CONFIG["presets"],
    "use_bag_holdout": True,
    "ag_args_fit": {'num_gpus': 3},
}
predictor.fit(train_data, **fit_options)

# Model summary (Leaderboard)

In [None]:
# Score every trained model on the validation fold, adding MCC as an extra column.
predictor.leaderboard(
    data=valid_data,
    extra_metrics=[ag_mcc_scorer],
    silent=True,
)

In [None]:
# Bare expression: in a notebook this displays the predictor object's repr.
predictor

# Inference

In [None]:
# Load the test CSV and remove the same configured columns as for training.
df_test = pd.read_csv(test_path).drop(columns=CONFIG["drop_cols"])
# Wrap the frame for AutoGluon inference.
test_data = TabularDataset(df_test)

In [None]:
# Add a constant `kfold` column, set to the configured validation fold id, to
# the test frame. NOTE(review): presumably this mirrors the `kfold` column that
# df_train carries, so test rows expose the same columns — confirm.
test_data['kfold'] = CONFIG["fold"]

In [None]:
# predict_proba returns one column per class; take the second column
# (positional index 1) as a NumPy array.
class_proba = predictor.predict_proba(test_data)
y_pred = class_proba.iloc[:, 1].values
# Store the raw probabilities next to the test rows.
df_test["pred"] = y_pred


In [None]:
# Persist id + raw probability, without the DataFrame index.
raw_preds = df_test[["id", "pred"]]
raw_preds.to_csv("exp1_f3/raw_preds.csv", index=False)


In [None]:
# Bare expression: displays the id and prediction columns for a quick visual check.
df_test[['id','pred']]