# Feature Engineering

This notebook collects code snippets for common feature engineering techniques.

### Count Encoding

In [1]:
import numpy as np
import pandas as pd

def getCountVar(compute_df, count_df, var_name, count_var="v1"):
    """
    Count-encode a categorical column.

    compute_df : Data frame for which the count encoding should be done
    count_df : Data frame from which the counts should be taken
    var_name : categorical variable (column name) for count encoding
    count_var : kept for backward compatibility only; it is no longer read.
                Rows are counted directly, so the result does not depend on
                a dummy column existing or being free of missing values.

    Returns a list with one entry per row of compute_df (in row order): the
    number of rows in count_df sharing that row's var_name value, or -1 when
    the value does not occur in count_df.
    """
    # size() counts rows per group, unlike count() which skips NaN in the counted column.
    grouped_df = count_df.groupby(var_name).size().reset_index(name="var_count")
    # Merge only the key column so an existing "var_count" column in compute_df
    # cannot collide with the one added here, and unrelated columns are not copied.
    merged_df = pd.merge(compute_df[[var_name]], grouped_df, how="left", on=var_name)
    # Fill the sentinel on the encoded column only, leaving every other column untouched.
    return list(merged_df["var_count"].fillna(-1))

### Target Encoding

In [4]:
from sklearn import model_selection

def getDVEncodeVar(compute_df, target_df, var_name, target_var="RESPONDERS", min_cutoff=1):
    """
    Target-encode one or more categorical columns with the mean of the target.

    compute_df : Data frame for which the encoding should be produced
    target_df : Data frame from which the per-category target means are taken
    var_name : column name, or list of column names, to group by
    target_var : name of the target column in target_df
    min_cutoff : accepted for interface compatibility; currently not used

    Returns a list with one entry per row of compute_df (in row order): the
    mean of target_var over the rows of target_df with the same key, or -1
    when the key does not occur in target_df (or its mean is missing).
    """
    key_cols = var_name if isinstance(var_name, list) else [var_name]
    grouped_df = target_df.groupby(key_cols)[target_var].mean().reset_index(name="mean_value")
    # Merge only the key columns so an existing "mean_value" column in compute_df
    # cannot collide with the one added here, and unrelated columns are not copied.
    merged_df = pd.merge(compute_df[key_cols], grouped_df, how="left", on=key_cols)
    # Fill the sentinel on the encoded column only, leaving every other column untouched.
    return list(merged_df["mean_value"].fillna(-1))


def do_target_encode():
    """
    Add K-fold target-encoded columns to the global train_df and test_df.

    For each of "ZIP_CODE_FINAL" and "DESIGNATION_FINAL" a column named
    "<col>_enc" is written to both frames:

    * train_df: each row is encoded using "RESPONDERS" means computed on the
      other folds only (the row's own fold is held out), so a row's own
      target value does not enter its encoding.
    * test_df: the encoding is the average of the per-fold encodings.

    Relies on train_df and test_df existing at module level and modifies
    them in place. Prints summary statistics of each new column.
    """
    n_splits = 5
    kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=2018)
    for col in ["ZIP_CODE_FINAL", "DESIGNATION_FINAL"]:
        # Only the category column and the target are needed; the slice is the
        # same for every fold, so take it once per column.
        new_train_df = train_df[[col, "RESPONDERS"]]
        train_enc_values = np.zeros(train_df.shape[0])
        test_enc_values = np.zeros(test_df.shape[0])
        for dev_index, val_index in kf.split(train_df):
            dev_X, val_X = new_train_df.iloc[dev_index], new_train_df.iloc[val_index]
            # Held-out rows get means learned from the remaining folds.
            train_enc_values[val_index] = np.array(getDVEncodeVar(val_X[[col]], dev_X, col))
            # NOTE: categories missing from a fold's dev split come back as -1
            # from getDVEncodeVar, and that sentinel is averaged in here as-is.
            test_enc_values += np.array(getDVEncodeVar(test_df[[col]], dev_X, col))
        # Average over the same number of folds that KFold was configured with.
        test_enc_values /= n_splits
        train_df[col + "_enc"] = train_enc_values
        test_df[col + "_enc"] = test_enc_values
        # print(...) with one argument behaves the same on Python 2 and Python 3.
        print(train_df[col + "_enc"].describe())
        print(test_df[col + "_enc"].describe())

### Interaction features 

[XGBoost Feature Interactions and Importance](https://github.com/Far0n/xgbfi) by Faron