## Using group multilabel stratified k-fold CV

In [None]:
!ls ../input/

In [None]:
!python ../input/gq-inf-scripts-gmkf/tf_bert_large_inf.py

In [None]:
!python ../input/gq-inf-scripts-gmkf/tf_bert_base_inf.py

In [None]:
!python ../input/gq-inf-scripts-gmkf/pytorch_feat_inf.py

In [None]:
# !python ../input/gq-inf-scripts/pytorch_albert_inf.py

In [None]:
# !python ../input/gq-inf-scripts/pytorch_bert_inf.py

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, rankdata
from scipy.optimize import minimize
from collections import defaultdict

# Precomputed fold split; kf[i][1] is used below to select the validation rows of fold i.
# NOTE: allow_pickle=True lets np.load unpickle arbitrary Python objects, which can run
# code on load -- only safe if this .npy file comes from a trusted source.
kf = np.load("../input/split-dataset/GroupMultilabelStratifiedKfold.npy", allow_pickle=True)

In [None]:
# Training data (ground-truth labels) and the sample submission, whose columns
# are later used as the list of submission/target columns.
train_df = pd.read_csv('../input/google-quest-challenge/train.csv')
sub_df = pd.read_csv('../input/google-quest-challenge/sample_submission.csv')

In [None]:
# Models taking part in the ensemble. Each name is also used as a directory that is
# read for "oof{fold}.csv" and "submission{fold}.csv" files further down.
# Previous, larger model set kept for reference:
# model_names = ["albert", "bert_base", "tf_bert_large", "feat_multi", "feat_nn",
#                "tf_bert_base_epoch1", "tf_bert_base_epoch2", "tf_bert_base_epoch3"]
model_names = ["feat_multi", "feat_nn", "tf_bert_large",
               "tf_bert_base_epoch1", "tf_bert_base_epoch2", "tf_bert_base_epoch3"]

In [None]:
def compute_spearmanr(trues, preds):
    """Return the mean column-wise Spearman correlation of two 2-D arrays.

    Columns of ``trues`` and ``preds`` are paired positionally. Gaussian noise
    with standard deviation 1e-7 is added to every prediction column before
    the correlation is computed (presumably to break ties in the predictions
    -- confirm), so the result depends on NumPy's global random state.
    """
    per_column = [
        spearmanr(
            true_column,
            pred_column + np.random.normal(0, 1e-7, pred_column.shape[0]),
        ).correlation
        for true_column, pred_column in zip(trues.T, preds.T)
    ]
    return np.mean(per_column)

# def compute_spearmanr(trues, preds, columns=None):
#     rhos = []
#     for i, (col_true, col_pred) in enumerate(zip(trues.T, preds.T)):
#         if i == 0:
#             # qa_id
#             continue
#         r = spearmanr(col_true, col_pred).correlation
#         if columns:
#             print("{:.3f}\t{}".format(r, columns[i]))
#         rhos.append(r)
#     return np.mean(rhos)

In [None]:
# Column names of the submission file; the last 30 are the label columns.
target_columns = list(sub_df.columns)

# results[k] is a (model x label) table of out-of-fold Spearman correlations for
# fold k+1, with an extra "ave" column holding the mean over the 30 labels.
results = []
for fold_i in range(1, 6):
    result_df = pd.DataFrame(index=model_names, columns=target_columns[-30:] + ["ave"])
    # Ground truth for this fold's validation rows: identical for every model
    # and every label, so build it once per fold.
    true = train_df[target_columns].iloc[kf[fold_i-1][1]].reset_index(drop=True)
    for model_name in model_names:
        # One CSV read per (fold, model) instead of one per label column.
        oof = pd.read_csv(model_name + f"/oof{fold_i}.csv")
        for target in target_columns[-30:]:
            # Small Gaussian jitter on the predictions, as in compute_spearmanr.
            r = spearmanr(true[target].values, oof[target].values + np.random.normal(0, 1e-7, oof.shape[0])).correlation
            result_df.loc[model_name, target] = r
        result_df.loc[model_name, "ave"] = np.mean(result_df.loc[model_name, target_columns[-30:]].values)
    results.append(result_df)


In [None]:
# Show the per-model / per-label Spearman table of the first fold.
results[0]

In [None]:
# Optimize the per-model blending weights (per label column and per fold)

def norm_with_rankdata(df):
    """Rank-normalise the label columns of ``df`` in place and return it.

    Every column named in the last 30 entries of the module-level
    ``target_columns`` is replaced by ``rankdata(column) / len(df)``.
    The frame passed in is modified; the same object is returned.
    """
    n_rows = len(df)
    for label in target_columns[-30:]:
        ranks = rankdata(df[label].values)
        df[label] = ranks / n_rows
    return df

kfold = 5
model_number = len(model_names)
label_cols = target_columns[-30:]

# Ground-truth rows of every validation fold (all submission columns).
true_dfs = [train_df.iloc[kf[i][1]][target_columns] for i in range(kfold)]

# model name -> list with one rank-normalised out-of-fold frame per fold.
model_df = defaultdict(list)
for model_name in model_names:
    print(model_name)

    fold_scores = []
    for i in range(kfold):
        pred = norm_with_rankdata(pd.read_csv(f"{model_name}/oof{i+1}.csv"))
        model_df[model_name].append(pred)
        # Score the 30 label columns only, matched by name. Passing the whole
        # frames to a positional column-wise metric would also pair up the
        # non-label first column of true_dfs and distort the mean.
        rho = np.mean([
            spearmanr(
                true_dfs[i][c].values,
                pred[c].values + np.random.normal(0, 1e-7, len(pred)),
            ).correlation
            for c in label_cols
        ])
        fold_scores.append(rho)
    print([f"fold{i+1}: {fold_scores[i]:.4}" for i in range(kfold)])

# scores[fold][label] -> Spearman of the blended validation prediction.
scores = [{} for _ in range(kfold)]
# all_coeffs[label_index] -> (kfold, model_number) array of blending weights.
all_coeffs = []

for col_num, col_name in enumerate(label_cols):
    coefficients = np.zeros((kfold, model_number))
    print(col_name, "\n")
    for fold_num in range(kfold):
        print("\n"+"-"*50+'\n[Fold {}/{}]'.format(fold_num + 1, kfold))
        y_true = true_dfs[fold_num].loc[:, col_name].values
        # One prediction vector per model, in model_names order.
        fold_preds = [model_df[name][fold_num].loc[:, col_name].values for name in model_names]

        # Objective: negative Spearman of the weighted sum of model predictions.
        # Defaults bind the current fold's data to this function instance.
        def function_spearmanr(x, y_true=y_true, fold_preds=fold_preds):
            pred = np.zeros(len(y_true))
            for weight, model_pred in zip(x, fold_preds):
                pred += weight * model_pred
            return -spearmanr(y_true, pred).correlation

        # initial weights
        x0 = [1 for _ in range(model_number)]
        # optimize ('xatol' is the current name of the deprecated 'xtol' option)
        res = minimize(function_spearmanr, x0, method='nelder-mead', options={'xatol': 1e-4, 'disp': True})
        # clip negative weights to 0, then normalize so that sum(weights) = 1.0
        coeffs = [max(val, 0) for val in res.x]
        total = np.sum(coeffs)
        if total > 0:
            coefficients[fold_num, :] = coeffs / total
        else:
            # Every weight was clipped to 0: dividing would give NaN weights,
            # so fall back to a plain average of the models.
            coefficients[fold_num, :] = 1.0 / model_number
        print('\nfinal vector: {}'.format(coefficients[fold_num,:]))

        # Score the blend obtained with the normalised weights.
        y_pred = np.zeros(len(y_true))
        for k, model_pred in enumerate(fold_preds):
            y_pred += coefficients[fold_num, k] * model_pred

        score = spearmanr(y_true, y_pred).correlation
        scores[fold_num][col_name] = score
        print(f"valid's spearmanr: {score:<7.4f} \n")
    all_coeffs.append(coefficients)


In [None]:
# Mean of the blended validation Spearman scores over all folds and label columns.
np.mean(np.array([list(s.values()) for s in scores]))

In [None]:
# Per-fold dictionaries mapping label column -> blended validation Spearman.
scores

In [None]:
# Ensemble: set up the accumulator and load every model's test predictions.
sub_df = pd.read_csv("../input/google-quest-challenge/sample_submission.csv")
# One row per label column, one entry per test row; filled in by the blending loop.
submission = np.zeros((30, len(sub_df)))

# inf_sub[fold][model] is the rank-normalised submission frame written by that
# model for that fold, with models in model_names order.
FOLD_NUM = 5
inf_sub = []
for fold_num in range(FOLD_NUM):
    fold_frames = [
        norm_with_rankdata(pd.read_csv(f"{model_name}/submission{fold_num+1}.csv"))
        for model_name in model_names
    ]
    inf_sub.append(fold_frames)

In [None]:
# Blend the test predictions: for each label column, every fold contributes
# 1 / FOLD_NUM of the weighted sum of its models' rank-normalised predictions,
# using that column's and that fold's blending weights.
for col_num, col_name in enumerate(target_columns[-30:]):
    column_weights = all_coeffs[col_num]
    for fold_num in range(FOLD_NUM):
        fold_weights = column_weights[fold_num]
        for model_num, model_sub in enumerate(inf_sub[fold_num]):
            coef = fold_weights[model_num]
            submission[col_num] += coef / FOLD_NUM * model_sub[col_name].values

# Write the blend into the 30 label columns of the submission frame.
sub_df.iloc[:, -30:] = submission.T
sub_df

In [None]:
# Sanity check: largest and smallest blended value over the 30 label columns.
print(np.max(sub_df.iloc[:, -30:].values), np.min(sub_df.iloc[:, -30:].values))

In [None]:
def norm_sub(df):
    """Min-max rescale the last (up to) 30 columns of ``df`` in place.

    Each column is mapped with ``(x - (min - 0.01)) / ((max + 0.01) - (min - 0.01))``,
    so results stay strictly inside (0, 1), and Gaussian noise with standard
    deviation 1e-7 is then added to every value. The frame passed in is
    modified and returned.
    """
    n_rows = len(df)
    for label in df.columns[-30:]:
        column = df[label].values
        lower = np.min(column) - 0.01
        upper = np.max(column) + 0.01
        rescaled = (df[label] - lower) / (upper - lower)
        df[label] = rescaled.values + np.random.normal(0, 1e-7, n_rows)
    return df
# Rescale the blended submission (norm_sub modifies and returns the same frame).
sub_df = norm_sub(sub_df)

In [None]:
# Sanity check after norm_sub: largest and smallest value over the 30 label columns.
print(np.max(sub_df.iloc[:, -30:].values), np.min(sub_df.iloc[:, -30:].values))

In [None]:
# Display the rescaled submission frame.
sub_df

In [None]:
# # check bug
# debug_sub_files = ["submission_230.csv", "submission_253.csv", "submission_288.csv", "submission_301.csv", 
#                    "submission_305.csv", "submission_335.csv", "submission_342.csv", "submission_350.csv"]
# for model_num, model_name in enumerate(model_names):
#     try:
#         for i in range(1, 6):
#             tmp = pd.read_csv(f"{model_name}/oof{i}.csv")
#             tmp = pd.read_csv(f"{model_name}/submission{i}.csv")
#     except:
#         debug_sub = pd.read_csv(f"../input/gq-debug/{debug_sub_files[model_num]}")
#         sub_df = pd.merge(sub_df["qa_id"], debug_sub, on="qa_id", how='outer')
#         sub_df = sub_df.fillna(0)
#         sub_df = norm_sub(sub_df)
#         break
# sub_df.to_csv("submission.csv", index=False)

In [None]:
import numpy as np
import scipy as sp
from collections import defaultdict
from functools import partial

def compute_actual_spearmanr(trues, preds):
    """Return the mean column-wise Spearman correlation of two 2-D arrays.

    Columns are paired positionally and every pair is scored; no noise is
    added to the predictions and no column is skipped.
    """
    per_column = [
        spearmanr(true_column, pred_column).correlation
        for true_column, pred_column in zip(trues.T, preds.T)
    ]
    return np.mean(per_column)

class OptimizedRounder(object):
    """Snap continuous scores onto ``n`` evenly spaced levels 0, 1/(n-1), ..., 1.

    ``n - 1`` thresholds ``coef[0..n-2]`` split the input range:

    * ``x < coef[0]``                  -> 0
    * ``coef[i] <= x < coef[i + 1]``   -> (i + 1) / (n - 1)   (first matching i)
    * anything else                    -> 1

    ``fit`` searches for thresholds that maximise the Spearman correlation
    between the rounded scores and the targets.

    Inputs are expected to lie between 0 and 1. The original note on this
    class describes ``n`` as "number of annotator + 1".
    """

    def __init__(self, n):
        # Holds the scipy optimisation result once fit() has been called.
        self.coef_ = 0
        self.n = n

    def _apply_thresholds(self, X, coef):
        """Return a float array with every element of ``X`` rounded to its level."""
        X = np.asarray(X, dtype=float)
        # Default level 1 covers values matched by no other rule.
        rounded = np.ones(X.shape, dtype=float)
        # Apply intervals from the highest to the lowest so that, if the
        # thresholds are not sorted and intervals overlap, the lowest matching
        # interval wins -- the same outcome as a first-match scan.
        for i in reversed(range(self.n - 2)):
            inside = (coef[i] <= X) & (X < coef[i + 1])
            rounded[inside] = (i + 1) / (self.n - 1)
        # Values below the first threshold always map to 0.
        rounded[X < coef[0]] = 0
        return rounded

    def _kappa_loss(self, coef, X, y):
        """Return the negative Spearman correlation of ``y`` and the rounded ``X``.

        The name is historical; the quantity optimised is Spearman's rho.
        """
        X_p = self._apply_thresholds(X, coef)
        ll = spearmanr(y, X_p).correlation
        return -ll

    def fit(self, X, y, init_coef=None):
        """Optimise the thresholds with Nelder-Mead and store the result.

        ``init_coef`` is the starting point (``n - 1`` thresholds); when it is
        None, evenly spaced thresholds ``1/n, 2/n, ...`` are used.
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        # "is None" rather than "== None": the latter is an element-wise
        # comparison for NumPy arrays and cannot be used as a condition.
        if init_coef is None:
            initial_coef = [1/self.n * (x+1) for x in range(self.n-1)]
        else:
            initial_coef = init_coef
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """Return ``X`` rounded with the given thresholds ``coef``."""
        return self._apply_thresholds(X, coef)

    def coefficients(self):
        """Return the thresholds found by the last call to ``fit``."""
        return self.coef_['x']

In [None]:
class color:
    """ANSI escape sequences used to highlight text printed to the console."""

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets all colours and attributes.
    END = '\033[0m'

def compute_spearmanr(trues, preds, columns=None):
    """Return the mean column-wise Spearman correlation, ignoring column 0.

    The first column of both arrays is skipped (the original code labels it
    ``qa_id``); the remaining columns are paired positionally. When
    ``columns`` is given, one line per scored column is printed, where
    ``columns[k]`` names the (k+1)-th column of the arrays.
    """
    rhos = []
    scored_pairs = zip(trues.T[1:], preds.T[1:])
    for idx, (true_column, pred_column) in enumerate(scored_pairs):
        rho = spearmanr(true_column, pred_column).correlation
        if columns:
            print(f"{rho:.3f}\t{columns[idx]}")
        rhos.append(rho)
    return np.mean(rhos)

def compare_spearmanr(trues, preds, pp_preds, columns=None):
    """Compare Spearman scores of raw and post-processed predictions.

    Column 0 of every array is skipped (the original code labels it
    ``qa_id``). For each remaining column the correlation of ``trues`` with
    ``preds`` and with ``pp_preds`` is computed. When ``columns`` is given a
    line per column is printed: bold if post-processing improved the score,
    plain if the two scores are within 1e-6, red otherwise. The overall means
    are always printed in the same way.

    Returns the tuple ``(mean raw score, mean post-processed score)``.
    """
    rhos = []
    pp_rhos = []
    scored = zip(trues.T[1:], preds.T[1:], pp_preds.T[1:])
    for idx, (col_true, col_pred, col_pp_pred) in enumerate(scored):
        r = spearmanr(col_true, col_pred).correlation
        pp_r = spearmanr(col_true, col_pp_pred).correlation
        rhos.append(r)
        pp_rhos.append(pp_r)
        if not columns:
            continue
        name = columns[idx]
        if r < pp_r:
            print(f"{r:.3f} ->{color.BOLD} {pp_r:.3f}{color.END} \t{name}")
        elif (r - pp_r) < 10**(-6):
            print(f"{r:.3f} -> {pp_r:.3f} \t{name}")
        else:
            print(f"{r:.3f} ->{color.RED} {pp_r:.3f}{color.END} \t{name}")

    whole_base = np.mean(rhos)
    whole_pp = np.mean(pp_rhos)
    delta = abs(whole_pp - whole_base)
    print("Whole Score")
    if whole_base < whole_pp:
        print(f"{whole_base:.3f} ->{color.BOLD} {whole_pp:.3f}{color.END} +{delta}")
    elif (whole_base - whole_pp) < 10**(-6):
        print(f"{whole_base:.3f} -> {whole_pp:.3f}")
    else:
        print(f"{whole_base:.3f} ->{color.RED} {whole_pp:.3f}{color.END} -{delta}")
    return whole_base, whole_pp

def inf_numbert_of_annotators(v):
    """Infer how many equal steps of the smallest label gap span the unit range.

    The smallest difference ``d`` between any two distinct values of ``v`` is
    found (capped at 10, which is also the value used when ``v`` has a single
    distinct value). The function returns the smallest integer ``i`` in
    ``0..99`` with ``d * i >= 1 - 1e-3``, or ``None`` when no such ``i``
    exists (i.e. the smallest gap is below roughly 0.01).
    """
    # The smallest pairwise gap is always between neighbours in sorted order,
    # so one pass over the sorted distinct values replaces the pairwise scan.
    levels = sorted(set(v))
    gap = 10
    for lower, upper in zip(levels, levels[1:]):
        gap = min(upper - lower, gap)

    for i in range(100):
        if gap * i >= 1 - 10**(-3):
            return i
    return None

def zero_one_scale(v):
    """Linearly rescale ``v`` so its minimum maps to 0 and its maximum to 1.

    When all values are equal there is no range to scale by; an array of
    zeros with the shape of ``v`` is returned in that case instead of the
    NaNs a 0/0 division would produce.
    """
    min_x = np.min(v)
    span = np.max(v) - min_x
    if span == 0:
        return np.zeros_like(v, dtype=float)
    return (v - min_x) / span


# Rebuild the blended out-of-fold predictions so the post-processing can be
# scored on them: oofs[fold] is a (rows, 30) array holding, for every label
# column, the weighted sum of the models' rank-normalised OOF predictions.
oofs = []
for fold_num in range(FOLD_NUM):
    n_rows = len(model_df[model_names[0]][fold_num])
    fold_blend = np.zeros((30, n_rows))
    for col_num, col_name in enumerate(target_columns[-30:]):
        fold_weights = all_coeffs[col_num][fold_num]
        for model_num, model_name in enumerate(model_names):
            coef = fold_weights[model_num]
            fold_blend[col_num] += coef * model_df[model_name][fold_num][col_name].values
    oofs.append(fold_blend.T)

# Ground-truth labels, and a same-shaped frame whose label columns are
# overwritten fold by fold with the blended OOF predictions.
gold_label_df = train_df[target_columns]
pred_label_df = gold_label_df.copy()

for fold_num in range(FOLD_NUM):
    valid_rows = kf[fold_num][1]
    pred_label_df.loc[valid_rows, target_columns[1:]] = oofs[fold_num]
pred_label_df = norm_with_rankdata(pred_label_df)


def post_process(train_df, train, pred_df):
    """Post-process a frame of predictions and return the result as a new frame.

    Steps:

    1. ``question_type_spelling`` is set to 0 for rows whose host is neither
       english.stackexchange.com nor ell.stackexchange.com.
    2. All label columns are rank-normalised.
    3. For every label column, thresholds are derived from the cumulative
       distribution of the ground-truth levels. If rounding the blended OOF
       predictions with fitted thresholds improves their Spearman score, and
       the column has fewer than 10 inferred levels, the column of ``df`` is
       rounded as well. A column that ends up constant is reset to the
       values in ``pred_df``.
    4. ``question_type_spelling`` is raised by 0.1 on the two English hosts
       and rescaled to [0, 1].

    Parameters:
        train_df: training frame; its label columns are used to infer the
            number of label levels.
        train: frame with a ``host`` column whose index lines up with
            ``pred_df`` (the boolean masks built from it index ``df``).
        pred_df: predictions to post-process; not modified.

    Also reads the module-level ``target_columns``, ``gold_label_df`` and
    ``pred_label_df``.
    """
    df = pred_df.copy()

    # Rows from the two English-language sites; computed once, used twice.
    is_english_host = (train["host"] == "english.stackexchange.com") | (train["host"] == "ell.stackexchange.com")
    not_english_host = (train["host"] != "english.stackexchange.com") & (train["host"] != "ell.stackexchange.com")

    # question_type_spelling is zeroed everywhere except on those two sites.
    df.loc[not_english_host, "question_type_spelling"] = 0

    df = norm_with_rankdata(df)

    data_length = len(gold_label_df)
    # Threshold-based rounding of each label column.
    for col_name in target_columns[1:]:
        gold = gold_label_df[col_name].values
        pred = pred_label_df[col_name].values

        # Distinct ground-truth levels (sorted) and how often each occurs.
        levels, counts = np.unique(gold, return_counts=True)

        # Thresholds: cumulative share of rows at or below each level,
        # excluding the top level.
        coefs = []
        running = 0.0
        for count in counts[:-1]:
            running += count / data_length
            coefs.append(float(running))

        n = inf_numbert_of_annotators(train_df[col_name].values)
        base_score = spearmanr(gold, pred).correlation

        optR = OptimizedRounder(len(levels))
        optR.fit(pred, gold, coefs)
        opt_v = optR.predict(pred, optR.coefficients())
        opt_score = spearmanr(gold, opt_v).correlation
        # n is None when the smallest label gap is too small to infer a level
        # count; treat that like "too many levels" instead of failing on the
        # comparison.
        if base_score < opt_score and n is not None and n < 10:
            # NOTE(review): the rounding applied here uses the initial,
            # distribution-derived thresholds (coefs), not the fitted
            # optR.coefficients() that produced opt_score -- confirm intended.
            df[col_name] = optR.predict(df[col_name].values, coefs)

        # A constant column carries no ranking information: restore the
        # incoming predictions for it.
        if len(set(df[col_name].values)) == 1:
            df[col_name] = pred_df[col_name].values

    # Lift spelling scores on the two English sites above everything else.
    df.loc[is_english_host, "question_type_spelling"] += 0.1

    df["question_type_spelling"] = zero_one_scale(df["question_type_spelling"].values)

    return df

In [None]:
# Reload the raw train and test frames; post_process reads the "host" column
# from the frame passed as its second argument (test_df here).
train_df = pd.read_csv('../input/google-quest-challenge/train.csv')
test_df = pd.read_csv('../input/google-quest-challenge/test.csv')

# Post-process the blended test predictions.
# NOTE(review): assumes test_df and sub_df rows are in the same order -- confirm.
sub_df = post_process(train_df, test_df, sub_df)

In [None]:
# Baseline CV score: blended out-of-fold predictions before post-processing,
# averaged over the folds.
cv_score = 0

for fold_num in range(FOLD_NUM):
    y_true = true_dfs[fold_num].iloc[:, -30:].values
    # Use compute_actual_spearmanr, which scores every column it is given.
    # By this point compute_spearmanr has been redefined to skip column 0 of
    # its inputs, which would silently drop the first label column here and
    # make this baseline incomparable with the post-processed score below.
    r = compute_actual_spearmanr(y_true, oofs[fold_num])
    cv_score += r / FOLD_NUM
print(cv_score)

In [None]:
# CV score of the blended out-of-fold predictions after post-processing,
# averaged over the folds.
cv_score = 0
for fold_num in range(FOLD_NUM):
    # Validation rows of this fold, re-indexed from 0 so they line up with the
    # prediction frame handed to post_process.
    valid_rows = train_df.iloc[kf[fold_num][1]].reset_index(drop=True)
    # Reuse the first model's OOF frame as a container for the blended values.
    blended = model_df[model_names[0]][fold_num].copy()
    blended.iloc[:, -30:] = oofs[fold_num]
    processed = post_process(train_df, valid_rows, blended)
    fold_rho = compute_actual_spearmanr(
        true_dfs[fold_num].iloc[:, -30:].values,
        processed.iloc[:, -30:].values,
    )
    cv_score += fold_rho / FOLD_NUM
print(cv_score)

In [None]:
# Rows of the final submission with a non-zero question_type_spelling score.
sub_df[sub_df["question_type_spelling"] > 0]

In [None]:
# Sanity check after post-processing: largest and smallest value over the 30 label columns.
print(np.max(sub_df.iloc[:, -30:].values), np.min(sub_df.iloc[:, -30:].values))

In [None]:
# Display the post-processed submission frame.
sub_df

In [None]:
# Write the final submission file (without the index column).
sub_df.to_csv("submission.csv", index=False)