In [None]:
from pathlib import Path
import jsonlines
import pandas as pd


dump_dir = Path("mbart50_dumps")

# Candidate-level results of every processed dump, keyed by the language-pair
# token ("xx-yy") taken from the dump's file name.
all_bt_results = {}
# Path.iterdir() yields entries in arbitrary order. Sorting makes the printed
# reports, the insertion order of `all_bt_results`, and the loop variables that
# stay bound after the loop (`bt_results`, `score_cols`) reproducible.
for dump_path in sorted(dump_dir.iterdir()):
    # Alternative file filters (swap one in for the active condition below):
    # if "backtranslation" in dump_path.name and "just_one" in dump_path.name:
    # if "backtranslation" in dump_path.name and "just_one" not in dump_path.name and "en-" in dump_path.name:
    # if "backtranslation" in dump_path.name and "-en" in dump_path.name:
    if "backtranslation" in dump_path.name:
        with jsonlines.open(dump_path, 'r') as reader:
            bt_results = list(reader)
        bt_results = pd.DataFrame(bt_results)

        # Evaluation metrics: columns named "*score*" without "bt" in the name.
        score_cols = [col for col in bt_results.columns if "score" in col and "bt" not in col]
        # Reranking signals: "*score*" / "*logprob*" columns that carry "bt" in
        # the name (presumably computed from the back-translation -- confirm).
        bt_score_cols = [col for col in bt_results.columns if ("score" in col or "logprob" in col) and "bt" in col]

        # For every reranking signal: pick, per id, the row maximising that
        # signal and average the evaluation metrics of the picked rows.
        all_reranked_scores = []
        for rerank_column in bt_score_cols:
            idxmax = bt_results.groupby("id")[rerank_column].idxmax()
            bt_results_reranked = bt_results.loc[idxmax]
            reranked_scores = bt_results_reranked[score_cols].mean()
            reranked_scores.name = f"reranked_{rerank_column}"
            all_reranked_scores.append(reranked_scores)
        all_reranked_scores = pd.concat(all_reranked_scores, axis=1)
        # Per metric, the best value reached by any single reranking signal.
        best_reranked_scores = all_reranked_scores.T.max()
        best_reranked_scores.name = "reranked_best"

        # Baseline: the first row of each id in file order.
        # NOTE(review): the variable name implies the dump lists candidates by
        # decreasing model confidence -- confirm against the dump writer.
        bt_results_ranked_by_confidence = bt_results.drop_duplicates("id", keep="first")
        orig_scores = bt_results_ranked_by_confidence[score_cols].mean()
        orig_scores.name = "orig"

        # Oracle bounds: per metric, always pick the best / worst row of each id.
        oracle_best_scores = {}
        oracle_worst_scores = {}
        for score_col in score_cols:
            # Named so that the builtin `sorted` is not shadowed for the rest
            # of the kernel session.
            scores_by_id = bt_results[["id", score_col]].sort_values(["id", score_col])
            oracle_best = scores_by_id.drop_duplicates("id", keep="last")[score_col].mean()
            oracle_worst = scores_by_id.drop_duplicates("id", keep="first")[score_col].mean()
            oracle_best_scores[score_col] = oracle_best
            oracle_worst_scores[score_col] = oracle_worst
        oracle_best_scores = pd.DataFrame({"oracle_best": oracle_best_scores})
        oracle_worst_scores = pd.DataFrame({"oracle_worst": oracle_worst_scores})

        # Mean over every candidate row, reported as the "random" reference.
        random_ranking_scores = bt_results[score_cols].mean()
        random_ranking_scores.name = "random"

        is_better = best_reranked_scores > orig_scores
        is_better.name = "is_better"

        all_scores = pd.concat([orig_scores, best_reranked_scores, is_better, oracle_best_scores, random_ranking_scores, oracle_worst_scores, all_reranked_scores], axis=1)

        # The language pair is the first "_"-separated token containing "-".
        lang_pair = [substr for substr in dump_path.name.split('_') if '-' in substr][0]

        all_bt_results[lang_pair] = bt_results

        print()
        print(lang_pair)
        display(all_scores)


# TODO: why are these scores so low? They don't match the paper (https://arxiv.org/pdf/2008.00401.pdf, last page).


In [None]:
from sklearn.model_selection import train_test_split
# Split every language pair by id, so that all candidates of one id land on the
# same side, then stack the per-language parts into one frame per split.
test_size = 0.5
train_parts, val_parts = [], []
for lang_pair, lang_df in all_bt_results.items():
    # Tag the rows in place; the frames stored in all_bt_results gain the column too.
    lang_df["lang_pair"] = lang_pair
    ids = lang_df["id"].unique()
    train_ids, val_ids = train_test_split(ids, test_size=test_size, random_state=1)
    is_train = lang_df["id"].isin(train_ids)
    curr_train_df, curr_val_df = lang_df.loc[is_train], lang_df.loc[~is_train]
    # No id may appear on both sides of the split.
    assert set(curr_train_df["id"]).isdisjoint(set(curr_val_df["id"]))
    train_parts.append(curr_train_df)
    val_parts.append(curr_val_df)
train_df = pd.concat(train_parts, ignore_index=True)
val_df = pd.concat(val_parts, ignore_index=True)

In [None]:
import numpy as np

# Switch between the pairwise-classification cells (True) and the regression cell (False).
is_pairwise = True

target_column = "bertscore_f1"
# The generation log-probability plus every "bt" score / log-probability column
# of the frame left bound by the dump-loading loop.
feature_names = ["gen_logprob"]
feature_names += [
    col
    for col in bt_results.columns
    if "bt" in col and ("score" in col or "logprob" in col)
]

def merge_df(df, margin=0.05):
    """Build the pairwise-comparison table of candidates that share an id.

    Reads the module-level ``feature_names`` and ``target_column``.

    Args:
        df: Frame containing "lang_pair", "id", every column listed in
            ``feature_names`` and the ``target_column``. Its index must be
            unnamed, because the index labels are carried along in a column
            called "index".
        margin: Half-width of the band in which two candidates count as tied.
            Defaults to 0.05, the value that was previously hard-coded.

    Returns:
        A frame with one row per ordered pair (x, y) of rows that share
        ("lang_pair", "id") and have different index labels. Columns of the
        two sides carry the suffixes "_x" and "_y"; "index_x" / "index_y" hold
        the original index labels of ``df``. "target_comparison" encodes
        ``target_x - target_y`` as 0 (below ``-margin``), 1 (inside
        ``[-margin, margin)``) or 2 (``>= margin``).
    """
    df = df[["lang_pair", "id"] + feature_names + [target_column]].reset_index()
    # The self-merge yields every ordered pair within a group, including each
    # row paired with itself; those self-pairs are dropped right after.
    df_merged = df.merge(df, on=["lang_pair", "id"])
    df_merged = df_merged.loc[df_merged["index_x"] != df_merged["index_y"]]

    target_diff = df_merged[f"{target_column}_x"] - df_merged[f"{target_column}_y"]
    target_comparison = np.digitize(target_diff, [-margin, margin])

    df_merged["target_comparison"] = target_comparison
    return df_merged

if is_pairwise:
    train_pairwise_df, val_pairwise_df = merge_df(train_df), merge_df(val_df)
    # Class balance of each split, with the labels shifted from 0/1/2 to -1/0/1.
    for pairwise_split in (train_pairwise_df, val_pairwise_df):
        shifted_labels = pd.Series(pairwise_split["target_comparison"] - 1)
        print(shifted_labels.value_counts().sort_index())

In [None]:
from sklearn.preprocessing import RobustScaler, FunctionTransformer

if is_pairwise:
    # Substring matching picks up both the "_x" and the "_y" copy of every feature.
    feature_cols = [
        col
        for col in train_pairwise_df.columns
        if any(name in col for name in feature_names)
    ]
    train_X, val_X = train_pairwise_df[feature_cols], val_pairwise_df[feature_cols]
    train_y, val_y = train_pairwise_df["target_comparison"], val_pairwise_df["target_comparison"]

    # scaler_class = FunctionTransformer
    scaler_class = RobustScaler
    # The scaler is fitted on the training pairs only and then applied to both splits.
    features_scaler = scaler_class()
    features_scaler.fit(train_X)

    train_X = features_scaler.transform(train_X)
    val_X = features_scaler.transform(val_X)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# model = LinearSVC(verbose=2)
# Seeded (same value as the train/validation split) so that the fitted forest,
# and therefore the reports printed below, are reproducible across runs.
model = RandomForestClassifier(n_estimators=3, verbose=2, class_weight="balanced", random_state=1)
model.fit(train_X, train_y)
train_pred = model.predict(train_X)
val_pred = model.predict(val_X)

# Training report first, validation report second.
print(classification_report(y_true=train_y, y_pred=train_pred))
print(classification_report(y_true=val_y, y_pred=val_pred))

In [None]:
# Attach the classifier's predicted comparison label to every candidate pair.
for pairwise_frame, predicted_labels in (
    (train_pairwise_df, train_pred),
    (val_pairwise_df, val_pred),
):
    pairwise_frame["pairwise_pred"] = predicted_labels

In [None]:
# Per candidate (index_x), add up its comparison labels against all siblings of
# the same (lang_pair, id): once with the true labels, once with the predicted ones.
pairs_by_candidate = val_pairwise_df.groupby(["lang_pair", "id", "index_x"])

# Sort ascending by the summed label, then move index_x out of the index so the
# remaining index is (lang_pair, id).
sum_y = pairs_by_candidate["target_comparison"].sum().sort_values().reset_index(level=2)
sum_pred = pairs_by_candidate["pairwise_pred"].sum().sort_values().reset_index(level=2)

In [None]:
# Both frames are sorted ascending by their summed comparison label, so the
# last row of each (lang_pair, id) index entry is that group's top candidate.
actual_best_indices = sum_y.loc[~sum_y.index.duplicated(keep="last")]["index_x"]
# The mask has to come from sum_pred's own index: sum_pred is sorted by the
# predicted sums, so its row order differs from sum_y's and a mask built from
# sum_y would select arbitrary rows.
predicted_best_indices = sum_pred.loc[~sum_pred.index.duplicated(keep="last")]["index_x"]

In [None]:
def _mean_f1_by_lang(candidates):
    """Average bertscore_f1 of the given candidate rows per language pair."""
    return candidates.groupby(["lang_pair"])["bertscore_f1"].mean()


actual_best = _mean_f1_by_lang(val_df.loc[actual_best_indices])
predicted_best = _mean_f1_by_lang(val_df.loc[predicted_best_indices])
# Baseline: per (lang_pair, id), the candidate with the highest gen_logprob.
top_by_gen_logprob = val_df.sort_values(["lang_pair", "id", "gen_logprob"])
top_by_gen_logprob = top_by_gen_logprob.drop_duplicates(["lang_pair", "id"], keep="last")
orig = _mean_f1_by_lang(top_by_gen_logprob)
is_better = predicted_best > orig
pd.DataFrame({"actual_best": actual_best, "predicted_best": predicted_best, "orig": orig, "is_better": is_better})

In [None]:
from sklearn import linear_model
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# This cell is the regression variant: it rebinds train_X / train_y / model and
# the predictions used by the pairwise cells, so stop here in pairwise mode.
# A bare `raise` with no active exception also raised RuntimeError, but with an
# uninformative "No active exception to reraise" message.
if is_pairwise:
    raise RuntimeError("is_pairwise is True: this cell runs the regression (non-pairwise) experiment; set is_pairwise = False to use it.")

# Same feature set as the pairwise experiment: the generation log-probability
# plus every "bt" score / log-probability column.
bt_feature_names = [col for col in bt_results.columns if "bt" in col and ("score" in col or "logprob" in col)]
feature_names = ["gen_logprob"] + bt_feature_names
# feature_names = ['gen_logprob', 'bt_labels_logprob', 'bt_bertscore_f1']
# feature_names = ['gen_logprob']
target_column = "bertscore_f1"
# target_column = "bleu_score"

train_X, train_y = train_df[feature_names], train_df[target_column]
val_X, val_y = val_df[feature_names], val_df[target_column]

scaler_class = FunctionTransformer
# scaler_class = RobustScaler
# Both scalers are fitted on the training split only. The target is wrapped in
# a one-column frame because the scalers work on 2-D input.
features_scaler = scaler_class()
features_scaler.fit(train_X)
target_scaler = scaler_class()
target_scaler.fit(pd.DataFrame(train_y))

train_X, val_X = features_scaler.transform(train_X), features_scaler.transform(val_X)
# squeeze() turns the transformed one-column target back into a 1-D object.
train_y = target_scaler.transform(pd.DataFrame(train_y)).squeeze()
val_y = target_scaler.transform(pd.DataFrame(val_y)).squeeze()

# model = linear_model.Lasso(alpha=0.001)
# model = linear_model.Ridge(alpha=0.02)
# model = linear_model.Ridge(alpha=0)
# Seeded (same value as the train/validation split) so that the fitted forest,
# the printed metrics and the rerank reports are reproducible across runs.
model = RandomForestRegressor(n_estimators=5, max_depth=5, verbose=2, random_state=1)
model.fit(train_X, train_y)
train_pred = model.predict(train_X)
val_pred = model.predict(val_X)
train_mse = np.mean((train_pred - train_y)**2)
val_mse = np.mean((val_pred - val_y)**2)

# Each line prints the training value first, then the validation value.
print("std:", train_pred.std(), val_pred.std())
print("mse:", train_mse, val_mse)
# Only models exposing coef_ (the linear alternatives above) report which
# features received a non-zero weight.
if hasattr(model, "coef_"):
    print(model.coef_)
    print(np.array(feature_names)[model.coef_ != 0])

# Store the predictions so that candidates can be reranked by "model_score".
val_df["model_score"] = val_pred
train_df["model_score"] = train_pred

def rerank(df, rerank_column):
    """Average the metric columns of the top candidate per (lang_pair, id).

    The top candidate of a group is the row that comes last after sorting by
    ``rerank_column``. Reads the module-level ``score_cols``.

    Returns a frame with one row per metric and one column per language pair.
    """
    group_keys = ["lang_pair", "id"]
    ordered = df.sort_values(group_keys + [rerank_column])
    top_candidates = ordered.drop_duplicates(group_keys, keep="last")
    mean_per_lang = top_candidates.groupby("lang_pair")[score_cols].mean()
    return mean_per_lang.T

def _show_rerank_report(split, reranked, baseline, oracle):
    """Display the comparison tables of one split, each preceded by its label."""
    display(f"rerank_{split} > orig_{split}", reranked > baseline)
    display(f"rerank_{split} - orig_{split}", reranked - baseline)
    display(f"rerank_{split}", reranked)
    display(f"orig_{split}", baseline)
    display(f"oracle_{split}", oracle)


# Validation split: model reranking vs. the gen_logprob baseline vs. the
# bertscore_f1 oracle.
rerank_val = rerank(val_df, "model_score")
orig_val = rerank(val_df, "gen_logprob")
oracle_val = rerank(val_df, "bertscore_f1")
_show_rerank_report("val", rerank_val, orig_val, oracle_val)

print('\n\n')

# Same report on the training split.
rerank_train = rerank(train_df, "model_score")
orig_train = rerank(train_df, "gen_logprob")
oracle_train = rerank(train_df, "bertscore_f1")
_show_rerank_report("train", rerank_train, orig_train, oracle_train)
