### Ensemble and postprocessing research

In [1]:
import os
import sys

import pandas as pd
import numpy as np
import pickle

import torch
import torch.nn as nn

from tqdm import tqdm
import matplotlib.pyplot as plt

sys.path.append('/home/backe/projects/feedback/')
from utils import seed_everything, moving_average, score_feedback_comp, calc_overlap
from decoder_2000.candidates import decode_predictions

### OOF per model/fold

In [2]:
# VAL_DF: ground-truth annotations read from the training CSV.
TRAIN_PATH = '../data/train.csv'
train_df = pd.read_csv(TRAIN_PATH)


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute code embedded in the file; only load trusted artifacts.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One pickled word-probability object per model (lf / rl / db); the next cell indexes them by key.
word_probs_lf = _load_pickle('word_probs_lf.pickle')
word_probs_rl = _load_pickle('word_probs_rl.pickle')
word_probs_db = _load_pickle('word_probs_db.pickle')

# lgbm decoder model
model_decoder = _load_pickle('../decoder_2000/decoder_model.pickle')

In [10]:
# Ensemble: element-wise mean (axis=0) of the rl and db entries for every key.
# Keys are taken from word_probs_lf even though lf itself is not part of this average.
# Single-model variant that was tried instead: word_probs_all[idx] = word_probs_db[idx]
word_probs_all = {
    idx: np.mean((word_probs_rl[idx], word_probs_db[idx]), axis=0)
    for idx in word_probs_lf.keys()
}


In [11]:
# Run the project decoder on the ensembled probabilities; the result is used as a DataFrame
# (filtered by its 'id' column) in the following cell.
preds_df = decode_predictions(model_decoder, word_probs_all)

In [16]:
# Score a single fold only.
fold_idx = 4

# Essay ids that folds.csv assigns to the chosen fold.
folds = pd.read_csv('../data/folds.csv')
fold_ids = folds['id'][folds['kfold'] == fold_idx]

# Row masks selecting that fold's essays in the targets and in the predictions.
gt_in_fold = train_df['id'].isin(fold_ids)
pred_in_fold = preds_df['id'].isin(fold_ids)

# targets
fold_df = train_df[gt_in_fold]

# predictions
fold_preds = preds_df[pred_in_fold]

# Last expression of the cell, so its return value is shown as the cell output.
score_feedback_comp(fold_preds, fold_df, True)

(0.7091085728350814,
 {'Lead': 0.8401486988847584,
  'Position': 0.7234334763948498,
  'Claim': 0.6708228141583802,
  'Evidence': 0.7664355992844365,
  'Concluding Statement': 0.8655643421998562,
  'Counterclaim': 0.5875758991125642,
  'Rebuttal': 0.5097791798107255})

### OOF

In [None]:
# VAL_DF: ground-truth annotations read from the training CSV.
TRAIN_PATH = '../data/train.csv'
train_df = pd.read_csv(TRAIN_PATH)

# Pickled artifacts, read in this order: the three word-probability objects (lf, rl, db)
# followed by the lgbm decoder model.
artifact_paths = (
    'word_probs_lf.pickle',
    'word_probs_rl.pickle',
    'word_probs_db.pickle',
    '../decoder_2000/decoder_model.pickle',
)

# NOTE: pickle.load can execute code embedded in the file; only load trusted artifacts.
loaded_artifacts = []
for artifact_path in artifact_paths:
    with open(artifact_path, 'rb') as handle:
        loaded_artifacts.append(pickle.load(handle))

word_probs_lf, word_probs_rl, word_probs_db, model_decoder = loaded_artifacts

In [None]:
%%time

# Ensemble of all three models: for each key of word_probs_lf, take the element-wise
# mean (axis=0) of the lf, rl and db entries.
ensemble_members = (word_probs_lf, word_probs_rl, word_probs_db)

word_probs_all = {
    idx: np.mean(tuple(member[idx] for member in ensemble_members), axis=0)
    for idx in word_probs_lf.keys()
}


In [None]:
# Decode the current word_probs_all with the loaded decoder model, then score the
# predictions against all of train_df (no fold filtering in this cell).
# The score call is the last expression, so its return value is the cell output.
preds_df = decode_predictions(model_decoder, word_probs_all)
score_feedback_comp(preds_df, train_df, True)

### Find bad examples

In [None]:
# Distinct values found in the 'class' column of the predictions.
CLASSES = preds_df['class'].unique()

In [None]:
# Class to inspect; compared against the ground truth 'discourse_type' column and the
# predictions 'class' column in the cells that follow.
discourse_type = 'Claim'

In [None]:
# Independent copies of the ground truth and the predictions, so that changes made to
# gt_df / pred_df do not affect train_df / preds_df.
gt_df = train_df.copy()
pred_df = preds_df.copy()

In [None]:
# Per-class matching of predictions against ground truth for `discourse_type`.
#
# Produces: gt_df / pred_df (rows of the selected class, predictionstring turned into a set of
# tokens), joined (outer merge of the two on essay id), overlaps (one flag per row of joined),
# and TP / TPandFP / TPandFN / my_f1_score.
#
# The per-class frames are rebuilt from train_df / preds_df on every run. Filtering
# gt_df / pred_df in place made the cell fail with a KeyError when re-run (the
# 'discourse_type' / 'class' columns were already dropped by the first run).
gt_df = train_df.loc[train_df['discourse_type'] == discourse_type,
                     ['id', 'predictionstring']].reset_index(drop=True)
pred_df = preds_df.loc[preds_df['class'] == discourse_type,
                       ['id', 'predictionstring']].reset_index(drop=True)

# Row positions double as identifiers so matches can be counted per ground truth.
pred_df['pred_id'] = pred_df.index
gt_df['gt_id'] = gt_df.index

# Space-separated predictionstring -> set of tokens, ready for overlap computation.
pred_df['predictionstring'] = [set(pred.split(' ')) for pred in pred_df['predictionstring']]
gt_df['predictionstring'] = [set(pred.split(' ')) for pred in gt_df['predictionstring']]

# Step 1. all ground truths and predictions for a given class are compared.
# Outer merge: every prediction is paired with every ground truth of the same essay;
# a side with no counterpart in that essay is left as NaN.
joined = pred_df.merge(gt_df,
                       left_on='id',
                       right_on='id',
                       how='outer',
                       suffixes=('_pred', '_gt')
                      )
# One flag per row of `joined`, in row order.
# NOTE(review): relies on calc_overlap tolerating a NaN side and returning a boolean - confirm in utils.
overlaps = [calc_overlap(set_pred, set_gt)
            for set_pred, set_gt in zip(joined.predictionstring_pred,
                                        joined.predictionstring_gt)]

# 2. If the overlap between the ground truth and prediction is >= 0.5,
# and the overlap between the prediction and the ground truth >= 0.5,
# the prediction is a match and considered a true positive.
# If multiple matches exist, the match with the highest pair of overlaps is taken.
# we don't need to compute the match to compute the score
TP = joined.loc[overlaps, 'gt_id'].nunique()

# 3. Any unmatched ground truths are false negatives
# and any unmatched predictions are false positives.
TPandFP = len(pred_df)
TPandFN = len(gt_df)

# calc microf1; with no predictions and no ground truths the score is defined as 0.0
# instead of raising ZeroDivisionError.
f1_denominator = TPandFP + TPandFN
my_f1_score = 2 * TP / f1_denominator if f1_denominator else 0.0


In [None]:
# FALSE NEGATIVE
# Rows of `joined` whose prediction side is missing after the outer merge on 'id'.
# NOTE(review): this lists only ground truths from essays with no prediction of the class at
# all; ground truths that sit next to non-matching predictions are not included - confirm
# that this subset is what is wanted here.
false_negative_rows = joined[joined['predictionstring_pred'].isna()]

# DataFrame.sample(5) raises ValueError when fewer than 5 rows exist, so cap the sample size.
false_negative_rows.sample(min(5, len(false_negative_rows)))

In [None]:
# FALSE POSITIVE
# Rows of `joined` whose ground-truth side is missing after the outer merge on 'id'.
# NOTE(review): this lists only predictions from essays with no ground truth of the class at
# all; predictions that sit next to non-matching ground truths are not included - confirm
# that this subset is what is wanted here.
false_positive_rows = joined[joined['predictionstring_gt'].isna()]

# DataFrame.sample(5) raises ValueError when fewer than 5 rows exist, so cap the sample size.
false_positive_rows.sample(min(5, len(false_positive_rows)))