In [1]:
import pickle
import numpy as np

class SeqDataset:
    '''
    Plain container holding the candidate sequences of one dataset as arrays.

    ids        : ids of the candidate sequences (stored as passed in, not converted)
    features   : one row of features per candidate sequence, stored as float32
    labels     : binary labels, 1 when the candidate is an exact match to a true span
    groups     : stored as int16 (meaning is not documented in this file)
    wordRanges : start and end word indices of each candidate
                 (described by the author as inclusive on both sides), stored as int16
    truePos    : binary labels, 1 when the candidate would count as a true positive
                 (>0.5 overlap)

    Feature layout as originally described by the author (13 features per row):
        feature_0    : pred_end - pred_start, i.e. span length - 1
        feature_1    : start position normalized by the number of words
        feature_2    : end position normalized by the number of words
        feature_4-10 : 7 evenly spaced quantiles of the relevant class probabilities
        feature_11   : probability that the words on either edge of the sub-sequence
                       belong to the class of interest
        feature_12   : probability that the first word is a 'B'-egin token

    NOTE(review): nothing here validates the number of feature columns, and the
    description above skips feature_3; treat the layout as informal and confirm it
    against the code that produced the features.
    '''

    def __init__(self, ids, features, labels, groups, wordRanges, truePos):
        '''Convert the array-like inputs to numpy arrays and store them as attributes.'''
        self.ids = ids

        # Floating point feature matrix.
        self.features = np.array(features, dtype="float32")

        # Small integer arrays; int16 limits values to [-32768, 32767].
        self.groups = np.array(groups, dtype="int16")
        self.wordRanges = np.array(wordRanges, dtype="int16")

        # Label arrays keep whatever dtype numpy infers from the input.
        self.labels = np.array(labels)
        self.truePos = np.array(truePos)

In [2]:
import pandas as pd

# Discourse classes; every pickled fold is indexed by these names.
disc_types = ['Evidence','Claim','Lead','Position','Counterclaim','Rebuttal','Concluding Statement']

# One DataFrame per (fold, class) pair is collected here.
dfs = []

folder= 'cache' #put pickle files in this folder
for fold in range(8):
    # NOTE: pickle.load can execute arbitrary code -- only load files you trust.
    with open(f'{folder}/valid_seqds_fold{fold}.p','rb') as f:
        seqdataset=pickle.load(f)

    for disc_type in disc_types:
        x = seqdataset[disc_type]

        # Feature columns are named f_0 ... f_{n-1}, one per column of x.features.
        feature_cols = [f"f_{i}" for i in range(x.features.shape[1])]
        df = pd.DataFrame(x.features, columns=feature_cols)

        # Metadata identifying each candidate row.
        df["id"] = x.ids
        df["class"] = disc_type
        df[["begin", "end"]] = x.wordRanges
        df["kfold"] = fold

        dfs.append(df)


# Feature count taken from the last dataset processed by the loop above.
len_features = x.features.shape[1]

In [3]:
# Stack all per-fold, per-class frames into one table.
# ignore_index is not set, so every piece keeps its own 0-based index and
# index labels therefore repeat in oof_df.
oof_df = pd.concat(dfs)
print(oof_df.shape)
oof_df.head()

(35724267, 31)


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_21,f_22,f_23,f_24,f_25,id,class,begin,end,kfold
0,1.0,0.183223,0.18543,0.994907,0.994907,0.994907,0.994907,0.994907,0.994907,0.994907,...,0.064025,0.024539,0.018976,0.002926,0.120611,4AB030046F42,Evidence,83,84,0
1,2.0,0.183223,0.187638,0.994907,0.995471,0.996034,0.996597,0.997161,0.997724,0.998287,...,0.064025,0.024539,0.018976,0.002926,0.120611,4AB030046F42,Evidence,83,85,0
2,3.0,0.183223,0.189845,0.994907,0.996034,0.997161,0.998287,0.998625,0.998963,0.9993,...,0.064025,0.024539,0.018976,0.002926,0.120611,4AB030046F42,Evidence,83,86,0
3,4.0,0.183223,0.192053,0.994907,0.996597,0.998287,0.998794,0.9993,0.999482,0.999665,...,0.064025,0.024539,0.018976,0.002926,0.120611,4AB030046F42,Evidence,83,87,0
4,5.0,0.183223,0.19426,0.994907,0.997161,0.998625,0.9993,0.999502,0.999624,0.999665,...,0.064025,0.024539,0.018976,0.002926,0.120611,4AB030046F42,Evidence,83,88,0


In [4]:
# Show three randomly drawn rows (fixed seed) transposed, so columns read as rows.
oof_df.sample(3, random_state=0).T

Unnamed: 0,155306,1944845,281859
f_0,178.0,78.0,82.0
f_1,0.0,0.393365,0.206349
f_2,0.364754,0.763033,0.306471
f_3,0.000007,0.063316,0.444897
f_4,0.000009,0.09902,0.484587
f_5,0.000013,0.110496,0.508487
f_6,0.000069,0.584912,0.94751
f_7,0.000243,0.722076,0.957281
f_8,0.073677,0.953462,0.961175
f_9,0.976568,0.970241,0.967838


In [5]:
# Load the ground-truth table; the path is relative to the current working directory.
gt_df = pd.read_csv("../train_folds.csv")
print(gt_df.shape)
gt_df.head()

(144293, 9)


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,kfold
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,1
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,1
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,1
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,1
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...,1


In [None]:
from tqdm import tqdm


# Encode every candidate span as the two-token string "<begin> <end-1>".
# Only the first and last token are read by the span-based overlap helpers,
# so the full list of word indices is not materialised.
# NOTE(review): `end-1` treats `end` as exclusive, while the SeqDataset docstring
# describes wordRanges as inclusive on both sides -- confirm which is intended.
span_bounds = zip(oof_df["begin"].values, oof_df["end"].values)
ps = [f"{begin} {end-1}" for begin, end in tqdm(list(span_bounds))]
oof_df["predictionstring"] = ps


100%|████████████████████████████████████████████████| 35724267/35724267 [01:34<00:00, 376738.62it/s]

In [None]:
# Inspect the first rows again now that the predictionstring column has been added.
oof_df.head()

In [None]:
# from Rob Mulla @robikscube
# https://www.kaggle.com/robikscube/student-writing-competition-twitch
def calc_overlap(row):
    """
    Compute the two overlap fractions between a prediction and a ground truth.

    `row` must expose `predictionstring_pred` and `predictionstring_gt`, each a
    string of tokens separated by single spaces.

    Returns a two-element list:
        [shared tokens / ground-truth tokens, shared tokens / predicted tokens]
    """
    pred_tokens = set(row.predictionstring_pred.split(' '))
    gt_tokens = set(row.predictionstring_gt.split(' '))

    # Number of tokens present in both strings.
    n_shared = len(gt_tokens & pred_tokens)

    return [n_shared / len(gt_tokens), n_shared / len(pred_tokens)]


def _token_overlaps(pred_string, gt_string):
    """Return (shared/len(gt), shared/len(pred)) for two space-separated token strings."""
    pred_tokens = set(pred_string.split(' '))
    gt_tokens = set(gt_string.split(' '))
    shared = len(gt_tokens & pred_tokens)
    return shared / len(gt_tokens), shared / len(pred_tokens)


def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Parameters
    ----------
    pred_df : DataFrame with columns 'id', 'class', 'predictionstring'
    gt_df   : DataFrame with columns 'id', 'discourse_type', 'predictionstring'

    Returns
    -------
    float : TP / (TP + 0.5 * (FP + FN)) computed over all classes together;
            0.0 when there are no predictions and no ground truths.

    Notes
    -----
    * The overlap computation lives in a private helper so this function does not
      depend on the module-level name `calc_overlap`, which is defined more than
      once in this file with different signatures.
    * Rows that exist on only one side of the outer merge carry a NaN id on the
      other side; those NaNs are dropped before counting so they are not reported
      as an extra false positive / false negative.
    """
    gt_df = gt_df[['id', 'discourse_type', 'predictionstring']].reset_index(drop=True).copy()
    pred_df = pred_df[['id', 'class', 'predictionstring']].reset_index(drop=True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index

    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on=['id', 'class'],
                           right_on=['id', 'discourse_type'],
                           how='outer',
                           suffixes=('_pred', '_gt')
                          )
    if joined.empty:
        # Nothing predicted and nothing to find: avoid a 0/0 below.
        return 0.0

    # Unmatched rows get a placeholder string that shares no token with a real span.
    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')

    overlap_pairs = [
        _token_overlaps(pred_string, gt_string)
        for pred_string, gt_string in zip(joined['predictionstring_pred'],
                                          joined['predictionstring_gt'])
    ]
    joined['overlap1'] = [pair[0] for pair in overlap_pairs]
    joined['overlap2'] = [pair[1] for pair in overlap_pairs]

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1', 'overlap2']].max(axis=1)
    matches = joined[joined['potential_TP']]
    tp_pred_ids = (
        matches.sort_values('max_overlap', ascending=False)
        .groupby(['id', 'predictionstring_gt'])
        .first()['pred_id']
        .values
    )

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # Sets give O(1) membership tests; dropna() removes the NaN ids created by
    # the outer merge for rows that have no counterpart.
    tp_pred_id_set = set(tp_pred_ids)
    fp_pred_ids = [p for p in joined['pred_id'].dropna().unique()
                   if p not in tp_pred_id_set]

    matched_gt_id_set = set(matches['gt_id'].unique())
    unmatched_gt_ids = [g for g in joined['gt_id'].dropna().unique()
                        if g not in matched_gt_id_set]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)

    #calc microf1
    denominator = TP + 0.5 * (FP + FN)
    if denominator == 0:
        return 0.0
    return TP / denominator

def calc_overlap_shujun(pred, gt):
    """
    Decide whether a predicted span overlaps a ground-truth span enough to be a
    potential true positive.

    Parameters
    ----------
    pred, gt : indexable pairs (first_word, last_word), both ends inclusive.

    Returns
    -------
    bool : True when the shared part covers at least half of the prediction AND
           at least half of the ground truth; False otherwise.

    Any ordinary error (e.g. a NaN in place of a span, or a zero-length span)
    yields False. Only `Exception` subclasses are caught, so KeyboardInterrupt
    and SystemExit still propagate and a long scoring loop can be interrupted.
    """
    try:
        # Signed distances from one span's start to the other span's end (+1 because
        # ends are inclusive). Both are positive exactly when the spans intersect.
        g1 = pred[1] + 1 - gt[0]
        g2 = gt[1] + 1 - pred[0]
        # Span lengths in words.
        l1 = pred[1] - pred[0] + 1
        l2 = gt[1] - gt[0] + 1

        if g1 * g2 < 0:
            # Opposite signs: the spans are disjoint.
            return False

        # The intersection can be no longer than either span or either distance.
        inter = min((g1, g2, l1, l2))
        overlap_1 = inter / l1
        overlap_2 = inter / l2
        return overlap_1 >= 0.5 and overlap_2 >= 0.5
    except Exception:
        # Best effort: malformed input simply does not match.
        return False


   
    

def score_feedback_comp_micro_shujun(pred_df, gt_df, discourse_type):
    """
    Score the predictions of a single discourse class for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Only rows whose class equals `discourse_type` are used. Each predictionstring
    is reduced to its (first token, last token) pair before comparison.

    Returns 2*TP / (number of predictions + number of ground truths), where TP is
    the number of distinct ground truths matched by at least one prediction.
    """
    def _first_last(predictionstring):
        # "a b ... z" -> (a, z) as integers
        tokens = predictionstring.split(' ')
        return (int(tokens[0]), int(tokens[-1]))

    class_gt = gt_df.loc[gt_df['discourse_type'] == discourse_type,
                         ['id', 'predictionstring']].reset_index(drop=True)
    class_pred = pred_df.loc[pred_df['class'] == discourse_type,
                             ['id', 'predictionstring']].reset_index(drop=True)

    class_pred['pred_id'] = class_pred.index
    class_gt['gt_id'] = class_gt.index

    class_pred['predictionstring'] = [_first_last(ps) for ps in class_pred['predictionstring']]
    class_gt['predictionstring'] = [_first_last(ps) for ps in class_gt['predictionstring']]

    # Step 1. all ground truths and predictions for a given class are compared.
    joined = class_pred.merge(class_gt, on='id', how='outer', suffixes=('_pred', '_gt'))
    is_match = [
        calc_overlap_shujun(pred_span, gt_span)
        for pred_span, gt_span in zip(joined.predictionstring_pred,
                                      joined.predictionstring_gt)
    ]

    # 2. A pair is a match when both overlap fractions are >= 0.5.
    # The identity of the best match is not needed for the score: counting the
    # distinct ground truths that have at least one match gives TP.
    TP = joined.loc[is_match]['gt_id'].nunique()

    # 3. Any unmatched ground truths are false negatives and any unmatched
    # predictions are false positives, so TP+FP and TP+FN are the frame sizes.
    n_predictions = len(class_pred)
    n_ground_truths = len(class_gt)

    #calc microf1
    return 2*TP / (n_predictions + n_ground_truths)

def score_feedback_comp_shujun(pred_df, gt_df, return_class_scores=False):
    """
    Average the per-class scores over every discourse type present in `gt_df`.

    Returns the mean score, or a `(mean score, {discourse_type: score})` tuple
    when `return_class_scores` is true.
    """
    class_scores = {
        discourse_type: score_feedback_comp_micro_shujun(pred_df, gt_df, discourse_type)
        for discourse_type in gt_df.discourse_type.unique()
    }
    f1 = np.mean(list(class_scores.values()))
    return (f1, class_scores) if return_class_scores else f1

# Score only the candidates whose f_7 feature exceeds 0.9999, as a quick check
# of the scoring functions on a small subset.
is_high_f7 = oof_df["f_7"] > 0.9999
sample_df = oof_df.loc[is_high_f7].reset_index(drop=True)
print(sample_df.shape)

score_feedback_comp_shujun(sample_df, gt_df, return_class_scores=True)

In [None]:
# Give every candidate row a unique positional id so results can be mapped back.
oof_df["idx"] = np.arange(len(oof_df))

# Ground-truth spans, renamed so they can be joined on (id, class).
gt_spans = gt_df[["id", "discourse_type", "predictionstring"]].rename(
    columns={"predictionstring": "gt_ps", "discourse_type": "class"}
)

# Left join: each candidate is paired with every ground truth of the same essay id
# and class; candidates without any such ground truth keep a missing gt_ps.
candidate_spans = oof_df[["idx", "id", "class", "predictionstring"]]
eval_df = candidate_spans.merge(gt_spans, how="left", on=["id", "class"])
eval_df.shape

In [None]:
# List the columns produced by the merge.
eval_df.columns

In [None]:
def calc_overlap_shujun_min(pred, gt):
    """
    Return the smaller of the two overlap fractions between a predicted span and
    a ground-truth span.

    Parameters
    ----------
    pred, gt : whitespace-separated strings of word indices; only the first and
               last token are used, as the inclusive start and end of the span.

    Returns
    -------
    min(intersection / len(pred), intersection / len(gt)), or 0 when the spans
    are disjoint or the input cannot be parsed (for example a NaN instead of a
    string, or a zero-length span).

    Only `Exception` subclasses are caught, so KeyboardInterrupt and SystemExit
    still propagate and a long loop over this function can be interrupted.
    """
    try:
        # Split each string once and keep only the span boundaries.
        pred_tokens = pred.split()
        gt_tokens = gt.split()
        pred = [int(pred_tokens[0]), int(pred_tokens[-1])]
        gt = [int(gt_tokens[0]), int(gt_tokens[-1])]

        # Signed distances from one span's start to the other span's end (+1 because
        # ends are inclusive). Both are positive exactly when the spans intersect.
        g1 = pred[1] + 1 - gt[0]
        g2 = gt[1] + 1 - pred[0]
        # Span lengths in words.
        l1 = pred[1] - pred[0] + 1
        l2 = gt[1] - gt[0] + 1

        if g1 * g2 < 0:
            # Opposite signs: the spans are disjoint.
            return 0

        # The intersection can be no longer than either span or either distance.
        inter = min((g1, g2, l1, l2))
        overlap_1 = inter / l1
        overlap_2 = inter / l2
        return min(overlap_1, overlap_2)
    except Exception:
        # Best effort: unparsable input counts as no overlap.
        return 0
def calc_overlap(predictionstring, gt_ps):
    """
    Token-set overlap between a prediction string and a ground-truth string.

    Both arguments are converted with `str()` and split on single spaces; the
    result is the smaller of (shared / ground-truth tokens) and
    (shared / predicted tokens).

    NOTE: this reuses the name of the row-based `calc_overlap(row)` defined
    earlier in this file and replaces it once this definition is executed.
    """
    pred_tokens = set(str(predictionstring).split(" "))
    gt_tokens = set(str(gt_ps).split(" "))

    # Tokens present in both strings.
    n_shared = len(gt_tokens & pred_tokens)

    share_of_gt = n_shared / len(gt_tokens)
    share_of_pred = n_shared / len(pred_tokens)
    return min(share_of_gt, share_of_pred)

    
    
# Overlap score for every (candidate, ground truth) pair in eval_df, in row order.
span_pairs = zip(eval_df["predictionstring"].values, eval_df["gt_ps"].values)
overlap = [
    calc_overlap_shujun_min(predictionstring, gt_ps)
    for predictionstring, gt_ps in tqdm(list(span_pairs))
]
    
    


In [None]:
# Attach the computed scores; `overlap` has one entry per eval_df row, in row order.
eval_df["overlap"] = overlap

In [None]:
# Keep, for every candidate (idx), its highest overlap across the paired ground truths.
# eval_df is overwritten: it now has only the columns idx and overlap.
eval_df = eval_df.groupby("idx")["overlap"].max().reset_index()
eval_df.shape

In [None]:
# First rows of the per-candidate maximum overlaps.
eval_df.head()

In [None]:
# Last rows of the per-candidate maximum overlaps.
eval_df.tail()

In [None]:
# Show the first and last idx values of oof_df
# (presumably to compare by eye with the eval_df head/tail shown before -- confirm).
oof_df.head()["idx"], oof_df.tail()["idx"]

In [None]:
# The assignment below is positional (.values), so it is only correct when both
# frames list the same idx values in the same order. Check it instead of assuming it.
assert np.array_equal(eval_df["idx"].values, oof_df["idx"].values), \
    "eval_df rows are not aligned with oof_df rows"

# Fill missing overlaps BEFORE assigning. The previous
# `oof_df["overlap"].fillna(0.0, inplace=True)` operated on a temporary Series,
# which does not update oof_df when pandas Copy-on-Write is active.
oof_df["overlap"] = eval_df["overlap"].fillna(0.0).values

# Distribution of the best overlap per candidate.
oof_df["overlap"].hist(bins=50)

In [None]:
# Persist the table with the overlap column to the cache folder; the (repeating)
# index is not written because index=False.
oof_df.to_parquet(f"{folder}/new_oof_shujun_overlap_calc.parquet", index=False)