In [17]:
import pandas as pd
import numpy as np
df_bl = pd.read_json('./baseline_results.json', lines=False)
df_bl_eq = np.where(df_bl['tags'] == df_bl['predicted'])
tp_bl = len(df_bl_eq[0])
acc_bl = tp_bl/len(df_bl.index)

df = pd.read_json('./baseline_simplified_extended_data_results.json', lines=False)
df_eq = np.where(df['tags'] == df['predicted'])
tp = len(df_eq[0])
acc = tp/len(df.index)

print('Accuracy Baseline: ', acc_bl)
print('Accuracy Simplified Extended: ', acc)

Accuracy Baseline:  0.5275
Accuracy Simplified Extended:  0.5275


In [4]:
import importlib

numbers = ['2','3','4','5','6','7','8','9']
number_words = ['two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven', 'Eight', 'Nine', 'Ten']
currency_words = ['euro', 'yen', 'Euro','Yen']
currency_signs = ['€', '¥']
selection_multi = ['multi']
selection_phrase = ['phrase']
selection_passage = ['passage']
#baseline = importlib.import_module('transformer-baseline-task-1')

def has_number_word(inputString):
    return any(word in number_words for word in inputString.split())

def has_number(inputString):
    return any(char.isdigit() for char in inputString)

def has_currency_word(inputString):
    return any(word in currency_words for word in inputString.split())

def has_currency_sign(inputString):
    return any(char in currency_signs for char in inputString)

### baseline analysis

In [5]:
baseline_results = pd.read_json('./baseline_results.json', lines=False)

mask_multi_act = baseline_results.tags.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_act = baseline_results.tags.apply(lambda x: not any(item for item in selection_multi if item in x))
mask_multi_pred = baseline_results.predicted.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_pred = baseline_results.predicted.apply(lambda x: not any(item for item in selection_multi if item in x))

true_positive = len(baseline_results[mask_multi_pred & mask_multi_act].index)
false_positive = len(baseline_results[mask_multi_pred & mask_multi_not_act].index)
true_negative = len(baseline_results[mask_multi_not_pred & mask_multi_not_act].index)
false_negative = len(baseline_results[mask_multi_not_pred & mask_multi_act].index)

print('TP: ', true_positive)
print('FP: ', false_positive)
print('TN: ', true_negative)
print('FN: ', false_negative)

accuracy = (true_positive + true_negative) / (true_positive + false_positive + true_negative + false_negative)
print('Accuracy: ', accuracy)


TP:  56
FP:  18
TN:  639
FN:  87
Accuracy:  0.86875


### multi predicted by numbers contained in post text analysis

In [6]:
results = pd.read_json('./exported.json', lines=False)

mask_multi_act = results.tags.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_act = results.tags.apply(lambda x: not any(item for item in selection_multi if item in x))
mask_multi_pred = results.spoilerType_y.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_pred = results.spoilerType_y.apply(lambda x: not any(item for item in selection_multi if item in x))

true_positive = len(results[mask_multi_pred & mask_multi_act].index)
false_positive = len(results[mask_multi_pred & mask_multi_not_act].index)
true_negative = len(results[mask_multi_not_pred & mask_multi_not_act].index)
false_negative = len(results[mask_multi_not_pred & mask_multi_act].index)

print('TP: ', true_positive)
print('FP: ', false_positive)
print('TN: ', true_negative)
print('FN: ', false_negative)

accuracy = (true_positive + true_negative) / (true_positive + false_positive + true_negative + false_negative)
print('Accuracy: ', accuracy)

TP:  82
FP:  87
TN:  570
FN:  61
Accuracy:  0.815


### detect multipart spoilers by applying multiple conditions

In [7]:
df['st_contains_currency_sign_or_word'] = df['postText'].apply(lambda p: has_currency_sign(p) or has_currency_word(p))
df['st_contains_numbers_or_number_words'] = df['postText'].apply(lambda p: has_number(p) or has_number_word(p))

conditions = [
    ((df['st_contains_numbers_or_number_words'] == True) & (df['st_contains_currency_sign_or_word'] == False))
]

df['spoilerType'] = np.select(conditions,['multi'],'')
mask_multi_pred = df.spoilerType.apply(lambda x: any(item for item in selection_multi if item in x))
index_array = df[mask_multi_pred].index


In [8]:
results2 = pd.read_json('./baseline_results.json', lines=False)
results2.loc[index_array,'predicted'] = '[multi]'

mask_multi_act = results2.tags.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_act = results2.tags.apply(lambda x: not any(item for item in selection_multi if item in x))
mask_multi_pred = results2.predicted.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_pred = results2.predicted.apply(lambda x: not any(item for item in selection_multi if item in x))

true_positive = len(results2[mask_multi_pred & mask_multi_act].index)
false_positive = len(results2[mask_multi_pred & mask_multi_not_act].index)
true_negative = len(results2[mask_multi_not_pred & mask_multi_not_act].index)
false_negative = len(results2[mask_multi_not_pred & mask_multi_act].index)

print('TP: ', true_positive)
print('FP: ', false_positive)
print('TN: ', true_negative)
print('FN: ', false_negative)

accuracy = (true_positive + true_negative) / (true_positive + false_positive + true_negative + false_negative)
print('Accuracy: ', accuracy)

TP:  86
FP:  93
TN:  564
FN:  57
Accuracy:  0.8125


### target paragraph analysis

In [9]:
df = pd.read_json('./validation.jsonl', lines=True)
mask_multi_act = df.tags.apply(lambda x: any(item for item in selection_multi if item in x))
mask_phrase_act = df.tags.apply(lambda x: any(item for item in selection_phrase if item in x))
mask_passage_act = df.tags.apply(lambda x: any(item for item in selection_passage if item in x))
df['postText'] = df['postText'].apply(lambda p: p[0])
df['targetParagraphsAmount'] = df['targetParagraphs'].apply(lambda p: len(p))
average_multi_target_amounts = df[mask_multi_act]['targetParagraphsAmount'].mean()
average_phrase_target_amounts = df[mask_phrase_act]['targetParagraphsAmount'].mean()
average_passage_target_amounts = df[mask_passage_act]['targetParagraphsAmount'].mean()
median_multi_target_amounts = df[mask_multi_act]['targetParagraphsAmount'].median()
median_phrase_target_amounts = df[mask_phrase_act]['targetParagraphsAmount'].median()
median_passage_target_amounts = df[mask_passage_act]['targetParagraphsAmount'].median()
stdev_multi_target_amounts = df[mask_multi_act]['targetParagraphsAmount'].std()
stdev_phrase_target_amounts = df[mask_phrase_act]['targetParagraphsAmount'].std()
stdev_passage_target_amounts = df[mask_passage_act]['targetParagraphsAmount'].std()


print('multi: mean / stdev / median: ', average_multi_target_amounts, ' / ', stdev_multi_target_amounts, ' / ', median_multi_target_amounts)
print('phrase: mean / stdev / median: ', average_phrase_target_amounts, ' / ', stdev_phrase_target_amounts, ' / ', median_phrase_target_amounts)
print('passage: mean / stdev / median: ', average_passage_target_amounts, ' / ', stdev_passage_target_amounts, ' / ', median_passage_target_amounts)

multi: mean / stdev / median:  23.503496503496503  /  20.191991902339467  /  19.0
phrase: mean / stdev / median:  11.292537313432836  /  8.99955610071025  /  9.0
passage: mean / stdev / median:  13.869565217391305  /  14.000725586352  /  10.0


In [10]:
results3 = pd.read_json('./ap2_exported.json', lines=False)

mask_multi_act = results3.tags.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_act = results3.tags.apply(lambda x: not any(item for item in selection_multi if item in x))
mask_multi_pred = results3.spoilerType_y.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_pred = results3.spoilerType_y.apply(lambda x: not any(item for item in selection_multi if item in x))

true_positive = len(results3[mask_multi_pred & mask_multi_act].index)
false_positive = len(results3[mask_multi_pred & mask_multi_not_act].index)
true_negative = len(results3[mask_multi_not_pred & mask_multi_not_act].index)
false_negative = len(results3[mask_multi_not_pred & mask_multi_act].index)

print('TP: ', true_positive)
print('FP: ', false_positive)
print('TN: ', true_negative)
print('FN: ', false_negative)

accuracy = (true_positive + true_negative) / (true_positive + false_positive + true_negative + false_negative)
print('Accuracy: ', accuracy)

TP:  68
FP:  24
TN:  633
FN:  75
Accuracy:  0.87625


### explicit article enumeration

In [36]:
def contains_recipe_words(inputString):
    return any(word in ['tbsp.', 'Tbsp.', 'tbs.', 'Tbs.', 'oz.', 'Oz.'] for word in inputString.split())

def get_number(inputString):
    for char in inputString:
        return int(char) if char.isdigit() else 0

def contains_explicit_enumeration(targetParagraphs):
    predIsSmallerCounter = 0;
    lastNumber = 0;
    for targetParagraph in targetParagraphs:
        if(contains_recipe_words(targetParagraph)): return False
        firstCharsOfParagraph = targetParagraph[0:2]
        number_exists = has_number(firstCharsOfParagraph)
        if number_exists:
            currentNumber = get_number(firstCharsOfParagraph)
            if lastNumber < currentNumber: predIsSmallerCounter = predIsSmallerCounter + 1;
            lastNumber = currentNumber;
    return True if predIsSmallerCounter >= 2 else False

df = pd.read_json('./validation.jsonl', lines=True)
df['containsExplicitEnumeration'] = df.targetParagraphs.apply(lambda x: contains_explicit_enumeration(x))
conditions = [
    ((df['containsExplicitEnumeration'] == True))
]
df['spoilerType'] = np.select(conditions,['multi'],'')
mask_multi_pred = df.spoilerType.apply(lambda x: any(item for item in selection_multi if item in x))
#index_array = df[mask_multi_pred].index
#results = pd.read_json('./baseline_results.json', lines=False)
#results.loc[index_array,'predicted'] = '[multi]'

mask_multi_act = df.tags.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_act = df.tags.apply(lambda x: not any(item for item in selection_multi if item in x))
mask_multi_pred = df.spoilerType.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_pred = df.spoilerType.apply(lambda x: not any(item for item in selection_multi if item in x))

true_positive = len(df[mask_multi_pred & mask_multi_act].index)
false_positive = len(df[mask_multi_pred & mask_multi_not_act].index)
true_negative = len(df[mask_multi_not_pred & mask_multi_not_act].index)
false_negative = len(df[mask_multi_not_pred & mask_multi_act].index)

print('TP: ', true_positive)
print('FP: ', false_positive)
print('TN: ', true_negative)
print('FN: ', false_negative)

accuracy = (true_positive + true_negative) / (true_positive + false_positive + true_negative + false_negative)
print('Accuracy: ', accuracy)

df[mask_multi_pred & mask_multi_not_act]


TP:  39
FP:  28
TN:  629
FN:  104
Accuracy:  0.835


Unnamed: 0,uuid,postId,postText,postPlatform,targetParagraphs,targetTitle,targetDescription,targetKeywords,targetMedia,targetUrl,provenance,spoiler,spoilerPositions,tags,containsExplicitEnumeration,spoilerType
61,8bd60f06-a4cb-4694-91ed-e6c33cbc6183,844109892399324_848006502009663,[This Guy Put An Ice Cream Cone In His Pocket....,Facebook,[Usually if you follow your conscience and kno...,Here Are 11 Ridiculously Strange Laws That Sti...,You probably had no idea that these weird arre...,,[http://boredomtherapy.com/wp-content/themes/b...,http://boredomtherapy.com/weird-american-laws/,"{'source': 'anonymized', 'humanSpoiler': 'Alab...",[If you find yourself so moved to store ice cr...,"[[[19, 0], [19, 142]]]",[passage],True,multi
66,d7a0e8fc-d7ff-40e0-9c49-f4bacb9191b5,1100188040002123_1139002736120653,[Explainer: Is There Any Science Behind Astrol...,Facebook,"[Photo credit: pixabay.com, Looking at the sta...",Explainer: Is There Any Science Behind Astrology?,A hospital in Argentina is reportedly using as...,"is astrology real, is astrology backed by scie...",[http://thescienceexplorer.com/sites/thescienc...,http://thescienceexplorer.com/brain-and-body/e...,"{'source': 'anonymized', 'humanSpoiler': 'Nope...","[""Although astrologers seek to explain the nat...","[[[16, 44], [16, 225]]]",[passage],True,multi
80,bea116e2-551d-4b16-9ed8-873c4efc6c65,812123578455957504,[This company wants you to bring your dog to w...,Twitter,[When we visited Nvidia (NVDA) earlier this mo...,This company wants you to bring your dog to work,When we visited Nvidia (NVDA) earlier this mon...,,,,"{'source': 'anonymized', 'humanSpoiler': None,...",[Nvidia],"[[[0, 16], [0, 22]]]",[phrase],True,multi
83,70b3c783-23e1-4296-8a8f-74e5433c2046,572971196188393472,[This is how much coffee Americans drinks ever...,Twitter,[Coffee is one of life's greatest gifts. Not o...,How Much Coffee Do Americans Drink Every Day?,Coffee is one of life's greatest gifts. Not on...,"coffee,how-much-coffee-is-safe-to-drink,how-mu...",[http://modulous.huffpost.com/2015/11/13/14474...,http://huff.to/1AjdyVP,"{'source': 'anonymized', 'humanSpoiler': '2.1 ...",[2.1 coffee drinks],"[[[4, 0], [4, 17]]]",[phrase],True,multi
160,1c00202b-d168-4183-beaf-149e74a84fa0,388350033805840384,[@LadyGaga and @RKelly team up],Twitter,"[Ever the queen of shock value, Lady Gaga does...",Lady Gaga Has A Surprise Collaborator On 'ARTPOP',"Ever the queen of shock value, Lady Gaga does ...","R. Kelly rapper,lady gaga new album,Lady Gaga ...",[http://i.huffpost.com/gadgets/slideshows/2145...,http://huff.to/19osgBh,"{'source': 'anonymized', 'humanSpoiler': 'TO F...","[on ""Do What U Want""]","[[[2, 256], [2, 275]]]",[phrase],True,multi
235,2a817e10-5c5a-4a14-884e-5fb8f6442ec5,505545482979561_511097069091069,[Antibiotics will soon 'stop working'],Facebook,[Urgent action is needed to control the use of...,Antibiotics will stop working at a 'terrible h...,Urgent action is needed to control the use of ...,"Health, Science, Medicine, Antibiotics, Cancer...",[https://static.independent.co.uk/s3fs-public/...,http://www.independent.co.uk/life-style/health...,"{'source': 'anonymized', 'humanSpoiler': 'Anti...",[Resistance to antibiotics is growing at such ...,"[[[1, 0], [1, 106]]]",[passage],True,multi
278,9e2e47c4-8f21-45ce-b59d-dfaa7dfe1640,408112290924204034,[And the US state with the largest average pen...,Twitter,[Look. As much as you might hear that size doe...,Penis Sizes By State: How Does Yours Measure Up?,Look. As much as you might hear that size does...,"Penis Sizes By State,states ranked by penis si...",[http://i.huffpost.com/gen/1495169/images/n-PE...,http://huff.to/1ckcYiu,"{'source': 'anonymized', 'humanSpoiler': 'Nort...",[1 North Dakota],"[[[3, 0], [3, 14]]]",[phrase],True,multi
306,c667c056-0b3a-4cfc-a5ad-31f7b078c129,396395351667531776,"[Where ""Kanye West"" is more popular than ""Jesu...",Twitter,[Kanye West has used Jesus Christ in his art s...,Is 'Kanye West' More Popular Than 'Jesus Christ'?,Kanye West has used Jesus Christ in his art si...,"kanye west yeezus tour,kanye west yeezus,twitt...",[http://big.assets.huffingtonpost.com/kanyejes...,http://huff.to/1aOFRT8,"{'source': 'anonymized', 'humanSpoiler': 'Twit...",[original Tweets],"[[[6, 67], [6, 82]]]",[phrase],True,multi
326,1c2858d9-a89b-4a7e-8fe7-5da50fe09238,820420235706691584,[This 28-year-old retired with a $2 million ne...,Twitter,"[In 2016, JP Livingston celebrated her 28th bi...",How A 28-Year-Old Who Retired With A $2 Millio...,Check out exactly what her investing strategy ...,,,,"{'source': 'anonymized', 'humanSpoiler': None,...",[transitioning my portfolio into more income-b...,"[[[12, 55], [12, 179]]]",[passage],True,multi
327,6176bc74-00ba-43e6-860b-234127227d2a,549418942511710208,[How many millions did The Interview make onli...,Twitter,"[Strong debuts for Unbroken, Into the Woods. T...",The Hobbit Remains No. 1 at Christmas Weekend ...,"The Hobbit dominated again, but how did Unbrok...",,[http://assets1.ignimgs.com/2014/10/27/hobbitj...,http://go.ign.com/tXBadbB,"{'source': 'anonymized', 'humanSpoiler': '15.'...",[$15 million],"[[[8, 323], [8, 334]]]",[phrase],True,multi
