In [1]:
import pandas as pd
import numpy as np
import importlib

In [2]:
# Digit characters '2' through '9'.
numbers = [str(digit) for digit in range(2, 10)]

# Spelled-out numbers: the lower-case forms first, then the same words capitalised.
number_words = [
    'two', 'three', 'four', 'five', 'six',
    'seven', 'eight', 'nine', 'ten',
]
number_words = number_words + [word.capitalize() for word in number_words]

# Currency vocabulary: lower-case and capitalised names, plus the matching symbols.
currency_words = ['euro', 'yen']
currency_words = currency_words + [word.capitalize() for word in currency_words]
currency_signs = ['€', '¥']

# Tag values used to select rows by spoiler type.
selection_multi = ['multi']
selection_phrase = ['phrase']
selection_passage = ['passage']
#baseline = importlib.import_module('transformer-baseline-task-1')

def has_number_word(inputString):
    """Return True if any whitespace-separated token of inputString is in number_words.

    Tokens are compared exactly, so a token with attached punctuation
    (e.g. 'five,') does not match.
    """
    for token in inputString.split():
        if token in number_words:
            return True
    return False

def has_number(inputString):
    """Return True if at least one character of inputString satisfies str.isdigit()."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

def has_currency_word(inputString):
    """Return True if any whitespace-separated token of inputString is in currency_words.

    Matching is exact per token; no case folding or punctuation stripping is done.
    """
    tokens = inputString.split()
    matches = [token for token in tokens if token in currency_words]
    return len(matches) > 0

def has_currency_sign(inputString):
    """Return True if any single character of inputString is an entry of currency_signs."""
    for character in inputString:
        if character in currency_signs:
            return True
    return False

# Load the validation split (JSON Lines) and keep only the first element of each postText entry.
df = pd.read_json('./validation.jsonl', lines=True)
df['postText'] = df['postText'].map(lambda texts: texts[0])

# Per-post flags: does the post mention a currency, and does it mention a number?
df['st_contains_currency_sign_or_word'] = df['postText'].map(
    lambda text: has_currency_sign(text) or has_currency_word(text)
)
df['st_contains_numbers_or_number_words'] = df['postText'].map(
    lambda text: has_number(text) or has_number_word(text)
)

# Heuristic 1: a number without any currency mention is labelled 'multi'.
conditions = [
    df['st_contains_numbers_or_number_words'].eq(True)
    & df['st_contains_currency_sign_or_word'].eq(False)
]
df['multiByCondition'] = np.select(conditions, ['multi'], '')

# Row labels of every post this heuristic labelled 'multi'.
mask_multi_pred = df.multiByCondition.map(
    lambda label: any(tag for tag in selection_multi if tag in label)
)
index_array_conditional = df.loc[mask_multi_pred].index

In [3]:
def _tag_mask(wanted):
    """Boolean Series that is True where a row's tags contain at least one entry of `wanted`."""
    return df.tags.apply(lambda tags: any(item for item in wanted if item in tags))

# Ground-truth masks, one per spoiler type.
mask_multi_act = _tag_mask(selection_multi)
mask_phrase_act = _tag_mask(selection_phrase)
mask_passage_act = _tag_mask(selection_passage)

# Number of target paragraphs per post, and its mean within each spoiler type.
df['targetParagraphsAmount'] = df['targetParagraphs'].map(len)
average_multi_target_amounts = df.loc[mask_multi_act, 'targetParagraphsAmount'].mean()
average_phrase_target_amounts = df.loc[mask_phrase_act, 'targetParagraphsAmount'].mean()
average_passage_target_amounts = df.loc[mask_passage_act, 'targetParagraphsAmount'].mean()

def multiHasLowestDistance(targetParagraphAmount):
    """Return True if targetParagraphAmount is at least as close to the 'multi' mean as to the others.

    Compares the absolute distance to average_multi_target_amounts with the
    distances to the passage and phrase means; a tie with the smallest
    distance still counts as True.
    """
    to_multi = abs(targetParagraphAmount - average_multi_target_amounts)
    to_passage = abs(targetParagraphAmount - average_passage_target_amounts)
    to_phrase = abs(targetParagraphAmount - average_phrase_target_amounts)
    smallest = sorted([to_multi, to_passage, to_phrase])[0]
    return bool(smallest == to_multi)

# Heuristic 2: posts whose paragraph count lies closest to the 'multi' average.
lowest_dist_to_multi_avg = df['targetParagraphs'].map(
    lambda paragraphs: multiHasLowestDistance(len(paragraphs))
)
index_array_distance = df.loc[lowest_dist_to_multi_avg].index

In [8]:
def contains_recipe_words(inputString):
    """Return True if any whitespace-separated token of inputString is a listed recipe unit.

    Only the exact abbreviations below (including the trailing dot) match.
    """
    recipe_units = ['tbsp.', 'Tbsp.', 'tbs.', 'Tbs.', 'oz.', 'Oz.']
    for token in inputString.split():
        if token in recipe_units:
            return True
    return False

def get_number(inputString):
    """Return the first decimal digit found in inputString as an int, or 0 if there is none.

    The earlier version returned from inside the loop on the very first
    character, so a digit in any later position was ignored and an empty
    string produced None instead of a number.

    Only a single digit is returned: '12' yields 1, not 12.
    """
    for char in inputString:
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as '²', which int() cannot convert and would raise ValueError on.
        if char.isdecimal():
            return int(char)
    return 0

def contains_explicit_enumeration(targetParagraphs):
    """Return True if the paragraphs look like a numbered list.

    Each paragraph whose first two characters contain a digit is given a
    number via get_number. The result is True when that number is larger
    than the previous numbered paragraph's at least twice (the first one is
    compared against 0). Any paragraph containing a recipe unit makes the
    result False immediately.
    """
    ascending_steps = 0
    previous_number = 0
    for paragraph in targetParagraphs:
        if contains_recipe_words(paragraph):
            return False
        prefix = paragraph[:2]
        if not has_number(prefix):
            continue
        current_number = get_number(prefix)
        if current_number > previous_number:
            ascending_steps += 1
        previous_number = current_number
    return ascending_steps >= 2

# Reload the validation split; columns added to the previous frame are not carried over.
df = pd.read_json('./validation.jsonl', lines=True)

# Heuristic 3: target paragraphs that look like an explicit numbered list.
df['containsExplicitEnumeration'] = df.targetParagraphs.map(contains_explicit_enumeration)
conditions = [df['containsExplicitEnumeration'].eq(True)]
df['spoilerType'] = np.select(conditions, ['multi'], '')

# Row labels of every post this heuristic labelled 'multi'.
mask_multi_pred = df.spoilerType.map(
    lambda label: any(tag for tag in selection_multi if tag in label)
)
index_array_explicit_enum = df.loc[mask_multi_pred].index

In [9]:
# Majority vote: keep every row label that at least two of the three heuristics selected.
vote_sources = (index_array_conditional, index_array_distance, index_array_explicit_enum)
indecesMajVoted = [
    row_label
    for row_label in range(df.index.start, df.index.stop)
    if sum(row_label in selected for selected in vote_sources) >= 2
]

In [12]:
# Mark the majority-voted rows as predicted multi-part spoilers.
# NOTE(review): rows that still hold 'multi' from the enumeration step also match the
# substring test below, so they count as predicted even without a majority - confirm intent.
df.loc[indecesMajVoted, 'spoilerType'] = '[multi]'

def _mentions_multi(value):
    """True if any entry of selection_multi is contained in value (list membership or substring)."""
    return any(item for item in selection_multi if item in value)

# Actual and predicted masks, with their complements.
mask_multi_act = df.tags.apply(_mentions_multi)
mask_multi_not_act = df.tags.apply(lambda tags: not _mentions_multi(tags))
mask_multi_pred = df.spoilerType.apply(_mentions_multi)
mask_multi_not_pred = df.spoilerType.apply(lambda label: not _mentions_multi(label))

# Confusion-matrix cells for the binary "is multi" decision.
true_positive = int((mask_multi_pred & mask_multi_act).sum())
false_positive = int((mask_multi_pred & mask_multi_not_act).sum())
true_negative = int((mask_multi_not_pred & mask_multi_not_act).sum())
false_negative = int((mask_multi_not_pred & mask_multi_act).sum())

print('TP: ', true_positive)
print('FP: ', false_positive)
print('TN: ', true_negative)
print('FN: ', false_negative)

total_rows = true_positive + false_positive + true_negative + false_negative
accuracy = (true_positive + true_negative) / total_rows
print('Accuracy: ', accuracy)
df

TP:  59
FP:  35
TN:  622
FN:  84
Accuracy:  0.85125


Unnamed: 0,uuid,postId,postText,postPlatform,targetParagraphs,targetTitle,targetDescription,targetKeywords,targetMedia,targetUrl,provenance,spoiler,spoilerPositions,tags,containsExplicitEnumeration,spoilerType
0,6dc7ddef-4e8e-4a6b-9296-526377518071,800048986762423_885081814925806,[Five Nights at Freddy’s Sequel Delayed for We...,Facebook,[Five Nights at Freddy’s creator Scott Cawthon...,Five Nights at Freddy’s Sequel Delayed for Wei...,Five Nights at Freddy's creator Scott Cawthon ...,,[https://gamerant.com/wp-content/uploads/five-...,https://gamerant.com/five-nights-at-freddys-se...,"{'source': 'anonymized', 'humanSpoiler': 'They...",[some of the plot elements are so disturbing t...,"[[[2, 158], [2, 236]]]",[passage],False,
1,435b24de-56f6-4d4e-9c38-54b8e0630aac,4jp20e,[Why Arizona Sheriff Joe Arpaio’s fate could h...,reddit,[© REUTERS/Laura Segall Maricopa County Sherif...,Why Arizona Sheriff Joe Arpaio’s fate could ha...,"<p>A single word — ""intentionally"" — could tra...",,[https://archive.is/0eGBT/bbf6d79b366fc7e75b8b...,https://archive.is/0eGBT,"{'source': 'anonymized', 'humanSpoiler': '""Int...","[""intentionally"", could transform a court case...","[[[0, 197], [0, 212]], [[0, 215], [0, 328]]]",[multi],False,
2,8091ba93-6376-473a-9117-01d6cf0507e4,857559227384160256,[Here’s how much you should be tipping your ha...,Twitter,[Here’s how much you should be tipping your ha...,Here’s how much you should be tipping your hai...,Remembering how much you’re supposed to tip an...,,,,"{'source': 'anonymized', 'humanSpoiler': None,...",[20%],"[[[3, 58], [3, 61]]]",[phrase],False,
3,8b713277-0e5b-4873-a216-b650f21f3b4c,389091583213453312,"[""Harry Potter"" alums reunite for new movie]",Twitter,[The mythology of punk music's evolution can b...,"Alan Rickman & Rupert Grint On 'CBGB,' Reuniti...",The mythology of punk music's evolution can be...,"Alan Rickman,Hilly Kristal,new rupert grint mo...",[http://s.m.huffpost.com/assets/Logo_Huffingto...,http://huff.to/1ccNwKJ,"{'source': 'anonymized', 'humanSpoiler': 'Alan...","[Alan Rickman & Rupert Grint, CBGB]","[[[-1, 0], [-1, 27]], [[0, 98], [0, 102]]]",[multi],False,
4,a2f91b65-c36c-481f-92b1-2fc77d6411fc,744740593046274048,[A man swallowed a microSD card and you won't ...,Twitter,[PetaPixel is one of my favorite blogs. The wr...,Man swallowed a microSD card and you won't bel...,PetaPixel is one of my favorite blogs. The wri...,,[https://cdn0.vox-cdn.com/uploads/chorus_image...,http://www.theverge.com/circuitbreaker/2016/6/...,"{'source': 'anonymized', 'humanSpoiler': 'This...",[a man who swallowed a 64GB microSD card and t...,"[[[1, 34], [1, 108]]]",[passage],False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,1189d343-42eb-47e7-8395-ff978a683875,428006164904034305,[This is what happens when you leave a hotel c...,Twitter,[Instead of encountering a mound of dirty towe...,This Is What Happens When You Leave A Hotel Cl...,Instead of encountering a mound of dirty towel...,"givebackfilms,give back films,video,random act...",[http://s.m.huffpost.com/assets/Logo_Huffingto...,http://huff.to/1ebARdm,"{'source': 'anonymized', 'humanSpoiler': 'She ...",[The video below shows the stunned cleaner ini...,"[[[3, 0], [3, 150]]]",[passage],False,
796,7912282b-137b-4098-875d-8ad9f19354a8,806153730206892032,[This Texas GOP elector announces that he won'...,Twitter,[A Republican elector in Texas says he will no...,Texas GOP elector announces he won't vote for ...,The Electoral College vote for president on De...,"donald trump, texas, electoral college, faithl...",,,"{'source': 'anonymized', 'humanSpoiler': None,...",[Christopher Suprun],"[[[1, 45], [1, 63]]]",[phrase],False,
797,1fdf71e8-ec14-4c3b-a7c5-ca678c6f8ccb,847331053991813120,[This beauty queen cured her acne with one die...,Twitter,[Her inspirational journey is encouraging othe...,UK beauty queen cured her severe acne with one...,A beauty pageant contestant has taken to Insta...,"acne, Skincare, beauty, Beauty pageant, Dermat...",,,"{'source': 'anonymized', 'humanSpoiler': None,...","[Rachel Crawley, High fat vegan plant based di...","[[[2, 144], [2, 158]], [[6, 56], [6, 124]]]",[multi],False,
798,17f6b540-cf8d-4ddf-8321-1c9ce2315d71,788056531304583168,[WikiLeaks' Julian Assange Reported Dead],Twitter,"[On 16 October 2016, WikiLeaks posted a series...","WikiLeaks’ Julian Assange Isn’t Dead, Just Off...",A series of mysterious tweets from WikiLeaks l...,"dead man's switch, julian assange, wikileaks",[http://static.snopes.com/app/themes/snopes-th...,http://trib.al/leR8lNw,"{'source': 'anonymized', 'humanSpoiler': 'It w...",[Julian Assange’s internet link has been inten...,"[[[11, 0], [11, 78]]]",[passage],False,
