In [2]:
import pandas as pd
import numpy as np
import importlib

In [3]:
numbers = ['2','3','4','5','6','7','8','9']
number_words = ['two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven', 'Eight', 'Nine', 'Ten']
currency_words = ['euro', 'yen', 'Euro','Yen']
currency_signs = ['€', '¥']
selection_multi = ['multi']
selection_phrase = ['phrase']
selection_passage = ['passage']
#baseline = importlib.import_module('transformer-baseline-task-1')

def has_number_word(inputString):
    return any(word in number_words for word in inputString.split())

def has_number(inputString):
    return any(char.isdigit() for char in inputString)

def has_currency_word(inputString):
    return any(word in currency_words for word in inputString.split())

def has_currency_sign(inputString):
    return any(char in currency_signs for char in inputString)

df = pd.read_json('../data/validation.jsonl', lines=True)
df['postText'] = df['postText'].apply(lambda p: p[0])

df['st_contains_currency_sign_or_word'] = df['postText'].apply(lambda p: has_currency_sign(p) or has_currency_word(p))
df['st_contains_numbers_or_number_words'] = df['postText'].apply(lambda p: has_number(p) or has_number_word(p))
conditions = [
    ((df['st_contains_numbers_or_number_words'] == True) & (df['st_contains_currency_sign_or_word'] == False))
]
df['multiByCondition'] = np.select(conditions,['multi'],'')
mask_multi_pred = df.multiByCondition.apply(lambda x: any(item for item in selection_multi if item in x))
index_array_conditional = df[mask_multi_pred].index

In [4]:
mask_multi_act = df.tags.apply(lambda x: any(item for item in selection_multi if item in x))
mask_phrase_act = df.tags.apply(lambda x: any(item for item in selection_phrase if item in x))
mask_passage_act = df.tags.apply(lambda x: any(item for item in selection_passage if item in x))
df['targetParagraphsAmount'] = df['targetParagraphs'].apply(lambda p: len(p))
average_multi_target_amounts = df[mask_multi_act]['targetParagraphsAmount'].mean()
average_phrase_target_amounts = df[mask_phrase_act]['targetParagraphsAmount'].mean()
average_passage_target_amounts = df[mask_passage_act]['targetParagraphsAmount'].mean()

def multiHasLowestDistance(targetParagraphAmount):
    distanceMultiMean = abs(targetParagraphAmount - average_multi_target_amounts)
    distancePassageMean = abs(targetParagraphAmount - average_passage_target_amounts)
    distancePhraseMean = abs(targetParagraphAmount - average_phrase_target_amounts)
    distance_list = [distanceMultiMean, distancePassageMean, distancePhraseMean]
    distance_list.sort()
    if distance_list[0] == distanceMultiMean: return True
    return False

lowest_dist_to_multi_avg = df['targetParagraphs'].apply(lambda p: multiHasLowestDistance(len(p)))
index_array_distance = df[lowest_dist_to_multi_avg].index

In [6]:
def contains_recipe_words(inputString):
    return any(word in ['tbsp.', 'Tbsp.', 'tbs.', 'Tbs.', 'oz.', 'Oz.'] for word in inputString.split())

def get_number(inputString):
    for char in inputString:
        return int(char) if char.isdigit() else 0

def contains_explicit_enumeration(targetParagraphs):
    predIsSmallerCounter = 0;
    lastNumber = 0;
    for targetParagraph in targetParagraphs:
        if(contains_recipe_words(targetParagraph)): return False
        firstCharsOfParagraph = targetParagraph[0:2]
        number_exists = has_number(firstCharsOfParagraph)
        if number_exists:
            currentNumber = get_number(firstCharsOfParagraph)
            if lastNumber < currentNumber: predIsSmallerCounter = predIsSmallerCounter + 1;
            lastNumber = currentNumber;
    return True if predIsSmallerCounter >= 2 else False

df = pd.read_json('../data/validation.jsonl', lines=True)
df['containsExplicitEnumeration'] = df.targetParagraphs.apply(lambda x: contains_explicit_enumeration(x))
conditions = [
    ((df['containsExplicitEnumeration'] == True))
]
df['spoilerType'] = np.select(conditions,['multi'],'')
mask_multi_pred = df.spoilerType.apply(lambda x: any(item for item in selection_multi if item in x))
index_array_explicit_enum = df[mask_multi_pred].index

In [7]:
indecesMajVoted = []
for index in range(df.index.start, df.index.stop):
    arrayCount = 0
    if index in index_array_conditional: arrayCount = arrayCount + 1
    if index in index_array_distance: arrayCount = arrayCount + 1
    if index in index_array_explicit_enum: arrayCount = arrayCount + 1
    if arrayCount >= 2: indecesMajVoted.append(index)

In [8]:
df.loc[indecesMajVoted,'spoilerType'] = '[multi]'

mask_multi_act = df.tags.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_act = df.tags.apply(lambda x: not any(item for item in selection_multi if item in x))
mask_multi_pred = df.spoilerType.apply(lambda x: any(item for item in selection_multi if item in x))
mask_multi_not_pred = df.spoilerType.apply(lambda x: not any(item for item in selection_multi if item in x))

true_positive = len(df[mask_multi_pred & mask_multi_act].index)
false_positive = len(df[mask_multi_pred & mask_multi_not_act].index)
true_negative = len(df[mask_multi_not_pred & mask_multi_not_act].index)
false_negative = len(df[mask_multi_not_pred & mask_multi_act].index)

print('TP: ', true_positive)
print('FP: ', false_positive)
print('TN: ', true_negative)
print('FN: ', false_negative)

accuracy = (true_positive + true_negative) / (true_positive + false_positive + true_negative + false_negative)
print('Accuracy: ', accuracy)

TP:  59
FP:  35
TN:  622
FN:  84
Accuracy:  0.85125
