In [8]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import GradientBoostingClassifier

### Functions to derive features

In [9]:
numbers = ['2','3','4','5','6','7','8','9']
number_words = ['two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven', 'Eight', 'Nine', 'Ten']
currency_words = ['euro', 'yen', 'Euro','Yen']
currency_signs = ['€', '¥']

def get_number_from_string(inputString):
    for char in inputString:
        return int(char) if char.isdigit() else 0

def has_number_word(inputString):
    return any(word in number_words for word in inputString.split())

def has_number(inputString):
    return any(char.isdigit() for char in inputString)

def has_currency_word(inputString):
    return any(word in currency_words for word in inputString.split())

def has_currency_sign(inputString):
    return any(char in currency_signs for char in inputString)

def number_of_words(inputString):
    return len(inputString.split())

def amount_lowercase(inputString):
    return len(re.findall(r'[a-z]', inputString))

def amount_uppercase(inputString):
    return len(re.findall(r'[A-Z]', inputString))

def amount_letters(inputString):
    return amount_lowercase(inputString) + amount_uppercase(inputString)

def amount_commas(inputString):
    return len(re.findall(r',', inputString))

def amount_exclamationmarks(inputString):
    return len(re.findall(r'!', inputString))

def amount_dots(inputString):
    return len(re.findall(r'\.', inputString))

def amount_questionmarks(inputString):
    return len(re.findall(r'\?', inputString))

def amount_quotationmarks(inputString):
    return len(re.findall(r'\"', inputString))

def contains_recipe_words(inputString):
    return any(word in ['tbsp.', 'Tbsp.', 'tbs.', 'Tbs.', 'oz.', 'Oz.'] for word in inputString.split())

def contains_explicit_enumeration(targetParagraphs):
    predIsSmallerCounter = 0;
    lastNumber = 0;
    for targetParagraph in targetParagraphs:
        firstCharsOfParagraph = targetParagraph[0:2]
        number_exists = has_number(firstCharsOfParagraph)
        if number_exists:
            currentNumber = get_number_from_string(firstCharsOfParagraph)
            if lastNumber < currentNumber: predIsSmallerCounter = predIsSmallerCounter + 1;
            lastNumber = currentNumber;
    return True if predIsSmallerCounter >= 2 else False

### load training data and calculate features

In [10]:
trainingData = pd.read_json('../data/train.jsonl', lines=True)
trainingData['tags'] = trainingData['tags'].apply(lambda l: l[0])
trainingData['spoilerType'] = trainingData['tags'].apply(lambda r: 'multi' if r == 'multi' else 'non-multi')

trainingData['postText'] = trainingData['postText'].apply(lambda p: p[0])
trainingData['postTextContainsNumber'] = trainingData['postText'].apply(lambda p: 1 if has_number(p) == True else 0)
trainingData['postTextContainsNumberWord'] = trainingData['postText'].apply(lambda p: 1 if has_number_word(p) == True else 0)
trainingData['postTextContainsCurrencyWord'] = trainingData['postText'].apply(lambda p: 1 if has_currency_word(p) == True else 0)
trainingData['postTextContainsCurrencySign'] = trainingData['postText'].apply(lambda p: 1 if has_currency_sign(p) == True else 0)
trainingData['postTextAmountWords'] = trainingData['postText'].apply(lambda p: number_of_words(p))
trainingData['postTextAmountLowerCase'] = trainingData['postText'].apply(lambda p: amount_lowercase(p))
trainingData['postTextAmountUpperCase'] = trainingData['postText'].apply(lambda p: amount_uppercase(p))
trainingData['postTextAmountLetters'] = trainingData['postText'].apply(lambda p: amount_letters(p))
trainingData['postTextAmountCommas'] = trainingData['postText'].apply(lambda p: amount_commas(p))
trainingData['postTextAmountExclMarks'] = trainingData['postText'].apply(lambda p: amount_exclamationmarks(p))
trainingData['postTextAmountDots'] = trainingData['postText'].apply(lambda p: amount_dots(p))
trainingData['postTextAmountQuestionMarks'] = trainingData['postText'].apply(lambda p: amount_questionmarks(p))
trainingData['postTextAmountQuotationMarks'] = trainingData['postText'].apply(lambda p: amount_quotationmarks(p))

trainingData['targetParagraphsConcat'] = trainingData['targetParagraphs'].apply(lambda p: "".join(p)) 
trainingData['targetParagraphsContainNumber'] = trainingData['targetParagraphsConcat'].apply(lambda p: 1 if has_number(p) == True else 0)
trainingData['targetParagraphsContainNumberWord'] = trainingData['targetParagraphsConcat'].apply(lambda p: 1 if has_number_word(p) == True else 0)
trainingData['targetParagraphsContainCurrencyWord'] = trainingData['targetParagraphsConcat'].apply(lambda p: 1 if has_currency_word(p) == True else 0)
trainingData['targetParagraphsContainCurrencySign'] = trainingData['targetParagraphsConcat'].apply(lambda p: 1 if has_currency_sign(p) == True else 0)
trainingData['targetParagraphsAmountWords'] = trainingData['targetParagraphsConcat'].apply(lambda p: number_of_words(p))
trainingData['targetParagraphsAmountLowerCase'] = trainingData['targetParagraphsConcat'].apply(lambda p: amount_lowercase(p))
trainingData['targetParagraphsAmountUpperCase'] = trainingData['targetParagraphsConcat'].apply(lambda p: amount_uppercase(p))
trainingData['targetParagraphsAmountLetters'] = trainingData['targetParagraphsConcat'].apply(lambda p: amount_letters(p))
trainingData['targetParagraphsAmountCommas'] = trainingData['targetParagraphsConcat'].apply(lambda p: amount_commas(p))
trainingData['targetParagraphsAmountExclMarks'] = trainingData['targetParagraphsConcat'].apply(lambda p: amount_commas(p))
trainingData['targetParagraphsAmountDots'] = trainingData['targetParagraphsConcat'].apply(lambda p: amount_dots(p))
trainingData['targetParagraphsAmountQuestionMarks'] = trainingData['targetParagraphsConcat'].apply(lambda p: amount_questionmarks(p))
trainingData['targetParagraphsAmountQuotationMarks'] = trainingData['targetParagraphsConcat'].apply(lambda p: amount_quotationmarks(p))
trainingData['targetParagraphsAreExplicitlyEnumerated'] = trainingData['targetParagraphs'].apply(lambda p: 1 if contains_explicit_enumeration(p) == True else 0)
trainingData['targetParagraphsContainRecipeWord'] = trainingData['targetParagraphsConcat'].apply(lambda p: 1 if contains_recipe_words(p) else 0)

### filter dataframe to have necessary features for model training, prepare datasets

In [11]:
trainingDataFeatureSet = trainingData[['postTextContainsNumber', 'postTextContainsNumberWord', 'postTextContainsCurrencyWord', 'postTextContainsCurrencySign', 'postTextAmountWords', 'postTextAmountLowerCase', 'postTextAmountUpperCase', 'postTextAmountLetters', 'postTextAmountCommas', 'postTextAmountExclMarks', 'postTextAmountDots', 'postTextAmountQuestionMarks', 'postTextAmountQuotationMarks', 'targetParagraphsContainNumber', 'targetParagraphsContainNumberWord', 'targetParagraphsContainCurrencyWord', 'targetParagraphsContainCurrencySign', 'targetParagraphsAmountWords', 'targetParagraphsAmountLowerCase', 'targetParagraphsAmountUpperCase', 'targetParagraphsAmountLetters', 'targetParagraphsAmountCommas', 'targetParagraphsAmountExclMarks', 'targetParagraphsAmountDots', 'targetParagraphsAmountQuestionMarks', 'targetParagraphsAmountQuotationMarks', 'targetParagraphsAreExplicitlyEnumerated', 'targetParagraphsContainRecipeWord']]
trainingDataTarget = trainingData['spoilerType']
clf = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=5, random_state=0)
clf.fit(trainingDataFeatureSet, trainingDataTarget)

### load validation data and calculate feature

In [13]:
validationData = pd.read_json('../data/validation.jsonl', lines=True)
validationData['tags'] = validationData['tags'].apply(lambda v: v[0])
validationData['spoilerType'] = validationData['tags'].apply(lambda r: 'multi' if r == 'multi' else 'non-multi')

validationData['postText'] = validationData['postText'].apply(lambda p: p[0])
validationData['postTextContainsNumber'] = validationData['postText'].apply(lambda p: 1 if has_number(p) == True else 0)
validationData['postTextContainsNumberWord'] = validationData['postText'].apply(lambda p: 1 if has_number_word(p) == True else 0)
validationData['postTextContainsCurrencyWord'] = validationData['postText'].apply(lambda p: 1 if has_currency_word(p) == True else 0)
validationData['postTextContainsCurrencySign'] = validationData['postText'].apply(lambda p: 1 if has_currency_sign(p) == True else 0)
validationData['postTextAmountWords'] = validationData['postText'].apply(lambda p: number_of_words(p))
validationData['postTextAmountLowerCase'] = validationData['postText'].apply(lambda p: amount_lowercase(p))
validationData['postTextAmountUpperCase'] = validationData['postText'].apply(lambda p: amount_uppercase(p))
validationData['postTextAmountLetters'] = validationData['postText'].apply(lambda p: amount_letters(p))
validationData['postTextAmountCommas'] = validationData['postText'].apply(lambda p: amount_commas(p))
validationData['postTextAmountExclMarks'] = validationData['postText'].apply(lambda p: amount_exclamationmarks(p))
validationData['postTextAmountDots'] = validationData['postText'].apply(lambda p: amount_dots(p))
validationData['postTextAmountQuestionMarks'] = validationData['postText'].apply(lambda p: amount_questionmarks(p))
validationData['postTextAmountQuotationMarks'] = validationData['postText'].apply(lambda p: amount_quotationmarks(p))

validationData['targetParagraphsConcat'] = validationData['targetParagraphs'].apply(lambda p: "".join(p)) 
validationData['targetParagraphsContainNumber'] = validationData['targetParagraphsConcat'].apply(lambda p: 1 if has_number(p) == True else 0)
validationData['targetParagraphsContainNumberWord'] = validationData['targetParagraphsConcat'].apply(lambda p: 1 if has_number_word(p) == True else 0)
validationData['targetParagraphsContainCurrencyWord'] = validationData['targetParagraphsConcat'].apply(lambda p: 1 if has_currency_word(p) == True else 0)
validationData['targetParagraphsContainCurrencySign'] = validationData['targetParagraphsConcat'].apply(lambda p: 1 if has_currency_sign(p) == True else 0)
validationData['targetParagraphsAmountWords'] = validationData['targetParagraphsConcat'].apply(lambda p: number_of_words(p))
validationData['targetParagraphsAmountLowerCase'] = validationData['targetParagraphsConcat'].apply(lambda p: amount_lowercase(p))
validationData['targetParagraphsAmountUpperCase'] = validationData['targetParagraphsConcat'].apply(lambda p: amount_uppercase(p))
validationData['targetParagraphsAmountLetters'] = validationData['targetParagraphsConcat'].apply(lambda p: amount_letters(p))
validationData['targetParagraphsAmountCommas'] = validationData['targetParagraphsConcat'].apply(lambda p: amount_commas(p))
validationData['targetParagraphsAmountExclMarks'] = validationData['targetParagraphsConcat'].apply(lambda p: amount_commas(p))
validationData['targetParagraphsAmountDots'] = validationData['targetParagraphsConcat'].apply(lambda p: amount_dots(p))
validationData['targetParagraphsAmountQuestionMarks'] = validationData['targetParagraphsConcat'].apply(lambda p: amount_questionmarks(p))
validationData['targetParagraphsAmountQuotationMarks'] = validationData['targetParagraphsConcat'].apply(lambda p: amount_quotationmarks(p))
validationData['targetParagraphsAreExplicitlyEnumerated'] = validationData['targetParagraphs'].apply(lambda p: 1 if contains_explicit_enumeration(p) == True else 0)
validationData['targetParagraphsContainRecipeWord'] = validationData['targetParagraphsConcat'].apply(lambda p: 1 if contains_recipe_words(p) else 0)

### filter dataframe to have necessary features for model training, prepare datasets

In [14]:
validationDataFeatureSet = validationData[['postTextContainsNumber', 'postTextContainsNumberWord', 'postTextContainsCurrencyWord', 'postTextContainsCurrencySign', 'postTextAmountWords', 'postTextAmountLowerCase', 'postTextAmountUpperCase', 'postTextAmountLetters', 'postTextAmountCommas', 'postTextAmountExclMarks', 'postTextAmountDots', 'postTextAmountQuestionMarks', 'postTextAmountQuotationMarks', 'targetParagraphsContainNumber', 'targetParagraphsContainNumberWord', 'targetParagraphsContainCurrencyWord', 'targetParagraphsContainCurrencySign', 'targetParagraphsAmountWords', 'targetParagraphsAmountLowerCase', 'targetParagraphsAmountUpperCase', 'targetParagraphsAmountLetters', 'targetParagraphsAmountCommas', 'targetParagraphsAmountExclMarks', 'targetParagraphsAmountDots', 'targetParagraphsAmountQuestionMarks', 'targetParagraphsAmountQuotationMarks', 'targetParagraphsAreExplicitlyEnumerated', 'targetParagraphsContainRecipeWord']]
predictedSpoilerTypesArray = clf.predict(validationDataFeatureSet)
predictedSpoilerTypes = pd.DataFrame({'predicted': predictedSpoilerTypesArray})
validationData['predicted'] = predictedSpoilerTypes

### calculate accuracy

In [15]:
correctly_predicted_rows = np.where(validationData['spoilerType'] == validationData['predicted'])
true_positives = len(correctly_predicted_rows[0])
acc = true_positives/len(validationData.index)
print('Accuracy: ', acc)

Accuracy:  0.82875
