# In [3]:
import pandas as pd

### Functions to derive features

# In [13]:
# Lookup tables for the text feature detectors.
# Digit characters '2' through '9'.
numbers = list('23456789')
# Spelled-out numbers two..ten, in lowercase and capitalized form.
number_words = [
    'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'Two', 'Three', 'Four', 'Five', 'Six', 'Seven', 'Eight', 'Nine', 'Ten',
]
# Currency names, in lowercase and capitalized form.
currency_words = [
    'euro', 'yen',
    'Euro', 'Yen',
]
# Currency symbols matched character by character.
currency_signs = [
    '€',
    '¥',
]

def has_number_word(inputString):
    """Return True if any whitespace-separated token is listed in number_words.

    Tokens are compared verbatim, so a token with attached punctuation
    (e.g. "five,") does not match.
    """
    for token in inputString.split():
        if token in number_words:
            return True
    return False

def has_number(inputString):
    """Return True if at least one character of inputString is a digit."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

def has_currency_word(inputString):
    """Return True if any whitespace-separated token is listed in currency_words.

    Tokens are compared verbatim (case-sensitive, punctuation included).
    """
    for token in inputString.split():
        if token in currency_words:
            return True
    return False

def has_currency_sign(inputString):
    """Return True if any character of inputString is listed in currency_signs."""
    for character in inputString:
        if character in currency_signs:
            return True
    return False

def number_of_words(inputString):
    """Return how many whitespace-separated tokens inputString contains."""
    tokens = inputString.split()
    return len(tokens)

def amount_lowercase(inputString):
    """Return the number of ASCII lowercase letters (a-z) in inputString.

    str.count() matches a literal substring, not a regular expression, so
    counting the pattern text '[a-z]' only ever found that exact
    five-character sequence. The characters are tested individually instead.
    """
    return sum(1 for character in inputString if 'a' <= character <= 'z')

def amount_uppercase(inputString):
    """Return the number of ASCII uppercase letters (A-Z) in inputString.

    str.count() matches a literal substring, not a regular expression, so
    counting the pattern text '[A-Z]' only ever found that exact
    five-character sequence. The characters are tested individually instead.
    """
    return sum(1 for character in inputString if 'A' <= character <= 'Z')

def amount_letters(inputString):
    """Return the number of ASCII letters (a-z and A-Z) in inputString.

    The result equals the count of ASCII lowercase letters plus the count of
    ASCII uppercase letters. Each character is classified directly here so
    the total is a real letter count rather than a count of literal
    '[a-z]' / '[A-Z]' substrings.
    """
    return sum(
        1
        for character in inputString
        if 'a' <= character <= 'z' or 'A' <= character <= 'Z'
    )

def amount_commas(inputString):
    """Return how many comma characters inputString contains."""
    return sum(1 for character in inputString if character == ',')

def amount_exclamationmarks(inputString):
    """Return how many exclamation marks inputString contains."""
    return sum(1 for character in inputString if character == '!')

def amount_dots(inputString):
    """Return how many '.' characters inputString contains.

    str.count() takes a literal substring, so the regex-style escape r'\\.'
    searched for a backslash followed by a dot and missed every ordinary
    dot. The bare character is counted instead.
    """
    return inputString.count('.')

def amount_questionmarks(inputString):
    """Return how many '?' characters inputString contains.

    str.count() takes a literal substring, so the regex-style escape r'\\?'
    searched for a backslash followed by a question mark and missed every
    ordinary question mark. The bare character is counted instead.
    """
    return inputString.count('?')

def amount_quotationmarks(inputString):
    """Return how many straight double-quote characters inputString contains.

    str.count() takes a literal substring, so the raw string r'\\"' searched
    for a backslash followed by a quote and missed every ordinary quotation
    mark. The bare character is counted instead.
    """
    return inputString.count('"')

def contains_explicit_enumeration(targetParagraphs):
    """Report whether the paragraphs look like an explicitly numbered list.

    Only the first two characters of each paragraph are inspected. When they
    contain a digit, get_number() is applied to them and the result is
    compared with the number taken from the previous numbered paragraph
    (starting from 0). The function returns True when at least two such
    comparisons found a larger number than before.

    It returns False immediately if contains_recipe_words() is truthy for
    any paragraph reached during the scan.

    NOTE(review): contains_recipe_words and get_number are not defined in
    this part of the file; their exact semantics should be confirmed there.
    """
    increasing_steps = 0
    previous_number = 0
    for paragraph in targetParagraphs:
        # A recipe-like paragraph vetoes the whole result, regardless of
        # how many increasing numbers were seen so far.
        if contains_recipe_words(paragraph):
            return False
        leading_chars = paragraph[:2]
        if not has_number(leading_chars):
            continue
        current_number = get_number(leading_chars)
        if previous_number < current_number:
            increasing_steps += 1
        previous_number = current_number
    return increasing_steps >= 2

### load training data and calculate features

# In [16]:
# Read the training set; lines=True parses the file as one JSON record per line.
trainingData = pd.read_json('../data/train.jsonl', lines=True)

# Keep only the first element of each postText entry
# (presumably a list of strings; verify against the data).
trainingData['postText'] = trainingData['postText'].apply(lambda entry: entry[0])

# Presence flags for postText. Encoding: 0 when the detector fires, 1 when it does not.
# NOTE(review): this is inverted relative to the "Contains" column names; confirm it is intended.
trainingData['postTextContainsNumber'] = trainingData['postText'].apply(lambda text: int(not has_number(text)))
trainingData['postTextContainsNumberWord'] = trainingData['postText'].apply(lambda text: int(not has_number_word(text)))
trainingData['postTextContainsCurrencyWord'] = trainingData['postText'].apply(lambda text: int(not has_currency_word(text)))
trainingData['postTextContainsCurrencySign'] = trainingData['postText'].apply(lambda text: int(not has_currency_sign(text)))

# Word and letter counts for postText.
trainingData['postTextAmountWords'] = trainingData['postText'].apply(number_of_words)
trainingData['postTextAmountLowerCase'] = trainingData['postText'].apply(amount_lowercase)
trainingData['postTextAmountUpperCase'] = trainingData['postText'].apply(amount_uppercase)
trainingData['postTextAmountLetters'] = trainingData['postText'].apply(amount_letters)

# Punctuation counts for postText.
trainingData['postTextAmountCommas'] = trainingData['postText'].apply(amount_commas)
trainingData['postTextAmountExclMarks'] = trainingData['postText'].apply(amount_exclamationmarks)
trainingData['postTextAmountDots'] = trainingData['postText'].apply(amount_dots)
trainingData['postTextAmountQuestionMarks'] = trainingData['postText'].apply(amount_questionmarks)
trainingData['postTextAmountQuotationMarks'] = trainingData['postText'].apply(amount_quotationmarks)

# Merge each record's paragraphs into one string for the text-level features.
# The paragraphs are joined with a space: with an empty separator the last word
# of one paragraph fuses with the first word of the next, which distorts the
# whitespace-based features (word count, number-word and currency-word detection).
# Character-count features are unaffected because none of them counts spaces.
trainingData['targetParagraphsConcat'] = trainingData['targetParagraphs'].apply(lambda p: " ".join(p))

# Presence flags. Encoding: 0 when the detector fires, 1 when it does not.
# NOTE(review): this is inverted relative to the "Contain" column names; it is
# kept unchanged here so the columns stay comparable with the postText flags.
trainingData['targetParagraphsContainNumber'] = trainingData['targetParagraphsConcat'].apply(lambda p: 0 if has_number(p) else 1)
trainingData['targetParagraphsContainNumberWord'] = trainingData['targetParagraphsConcat'].apply(lambda p: 0 if has_number_word(p) else 1)
trainingData['targetParagraphsContainCurrencyWord'] = trainingData['targetParagraphsConcat'].apply(lambda p: 0 if has_currency_word(p) else 1)
trainingData['targetParagraphsContainCurrencySign'] = trainingData['targetParagraphsConcat'].apply(lambda p: 0 if has_currency_sign(p) else 1)

# Word and letter counts.
trainingData['targetParagraphsAmountWords'] = trainingData['targetParagraphsConcat'].apply(number_of_words)
trainingData['targetParagraphsAmountLowerCase'] = trainingData['targetParagraphsConcat'].apply(amount_lowercase)
trainingData['targetParagraphsAmountUpperCase'] = trainingData['targetParagraphsConcat'].apply(amount_uppercase)
trainingData['targetParagraphsAmountLetters'] = trainingData['targetParagraphsConcat'].apply(amount_letters)

# Punctuation counts.
trainingData['targetParagraphsAmountCommas'] = trainingData['targetParagraphsConcat'].apply(amount_commas)
# Uses the exclamation-mark counter; this column was previously filled with the comma count.
trainingData['targetParagraphsAmountExclMarks'] = trainingData['targetParagraphsConcat'].apply(amount_exclamationmarks)
trainingData['targetParagraphsAmountDots'] = trainingData['targetParagraphsConcat'].apply(amount_dots)
trainingData['targetParagraphsAmountQuestionMarks'] = trainingData['targetParagraphsConcat'].apply(amount_questionmarks)
trainingData['targetParagraphsAmountQuotationMarks'] = trainingData['targetParagraphsConcat'].apply(amount_quotationmarks)

# Enumeration detection works on the original paragraph list, not the merged string.
trainingData['targetParagraphsAreExplicitlyEnumerated'] = trainingData['targetParagraphs'].apply(contains_explicit_enumeration)