In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import permutation_importance
import nltk

### Functions to derive features

In [2]:
numbers = ['2','3','4','5','6','7','8','9']
number_words = ['two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven', 'Eight', 'Nine', 'Ten']
currency_words = ['euro', 'yen', 'Euro','Yen']
currency_signs = ['€', '¥']

def get_number_from_string(inputString):
    for char in inputString:
        return int(char) if char.isdigit() else 0

def has_number_word(inputString):
    return any(word in number_words for word in inputString.split())

def has_number(inputString):
    return any(char.isdigit() for char in inputString)

def has_currency_word(inputString):
    return any(word in currency_words for word in inputString.split())

def has_currency_sign(inputString):
    return any(char in currency_signs for char in inputString)

def number_of_words(inputString):
    return len(inputString.split())

def amount_target_paragraphs(targetParagraphs):
    return len(targetParagraphs)

def amount_lowercase(inputString):
    return len(re.findall(r'[a-z]', inputString))

def amount_uppercase(inputString):
    return len(re.findall(r'[A-Z]', inputString))

def amount_letters(inputString):
    return amount_lowercase(inputString) + amount_uppercase(inputString)

def amount_commas(inputString):
    return len(re.findall(r',', inputString))

def amount_exclamationmarks(inputString):
    return len(re.findall(r'!', inputString))

def amount_dots(inputString):
    return len(re.findall(r'\.', inputString))

def amount_questionmarks(inputString):
    return len(re.findall(r'\?', inputString))

def amount_quotationmarks(inputString):
    return len(re.findall(r'\"', inputString))

def contains_recipe_words(inputString):
    return any(word in ['tbsp.', 'Tbsp.', 'tbs.', 'Tbs.', 'oz.', 'Oz.'] for word in inputString.split())

def contains_explicit_enumeration(targetParagraphs):
    predIsSmallerCounter = 0;
    lastNumber = 0;
    for targetParagraph in targetParagraphs:
        firstCharsOfParagraph = targetParagraph[0:2]
        number_exists = has_number(firstCharsOfParagraph)
        if number_exists:
            currentNumber = get_number_from_string(firstCharsOfParagraph)
            if lastNumber < currentNumber: predIsSmallerCounter = predIsSmallerCounter + 1;
            lastNumber = currentNumber;
    return True if predIsSmallerCounter >= 2 else False
    

def number_of_nouns(inputString):
    tokens = nltk.word_tokenize(inputString, language='english')
    tags = nltk.pos_tag(tokens)
    return sum(map(lambda l: l[1] == 'NN' or l[1] == 'NNS', tags))

def number_of_adjectives(inputString):
    tokens = nltk.word_tokenize(inputString, language='english')
    tags = nltk.pos_tag(tokens)
    return sum(map(lambda l: l[1] == 'JJ' or l[1] == 'RB', tags))

def number_of_verbs(inputString):
    tokens = nltk.word_tokenize(inputString, language='english')
    tags = nltk.pos_tag(tokens)
    return sum(map(lambda l: l[1] == 'VBN' or l[1] == 'VBP', tags))

def number_of_articles(inputString):
    tokens = nltk.word_tokenize(inputString, language='english')
    tags = nltk.pos_tag(tokens)
    return sum(map(lambda l: l[1] == 'DT', tags))

In [3]:
def add_features(dataframe):
    dataframe['postText'] = dataframe['postText'].apply(lambda p: p[0])
    dataframe['postTextContainsNumber'] = dataframe['postText'].apply(lambda p: 1 if has_number(p) == True else 0)
    dataframe['postTextContainsNumberWord'] = dataframe['postText'].apply(lambda p: 1 if has_number_word(p) == True else 0)
    dataframe['postTextContainsCurrencyWord'] = dataframe['postText'].apply(lambda p: 1 if has_currency_word(p) == True else 0)
    dataframe['postTextContainsCurrencySign'] = dataframe['postText'].apply(lambda p: 1 if has_currency_sign(p) == True else 0)
    dataframe['postTextAmountWords'] = dataframe['postText'].apply(lambda p: number_of_words(p))
    dataframe['postTextAmountLowerCase'] = dataframe['postText'].apply(lambda p: amount_lowercase(p))
    dataframe['postTextAmountUpperCase'] = dataframe['postText'].apply(lambda p: amount_uppercase(p))
    dataframe['postTextAmountLetters'] = dataframe['postText'].apply(lambda p: amount_letters(p))
    dataframe['postTextAmountCommas'] = dataframe['postText'].apply(lambda p: amount_commas(p))
    dataframe['postTextAmountExclMarks'] = dataframe['postText'].apply(lambda p: amount_exclamationmarks(p))
    dataframe['postTextAmountDots'] = dataframe['postText'].apply(lambda p: amount_dots(p))
    dataframe['postTextAmountQuestionMarks'] = dataframe['postText'].apply(lambda p: amount_questionmarks(p))
    dataframe['postTextAmountQuotationMarks'] = dataframe['postText'].apply(lambda p: amount_quotationmarks(p))
    dataframe['postTextNouns'] = dataframe['postText'].apply(lambda p: number_of_nouns(p))
    dataframe['postTextAdjectives'] = dataframe['postText'].apply(lambda p: number_of_adjectives(p))
    dataframe['postTextVerbs'] = dataframe['postText'].apply(lambda p: number_of_verbs(p))
    dataframe['postTextArticles'] = dataframe['postText'].apply(lambda p: number_of_articles(p))

    dataframe['targetParagraphsConcat'] = dataframe['targetParagraphs'].apply(lambda p: "".join(p)) 
    dataframe['targetParagraphsContainNumber'] = dataframe['targetParagraphsConcat'].apply(lambda p: 1 if has_number(p) == True else 0)
    dataframe['targetParagraphsContainNumberWord'] = dataframe['targetParagraphsConcat'].apply(lambda p: 1 if has_number_word(p) == True else 0)
    dataframe['targetParagraphsContainCurrencyWord'] = dataframe['targetParagraphsConcat'].apply(lambda p: 1 if has_currency_word(p) == True else 0)
    dataframe['targetParagraphsContainCurrencySign'] = dataframe['targetParagraphsConcat'].apply(lambda p: 1 if has_currency_sign(p) == True else 0)
    dataframe['targetParagraphsAmountWords'] = dataframe['targetParagraphsConcat'].apply(lambda p: number_of_words(p))
    dataframe['targetParagraphsAmountLowerCase'] = dataframe['targetParagraphsConcat'].apply(lambda p: amount_lowercase(p))
    dataframe['targetParagraphsAmountUpperCase'] = dataframe['targetParagraphsConcat'].apply(lambda p: amount_uppercase(p))
    dataframe['targetParagraphsAmountLetters'] = dataframe['targetParagraphsConcat'].apply(lambda p: amount_letters(p))
    dataframe['targetParagraphsAmountCommas'] = dataframe['targetParagraphsConcat'].apply(lambda p: amount_commas(p))
    dataframe['targetParagraphsAmountExclMarks'] = dataframe['targetParagraphsConcat'].apply(lambda p: amount_commas(p))
    dataframe['targetParagraphsAmountDots'] = dataframe['targetParagraphsConcat'].apply(lambda p: amount_dots(p))
    dataframe['targetParagraphsAmountQuestionMarks'] = dataframe['targetParagraphsConcat'].apply(lambda p: amount_questionmarks(p))
    dataframe['targetParagraphsAmountQuotationMarks'] = dataframe['targetParagraphsConcat'].apply(lambda p: amount_quotationmarks(p))
    dataframe['targetParagraphsAreExplicitlyEnumerated'] = dataframe['targetParagraphs'].apply(lambda p: 1 if contains_explicit_enumeration(p) == True else 0)
    dataframe['targetParagraphsAmount'] = dataframe['targetParagraphs'].apply(lambda p: amount_target_paragraphs(p))
    dataframe['targetParagraphsContainRecipeWord'] = dataframe['targetParagraphsConcat'].apply(lambda p: 1 if contains_recipe_words(p) else 0)
    dataframe['targetParagraphNouns'] = dataframe['targetParagraphsConcat'].apply(lambda p: number_of_nouns(p))
    dataframe['targetParagraphsAdjectives'] = dataframe['targetParagraphsConcat'].apply(lambda p: number_of_adjectives(p))
    dataframe['targetParagraphsVerbs'] = dataframe['targetParagraphsConcat'].apply(lambda p: number_of_verbs(p))
    dataframe['targetParagraphsArticles'] = dataframe['targetParagraphsConcat'].apply(lambda p: number_of_articles(p))
    return dataframe

### load training data and calculate features

In [4]:
trainingData = pd.read_json('../data/train.jsonl', lines=True)
trainingData['tags'] = trainingData['tags'].apply(lambda l: l[0])
trainingData['spoilerType'] = trainingData['tags'].apply(lambda r: 'multi' if r == 'multi' else 'non-multi')

trainingData['postText'] = trainingData['postText'].apply(lambda p: p[0])
trainingData = add_features(trainingData)

### filter dataframe to have necessary features for model training, prepare datasets

In [10]:
trainingDataFeatureSet = trainingData[['postTextContainsNumber', 'postTextContainsNumberWord', 'postTextContainsCurrencyWord', 'postTextContainsCurrencySign', 'postTextAmountWords', 'postTextAmountLowerCase', 'postTextAmountUpperCase', 'postTextAmountLetters', 'postTextAmountCommas', 'postTextAmountExclMarks', 'postTextAmountDots', 'postTextAmountQuestionMarks', 'postTextAmountQuotationMarks', 'targetParagraphsContainNumber', 'targetParagraphsContainNumberWord', 'targetParagraphsContainCurrencyWord', 'targetParagraphsContainCurrencySign', 'targetParagraphsAmountWords', 'targetParagraphsAmount' ,'targetParagraphsAmountLowerCase', 'targetParagraphsAmountUpperCase', 'targetParagraphsAmountLetters', 'targetParagraphsAmountCommas', 'targetParagraphsAmountExclMarks', 'targetParagraphsAmountDots', 'targetParagraphsAmountQuestionMarks', 'targetParagraphsAmountQuotationMarks', 'targetParagraphsAreExplicitlyEnumerated', 'targetParagraphsContainRecipeWord', 'postTextNouns', 'postTextAdjectives', 'postTextVerbs', 'postTextArticles', 'targetParagraphNouns', 'targetParagraphsAdjectives', 'targetParagraphsVerbs', 'targetParagraphsArticles']]
trainingDataTarget = trainingData['spoilerType']
clf = GradientBoostingClassifier(n_estimators=350, learning_rate=1.0, max_depth=5, random_state=0)
clf.fit(trainingDataFeatureSet, trainingDataTarget)
pickle.dump(clf, open("multi.model", 'wb'))

### load validation data and calculate feature

In [15]:
validationData = pd.read_json('../data/validation.jsonl', lines=True)
validationData['tags'] = validationData['tags'].apply(lambda v: v[0])
validationData['spoilerType'] = validationData['tags'].apply(lambda r: 'multi' if r == 'multi' else 'non-multi')

validationData['postText'] = validationData['postText'].apply(lambda p: p[0])
validationData = add_features(validationData)

      postTextContainsNumber  postTextContainsNumberWord  \
0                          0                           0   
1                          0                           0   
2                          0                           0   
3                          0                           0   
4                          0                           0   
...                      ...                         ...   
3195                       0                           0   
3196                       0                           0   
3197                       1                           0   
3198                       0                           0   
3199                       0                           0   

      postTextContainsCurrencyWord  postTextContainsCurrencySign  \
0                                0                             0   
1                                0                             0   
2                                0                             0   
3      

### filter dataframe to have necessary features for model training, prepare datasets

In [12]:
validationDataFeatureSet = validationData[['postTextContainsNumber', 'postTextContainsNumberWord', 'postTextContainsCurrencyWord', 'postTextContainsCurrencySign', 'postTextAmountWords', 'postTextAmountLowerCase', 'postTextAmountUpperCase', 'postTextAmountLetters', 'postTextAmountCommas', 'postTextAmountExclMarks', 'postTextAmountDots', 'postTextAmountQuestionMarks', 'postTextAmountQuotationMarks', 'targetParagraphsContainNumber', 'targetParagraphsContainNumberWord', 'targetParagraphsContainCurrencyWord', 'targetParagraphsContainCurrencySign', 'targetParagraphsAmountWords', 'targetParagraphsAmount', 'targetParagraphsAmountLowerCase', 'targetParagraphsAmountUpperCase', 'targetParagraphsAmountLetters', 'targetParagraphsAmountCommas', 'targetParagraphsAmountExclMarks', 'targetParagraphsAmountDots', 'targetParagraphsAmountQuestionMarks', 'targetParagraphsAmountQuotationMarks', 'targetParagraphsAreExplicitlyEnumerated', 'targetParagraphsContainRecipeWord', 'postTextNouns', 'postTextAdjectives', 'postTextVerbs', 'postTextArticles', 'targetParagraphNouns', 'targetParagraphsAdjectives', 'targetParagraphsVerbs', 'targetParagraphsArticles']]
predictedSpoilerTypesArray = clf.predict(validationDataFeatureSet)
predictedSpoilerTypes = pd.DataFrame({'predicted': predictedSpoilerTypesArray})
validationData['predicted'] = predictedSpoilerTypes

### calculate accuracy

In [13]:
correctly_predicted_rows = np.where(validationData['spoilerType'] == validationData['predicted'])
true_positives = len(correctly_predicted_rows[0])
acc = true_positives/len(validationData.index)
print('Accuracy: ', acc)

Accuracy:  0.83875


### feature importance

In [14]:
r = permutation_importance(clf, validationDataFeatureSet, validationData['spoilerType'], n_repeats=30, random_state=0)
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{validationDataFeatureSet.columns[i]:<8}"
        f"{r.importances_mean[i]:.3f}"
        f" +/- {r.importances_std[i]:.3f}")

targetParagraphsAmountLowerCase0.215 +/- 0.012
targetParagraphsAmountWords0.116 +/- 0.012
targetParagraphsAmountLetters0.092 +/- 0.010
targetParagraphsAdjectives0.092 +/- 0.011
targetParagraphsAmountDots0.052 +/- 0.008
targetParagraphsArticles0.046 +/- 0.009
targetParagraphNouns0.042 +/- 0.009
targetParagraphsVerbs0.041 +/- 0.008
postTextContainsNumber0.030 +/- 0.005
targetParagraphsAmount0.017 +/- 0.007
targetParagraphsAmountExclMarks0.014 +/- 0.005
targetParagraphsAmountQuotationMarks0.012 +/- 0.005
postTextAmountUpperCase0.003 +/- 0.001
postTextAmountQuotationMarks0.003 +/- 0.001
