# First Project Pipline

## Setup

In [1]:
import spacy
from spacy.tokens import Token
from spacy.tokens import Doc
from spacy.language import Language

In [2]:
from gensim.summarization import keywords

In [3]:
from wordfreq import zipf_frequency
from wordfreq import word_frequency

In [4]:
import numpy as np
import pandas as pd
import regex as re

In [5]:
import nltk
#nltk.download('cmudict')

In [6]:
from nltk.corpus import cmudict

phoneme_dict = dict(cmudict.entries())

def syllable_counter(word):
    '''function that counts a syllable in a word'''
    if word not in phoneme_dict:
        return 0
    syllables = phoneme_dict[word]
    count = len([syllable for syllable in syllables if syllable[-1].isdigit()])
    return round(count, 0)

In [200]:
class Minimum():
    def __init__(self, initial_value):
        self.value = initial_value
    
    def update_minimum(self, potential_min):
        if potential_min < self.value:
            self.value = potential_min
            
class Maximum():
    def __init__(self, initial_value):
        self.value = initial_value
    
    def update_maximum(self, potential_min):
        if potential_min > self.value:
            self.value = potential_min

In [201]:
!python --version

Python 3.8.0


## Data

In [202]:
text_file = '../data/kafka.txt'

In [203]:
#print(open(text_file, "r").read())

# Pipeline Components

## Preprocessing

### Wordcount

In [204]:
def wordcount(doc):
    '''gives an overall word count'''
    
    for token in doc:
        if token.is_alpha:
            doc._.wordcount += 1
    
    print(f'{doc._.wordcount} words in document')
         
    return doc

### Filter

In [205]:
def filter_tokens(doc):
    '''filters all tokens'''           
    
    
    for token in doc:
        # filter stopwords
        if not token.is_alpha or token.is_stop:
            token._.is_excluded = True
            
        # filter part-of-speech
        elif token.pos_ not in ['NOUN', 'VERB', 'ADJ']:
            token._.is_excluded = True
            
        # filter entities
        elif token.ent_type != 0:
                token._.is_excluded = True
                
        # count included words   
        else:
            doc._.included_wordcount += 1

    print(f'{doc._.included_wordcount} words in vocabulary')
         
    return doc

### Eliminating Duplicates

In [206]:
def elim_dup(doc):
    '''eliminates all duplicates and counts the appearance of the included words'''
    already_appeared = {}

    for token in doc:
        if not token._.is_excluded:
            if token.lemma_ in already_appeared.keys():
                already_appeared[token.lemma_]._.appearance += 1
                token._.is_excluded = True
                doc._.included_wordcount -= 1
            else:
                token._.appearance = 1
                already_appeared[token.lemma_] = token
    
    print(f'{doc._.included_wordcount} words in vocabulary without duplicates')
    
    return doc

## Word Difficulty

### Rating Difficulty

In [207]:
def syl_weight(n):
    w = 0
    for i in range(n):
        w += 0.5**(i+1)
    return w

In [208]:
def get_difficulty(doc):
   
    for token in doc:
        if not token._.is_excluded:
            lemma = token.lemma_
            difficulty = 8 - zipf_frequency(lemma, 'en') # score of 0-8
            difficulty += syl_weight(syllable_counter(lemma)) # now score of 0-9
            token._.difficulty = round(difficulty/9, 3) #normalised to 0-1
            
    return doc

## Word Relevance

### Relative Freqency

In [209]:
def calculate_relativ_freq(doc):
    '''calculating the relativ frequency of a included word'''
    
    calculate_last = []
    min_freq = Minimum(1)
    max_score = Maximum(0)
    
    def calc_rel_freq(word_freq, token):
        return round(((token._.appearance/doc._.wordcount) **2) / word_freq, 3)
    
    for token in doc:
        if not token._.is_excluded:
            overall_word_freq = word_frequency(token.lemma_, 'en')
            
            if overall_word_freq == 0:
                calculate_last.append(token)
            else:
                min_freq.update_minimum(overall_word_freq)
                token._.relativ_freq = calc_rel_freq(overall_word_freq, token)
                max_score.update_maximum(token._.relativ_freq)
    
    for token in calculate_last:
        token._.relativ_freq = calc_rel_freq(min_freq.value, token)
        max_score.update_maximum(token._.relativ_freq)
        
    for token in doc:
        if not token._.is_excluded:
            token._.relativ_freq /= max_score.value
    
    
    return doc

### Keywords Extraction

In [235]:
from yake import KeywordExtractor

In [279]:
def kwe(text):
    kw_extractor = KeywordExtractor(lan="en", n=1, top=100)
    return kw_extractor.extract_keywords(text)


0.044282636655428344

In [295]:
def check_keyphrases(doc):
    
    #TODO: was ist mit den fehlenden Wortvektoren
    
    kw_extractor = KeywordExtractor(lan="en", n=1, top=100)
    kw = dict(kw_extractor.extract_keywords(doc.text))
    
    keywords_token = []
    already_in_list = []
    for token in doc:
        if not token._.is_excluded:
            if token.lemma_ in kw.keys():
                token._.keyword_score = 1 - kw[token.lemma_]

    return doc

In [296]:
'''         
        if token.text in kw:
            token._.is_keyword = True
            if token.text not in already_in_list:
                keywords_token.append(token)
                already_in_list.append(token.text)
    doc._.keywords = keywords_token
    
    print(f'{len(keywords_token)} words are keywords')
    
    for token in doc:
        for kw in keywords_token:
            kw_score = token.similarity(kw)
            if kw_score > token._.keyword_score:
                token._.keyword_score =  kw_score'''

"         \n        if token.text in kw:\n            token._.is_keyword = True\n            if token.text not in already_in_list:\n                keywords_token.append(token)\n                already_in_list.append(token.text)\n    doc._.keywords = keywords_token\n    \n    print(f'{len(keywords_token)} words are keywords')\n    \n    for token in doc:\n        for kw in keywords_token:\n            kw_score = token.similarity(kw)\n            if kw_score > token._.keyword_score:\n                token._.keyword_score =  kw_score"

# Creating the Pipeline

In [297]:
def create_pipeline():
    nlp = spacy.load('en_core_web_lg', disable = ['parser'])
    
    ## Preprocessing
    # wordcount
    Doc.set_extension('wordcount', default=0, force=True)
    nlp.add_pipe(wordcount)
    
    # filter tokens
    Doc.set_extension('included_wordcount', default=0, force=True)
    Token.set_extension('is_excluded', default=False, force=True)
    nlp.add_pipe(filter_tokens)
    
    # eliminate dublicates
    Token.set_extension('appearance', default=np.nan, force=True)
    nlp.add_pipe(elim_dup)
    
    
    ## Word difficulty
    # difficulty
    Token.set_extension('difficulty', default=0, force=True)
    nlp.add_pipe(get_difficulty)
    
    # relative frequency
    Token.set_extension('relativ_freq', default=np.nan, force=True)
    nlp.add_pipe(calculate_relativ_freq)
    
    
    ## Word Relevance
    # keywordscore
    Doc.set_extension('keywords', default=[], force=True)
    Token.set_extension('is_keyword', default=False, force=True)
    Token.set_extension('keyword_score', default=0, force=True)
    nlp.add_pipe(check_keyphrases)
    
    return nlp

In [298]:
nlp = create_pipeline()
nlp.pipe_names

['tagger',
 'ner',
 'wordcount',
 'filter_tokens',
 'elim_dup',
 'get_difficulty',
 'calculate_relativ_freq',
 'check_keyphrases']

# Test Processing

In [349]:
def df_from_doc(doc):
    data = []
    for token in doc:
        if not token._.is_excluded:
            data.append((token, token.lemma_, token._.appearance, token._.difficulty, token._.relativ_freq, token._.keyword_score))
    df = pd.DataFrame(data, columns=['token', 'lemma', 'appearance', 'difficulty', 'relative freqency', 'keyword score'])
    
    df[['difficulty_rank', 'keyword_rank']] = df[['difficulty', 'keyword score']].rank(ascending=False)
    
    df['overall_ranking'] = 3 * df['difficulty'] + df['relative freqency'] + df['keyword score']
    
    return df

In [8]:
def predict_outputsize():
    
    def user_knows_vocab(s):
        response = input(f'Can you translate this word:   {s}   [y/n]  ')
        
        while response not in ['y','n']:
            response = input('Unexpected input. Try again.\n' +
                             f'Can you translate this word:   {s}   [y/n]  ')
        return response == 'y'
    
    for w in ['hallo', 'kennst', 'du', 'mich']:
        print(user_knows_vocab(w))

In [9]:
predict_outputsize()

Can you translate this word:   hallo   [y/n]  y
True
Can you translate this word:   kennst   [y/n]  n
False
Can you translate this word:   du   [y/n]  n
False
Can you translate this word:   mich   [y/n]  y
True


### Test scentence

In [334]:
doc = nlp("Hi I'm a little little boy. Get me a piece of cake or I'll killed your mother. I am the mother of your mother")

24 words in document
9 words in vocabulary
6 words in vocabulary without duplicates


In [351]:
df = df_from_doc(doc)

In [352]:
df.sort_values(by=['overall_ranking'], ascending=False)

Unnamed: 0,token,lemma,appearance,difficulty,relative freqency,keyword score,difficulty_rank,keyword_rank,overall_ranking
78,nonpolynomial,nonpolynomial,1,0.889,1.000000,0.348088,1.0,60.5,4.015088
108,trainability,trainability,1,0.734,0.934694,0.349549,3.0,55.0,3.486243
109,understandability,understandability,1,0.722,0.724685,0.349549,5.0,55.0,3.240234
33,bioinformatics,bioinformatic,1,0.738,1.000000,0.000000,2.0,87.5,3.214000
19,convolutional,convolutional,1,0.703,0.489316,0.521135,7.0,34.0,3.119451
...,...,...,...,...,...,...,...,...,...
84,hand,hand,1,0.344,0.000091,0.000000,103.5,87.5,1.032091
47,cases,case,1,0.322,0.000057,0.000000,105.0,87.5,0.966057
74,showed,show,1,0.318,0.000053,0.000000,106.0,87.5,0.954053
69,use,use,1,0.297,0.000033,0.000000,108.0,87.5,0.891033


### Kafka

In [337]:
with open(text_file, "r") as f:
    doc = nlp(f.read())

25062 words in document
7812 words in vocabulary
1873 words in vocabulary without duplicates


In [338]:
df = df_from_doc(doc)

In [339]:
df.sort_values(by=['overall_ranking'], ascending=False).head(50)

Unnamed: 0,token,lemma,appearance,difficulty,relative freqency,keyword score,word vector,difficulty_rank,keyword_rank,overall_ranking
1335,charwoman,charwoman,8,0.807,1.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.0,964.0,3.421
497,fretsaw,fretsaw,1,0.889,0.053115,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.0,964.0,2.720115
655,soughing,soughing,1,0.889,0.053115,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.0,964.0,2.720115
396,swinging,swinge,1,0.889,0.053115,0.0,"[0.44726, -0.19518, -0.33376, 0.30719, 0.52885...",3.0,964.0,2.720115
1358,teaboy,teaboy,1,0.889,0.053115,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.0,964.0,2.720115
1166,startlement,startlement,1,0.889,0.053115,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.0,964.0,2.720115
441,clerk,clerk,37,0.499,0.085463,0.989837,"[-0.16399, -0.21803, -0.002153, -0.057714, 0.2...",784.5,11.0,2.5723
16,EBook,ebook,3,0.53,0.003195,0.95878,"[-0.83196, 0.03004, 0.039797, 0.57698, 0.10398...",584.5,50.0,2.551975
1181,denuded,denude,1,0.836,0.03754,0.0,"[0.64199, -0.12962, -0.47799, 0.1615, -0.4265,...",6.0,964.0,2.54554
7,COPYRIGHTED,copyright,21,0.515,0.015974,0.972142,"[-0.61777, -0.54471, -0.066428, -0.17851, -0.7...",679.0,32.0,2.533117


### Wordclusters

In [340]:
doc = nlp("Let's take a ride in my new car. I will drive you home in my car. I love cars, which drive very fast and loud. Trucks and vans are the best. You can also cook in the back of my car. Food, meals and drinks are very tasty. My mother is all in to cooking good meals, foods and healthy dishes. This keeps me healthy while. I love the pasta and pies she bakes")

74 words in document
32 words in vocabulary
22 words in vocabulary without duplicates


In [341]:
df = df_from_doc(doc)

In [342]:
df.sort_values(by=['overall_ranking'], ascending=False).head(50)

Unnamed: 0,token,lemma,appearance,difficulty,relative freqency,keyword score,word vector,difficulty_rank,keyword_rank,overall_ranking
14,tasty,tasty,1,0.551,1.0,0.36264,"[-0.5076, -0.32474, 0.44828, 0.18926, 0.017181...",1.0,8.0,3.01564
19,pasta,pasta,1,0.543,0.852215,0.259213,"[-0.47621, -0.12565, 0.57201, -0.41622, 0.0878...",2.0,12.0,2.740427
21,bakes,bake,1,0.519,0.912728,0.0,"[-0.027228, 0.17126, 0.30655, -0.4484, -0.0939...",3.0,17.5,2.469728
16,healthy,healthy,2,0.441,0.409298,0.656656,"[-0.40781, 0.53036, -0.25297, 0.19766, -0.1103...",8.0,5.0,2.388954
3,car,car,4,0.342,0.375376,0.898871,"[0.20987, 0.46481, -0.24238, -0.065751, 0.6085...",16.5,1.0,2.300247
10,cook,cook,2,0.43,0.577998,0.314925,"[-0.34052, -0.24223, 0.5442, -0.069727, 0.0716...",10.0,9.0,2.182923
12,meals,meal,2,0.448,0.836605,0.0,"[-0.32314, -0.24613, 0.41368, -0.28846, -0.097...",6.0,17.5,2.180605
1,ride,ride,1,0.411,0.09778,0.764928,"[0.39289, -0.088632, -0.30353, -0.47467, 0.313...",11.5,2.0,2.095708
4,drive,drive,2,0.379,0.200662,0.737823,"[0.57215, 0.503, 0.068908, -0.41683, 0.081836,...",15.0,3.0,2.075485
6,loud,loud,1,0.443,0.190425,0.466753,"[0.065564, 0.49259, -0.4951, -0.3633, 0.63857,...",7.0,6.0,1.986178


### Wikipedia

In [343]:
text = '''Deep learning (also known as deep structured learning) is part of a broader family 
of machine learning methods based on artificial neural networks with representation 
        learning. Learning can be supervised, semi-supervised or unsupervised.[1][2][3]Deep-learning 
        architectures such as deep neural networks, deep belief networks, recurrent neural networks 
        and convolutional neural networks have been applied to fields including computer vision, 
        machine vision, speech recognition, natural language processing, audio recognition, social 
        network filtering, machine translation, bioinformatics, drug design, medical image analysis, 
        material inspection and board game programs, where they have produced results comparable 
        to and in some cases surpassing human expert performance.[4][5][6]Artificial neural networks 
        (ANNs) were inspired by information processing and distributed communication nodes in biological 
        systems. ANNs have various differences from biological brains. Specifically, neural networks tend 
        to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) 
        and analogue.[7][8][9]The adjective "deep" in deep learning refers to the use of multiple layers in 
        the network. Early work showed that a linear perceptron cannot be a universal classifier, and then 
        that a network with a nonpolynomial activation function with one hidden layer of unbounded width 
        can on the other hand so be. Deep learning is a modern variation which is concerned with an unbounded
        number of layers of bounded size, which permits practical application and optimized implementation, 
        while retaining theoretical universality under mild conditions. In deep learning the layers are 
        also permitted to be heterogeneous and to deviate widely from biologically informed connectionist 
        models, for the sake of efficiency, trainability and understandability, whence the "structured" part.'''

In [344]:
doc = nlp(text)

257 words in document
152 words in vocabulary
110 words in vocabulary without duplicates


In [345]:
df = df_from_doc(doc)

In [346]:
df.sort_values(by=['overall_ranking'], ascending=False).head(50)

Unnamed: 0,token,lemma,appearance,difficulty,relative freqency,keyword score,word vector,difficulty_rank,keyword_rank,overall_ranking
78,nonpolynomial,nonpolynomial,1,0.889,1.0,0.348088,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,60.5,4.015088
108,trainability,trainability,1,0.734,0.934694,0.349549,"[0.31718, 0.18675, 0.64009, -0.33493, -0.41486...",3.0,55.0,3.486243
109,understandability,understandability,1,0.722,0.724685,0.349549,"[0.05063, -0.12876, -0.084509, 0.12097, -0.498...",5.0,55.0,3.240234
33,bioinformatics,bioinformatic,1,0.738,1.0,0.0,"[-0.15368, -0.16517, 0.61039, -0.39638, 0.5215...",2.0,87.5,3.214
19,convolutional,convolutional,1,0.703,0.489316,0.521135,"[0.27546, 0.031269, -0.065607, -0.6496, -0.164...",7.0,34.0,3.119451
82,unbounded,unbounded,2,0.705,0.270206,0.713559,"[0.24013, 0.086081, 0.22037, -0.1213, 0.34822,...",6.0,14.0,3.098765
104,connectionist,connectionist,1,0.723,0.741101,0.0,"[0.21653, -0.35044, 0.53317, -0.59726, -0.1070...",4.0,87.5,2.910101
10,neural,neural,6,0.57,0.197697,0.9607,"[0.10273, 0.0059362, -0.019216, -0.14545, -0.9...",21.0,1.0,2.868397
9,artificial,artificial,1,0.526,0.001449,0.912344,"[-0.66059, 0.2348, -0.021227, -0.32737, -0.062...",32.0,4.0,2.491793
18,recurrent,recurrent,1,0.612,0.009786,0.564295,"[-0.46358, -0.012294, -0.2189, 0.0030803, -0.0...",14.5,20.5,2.410081


In [237]:
kwe(text)

[('neural', 0.039299857763947725),
 ('deep', 0.044282636655428344),
 ('networks', 0.05401423159410843),
 ('learning', 0.055954096724447976),
 ('artificial', 0.08765559185960589)]