# First Project Pipeline

## Setup

In [1]:
import spacy
from spacy.tokens import Token
from spacy.tokens import Doc
from spacy.language import Language

In [2]:
from gensim.summarization import keywords

In [3]:
from wordfreq import zipf_frequency
from wordfreq import word_frequency

In [4]:
import numpy as np
import pandas as pd
import regex as re

In [5]:
import nltk
#nltk.download('cmudict')

In [6]:
from nltk.corpus import cmudict

# Lookup table from a word to its phoneme sequence, built from the CMU
# Pronouncing Dictionary entries. dict() keeps a single entry per word: if
# entries() yields the same word more than once, the last pronunciation wins.
phoneme_dict = dict(cmudict.entries())

def syllable_counter(word):
    '''Return the number of syllables of `word` according to `phoneme_dict`.

    A syllable is counted for every phoneme whose last character is a digit
    (the stress marker carried by vowel phonemes). The word is looked up as
    given first; if that fails, its lowercase form is tried, so capitalised
    input such as "Mother" is no longer reported as having 0 syllables.

    Returns 0 when the word is not in the dictionary at all.
    '''
    phonemes = phoneme_dict.get(word)
    if phonemes is None and isinstance(word, str):
        # Fall back to the lowercase spelling before giving up.
        phonemes = phoneme_dict.get(word.lower())
    if phonemes is None:
        return 0
    return sum(1 for phoneme in phonemes if phoneme[-1].isdigit())

In [200]:
class Minimum():
    '''Running minimum: `value` holds the smallest value offered so far.'''

    def __init__(self, initial_value):
        # Starting point; only strictly smaller candidates replace it.
        self.value = initial_value

    def update_minimum(self, potential_min):
        '''Lower `value` to `potential_min` if the candidate is smaller.'''
        # min() returns its first argument unless the second is strictly
        # smaller, so ties keep the current value.
        self.value = min(self.value, potential_min)
            
class Maximum():
    '''Running maximum: `value` holds the largest value offered so far.'''

    def __init__(self, initial_value):
        # Starting point; only strictly larger candidates replace it.
        self.value = initial_value

    def update_maximum(self, potential_min):
        '''Raise `value` to the candidate if the candidate is larger.

        The parameter is called `potential_min` for historical reasons; it is
        the candidate for the new maximum.
        '''
        # max() returns its first argument unless the second is strictly
        # larger, so ties keep the current value.
        self.value = max(self.value, potential_min)

In [201]:
!python --version

Python 3.8.0


## Data

In [202]:
# Path of the text file that is read and run through the pipeline in the
# "Kafka" test section below.
text_file = '../data/kafka.txt'

In [203]:
#print(open(text_file, "r").read())

# Pipeline Components

## Preprocessing

### Wordcount

In [204]:
def wordcount(doc):
    '''Pipeline component that adds the number of alphabetic tokens to
    `doc._.wordcount`, prints the total and returns the doc.'''
    alphabetic_tokens = sum(1 for token in doc if token.is_alpha)
    doc._.wordcount += alphabetic_tokens

    print(f'{doc._.wordcount} words in document')

    return doc

### Filter

In [205]:
def filter_tokens(doc):
    '''Pipeline component that marks every token that should not enter the
    vocabulary and counts the remaining ones.

    A token stays included only if it is alphabetic, is not a stop word, is
    tagged as NOUN, VERB or ADJ and is not part of a named entity. All other
    tokens get `token._.is_excluded = True`. The number of included tokens is
    added to `doc._.included_wordcount`, printed, and the doc is returned.
    '''
    content_pos = ['NOUN', 'VERB', 'ADJ']

    for token in doc:
        keep = (
            token.is_alpha
            and not token.is_stop            # no stop words
            and token.pos_ in content_pos    # content words only
            and token.ent_type == 0          # 0 means "no entity type"
        )
        if keep:
            doc._.included_wordcount += 1
        else:
            token._.is_excluded = True

    print(f'{doc._.included_wordcount} words in vocabulary')

    return doc

### Eliminating Duplicates

In [206]:
def elim_dup(doc):
    '''Pipeline component that keeps one token per lemma and counts how
    often each lemma occurs.

    The first included token of a lemma stays in the vocabulary and collects
    the occurrence count in `token._.appearance`. Every later token with the
    same lemma is excluded and `doc._.included_wordcount` is reduced by one.
    '''
    first_token_of = {}

    for token in doc:
        if token._.is_excluded:
            continue

        lemma = token.lemma_
        representative = first_token_of.get(lemma)
        if representative is None:
            # First occurrence: this token represents the lemma from now on.
            token._.appearance = 1
            first_token_of[lemma] = token
            continue

        # Repeated lemma: credit the representative and drop this token.
        representative._.appearance += 1
        token._.is_excluded = True
        doc._.included_wordcount -= 1

    print(f'{doc._.included_wordcount} words in vocabulary without duplicates')

    return doc

## Word Difficulty

### Rating Difficulty

In [207]:
def syl_weight(n):
    '''Return 0.5 + 0.25 + ... + 0.5**n, the weight for a word of n syllables.

    Each additional syllable contributes half as much as the previous one, so
    the result stays below 1. For n == 0 the result is the integer 0.
    '''
    total = 0
    term = 0.5
    for _ in range(n):
        total += term
        term *= 0.5
    return total

In [208]:
def get_difficulty(doc):
    '''Pipeline component that stores a difficulty score in
    `token._.difficulty` for every token that is not excluded.

    The score combines how rare the lemma is in English (8 minus its Zipf
    frequency) with a syllable weight from `syl_weight`, divides the sum by 9
    and rounds to three decimals. The divisor follows the original author's
    note that the raw sum lies on a 0-9 scale.
    '''
    included_tokens = (token for token in doc if not token._.is_excluded)

    for token in included_tokens:
        lemma = token.lemma_
        rarity = 8 - zipf_frequency(lemma, 'en')
        rarity += syl_weight(syllable_counter(lemma))
        token._.difficulty = round(rarity / 9, 3)

    return doc

### Relative Frequency

In [209]:
def calculate_relativ_freq(doc):
    '''Pipeline component that scores how over-represented each included
    word is in this document compared with general English.

    For every token that is not excluded the raw score is
    (appearance / doc wordcount) ** 2 / general word frequency, rounded to
    three decimals. Lemmas with a general frequency of 0 are scored last,
    using the smallest non-zero frequency seen in this document (or 1 if there
    was none). Finally all scores are divided by the largest score so that the
    top word gets 1.0; the result is stored in `token._.relativ_freq`.

    If every raw score rounds to 0 the normalisation is skipped and the scores
    stay at 0, instead of raising ZeroDivisionError.
    '''
    calculate_last = []
    min_freq = Minimum(1)
    max_score = Maximum(0)

    def calc_rel_freq(word_freq, token):
        # Squaring the in-document share rewards repeated words more strongly.
        return round(((token._.appearance / doc._.wordcount) ** 2) / word_freq, 3)

    for token in doc:
        if token._.is_excluded:
            continue

        overall_word_freq = word_frequency(token.lemma_, 'en')
        if overall_word_freq == 0:
            # Unknown to the frequency list: score later, once the smallest
            # known frequency of this document is available.
            calculate_last.append(token)
            continue

        min_freq.update_minimum(overall_word_freq)
        token._.relativ_freq = calc_rel_freq(overall_word_freq, token)
        max_score.update_maximum(token._.relativ_freq)

    for token in calculate_last:
        token._.relativ_freq = calc_rel_freq(min_freq.value, token)
        max_score.update_maximum(token._.relativ_freq)

    # Normalise to 0-1. A maximum of 0 means every score is already 0, so
    # there is nothing to scale and dividing would fail.
    if max_score.value > 0:
        for token in doc:
            if not token._.is_excluded:
                token._.relativ_freq /= max_score.value

    return doc

## Word Relevance

### Keywords Extraction

In [210]:
def check_keyphrases(doc):
    '''Pipeline component that flags keyword tokens and scores every token by
    its similarity to the keywords.

    Keywords are extracted from `doc.text`. Every token whose text equals one
    of them gets `token._.is_keyword = True`; the first token of each distinct
    keyword text is collected in `doc._.keywords`. Afterwards
    `token._.keyword_score` of every token is raised to its highest similarity
    with any of those keyword tokens.
    '''
    # TODO: decide how to handle tokens that have no word vector

    # Sets give constant-time membership tests inside the token loop.
    extracted_keywords = set(keywords(doc.text, split=True))

    keywords_token = []
    seen_keyword_texts = set()
    for token in doc:
        text = token.text
        if text in extracted_keywords:
            token._.is_keyword = True
            if text not in seen_keyword_texts:
                seen_keyword_texts.add(text)
                keywords_token.append(token)
    doc._.keywords = keywords_token

    print(f'{len(keywords_token)} words are keywords')

    for token in doc:
        for keyword_token in keywords_token:
            kw_score = token.similarity(keyword_token)
            if kw_score > token._.keyword_score:
                token._.keyword_score = kw_score

    return doc

# Creating the Pipeline

In [211]:
def create_pipeline():
    '''Load the large English spaCy model (without the parser), register all
    custom Doc/Token attributes and append the custom components.

    Components are added in this order: wordcount, filter_tokens, elim_dup,
    get_difficulty, calculate_relativ_freq. The keyword attributes are
    registered as well, but the keyword component itself is not added.
    '''
    nlp = spacy.load('en_core_web_lg', disable = ['parser'])

    # (owner class, attribute name, default value)
    extensions = (
        # preprocessing
        (Doc, 'wordcount', 0),
        (Doc, 'included_wordcount', 0),
        (Token, 'is_excluded', False),
        (Token, 'appearance', np.nan),
        # word difficulty
        (Token, 'difficulty', 0),
        (Token, 'relativ_freq', np.nan),
        # word relevance (attributes only; check_keyphrases is not in the pipeline)
        (Doc, 'keywords', []),
        (Token, 'is_keyword', False),
        (Token, 'keyword_score', 0),
    )
    for owner, attribute, default_value in extensions:
        owner.set_extension(attribute, default=default_value, force=True)

    components = (
        wordcount,
        filter_tokens,
        elim_dup,                 # eliminate duplicates
        get_difficulty,
        calculate_relativ_freq,
    )
    for component in components:
        nlp.add_pipe(component)

    return nlp

In [212]:
# Build the pipeline once; the bare expression below makes the notebook
# display the names of its components.
nlp = create_pipeline()
nlp.pipe_names

['tagger',
 'ner',
 'wordcount',
 'filter_tokens',
 'elim_dup',
 'get_difficulty',
 'calculate_relativ_freq']

# Test Processing

In [223]:
def df_from_doc(doc):
    '''Collect the scores of all included tokens of `doc` in a DataFrame.

    One row is created per token that is not excluded. Besides the stored
    token attributes the frame gets descending ranks for difficulty and
    keyword score, and `overall_ranking`, the sum of difficulty and relative
    frequency.
    '''
    column_names = ['token', 'lemma', 'appearance', 'difficulty', 'relative freqency', 'keyword score', 'is keyword']
    rows = [
        (
            token,
            token.lemma_,
            token._.appearance,
            token._.difficulty,
            token._.relativ_freq,
            token._.keyword_score,
            token._.is_keyword,
        )
        for token in doc
        if not token._.is_excluded
    ]
    df = pd.DataFrame(rows, columns=column_names)

    # Rank 1 is the highest score; ties share the average rank.
    ranks = df[['difficulty', 'keyword score']].rank(ascending=False)
    df['difficulty_rank'] = ranks['difficulty']
    df['keyword_rank'] = ranks['keyword score']

    df['overall_ranking'] = df['difficulty'] + df['relative freqency']

    return df

### Test sentence

In [214]:
doc = nlp("Hi I'm a little little boy. Get me a piece of cake or I'll killed your mother. I am the mother of your mother")

24 words in document
9 words in vocabulary
6 words in vocabulary without duplicates


In [215]:
df = df_from_doc(doc)

In [216]:
df.sort_values(by=['overall_ranking'], ascending=False)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency,keyword score,is keyword,difficulty_rank,keyword_rank,overall_ranking
3,cake,cake,1,0.452,0.788927,0,False,1.0,3.5,0.452
2,piece,piece,1,0.392,0.227465,0,False,2.0,3.5,0.392
5,mother,mother,3,0.386,1.0,0,False,3.0,3.5,0.386
4,killed,kill,1,0.379,0.172542,0,False,4.0,3.5,0.379
1,boy,boy,1,0.367,0.134318,0,False,5.0,3.5,0.367
0,little,little,2,0.333,0.151052,0,False,6.0,3.5,0.333


________________________________

### Kafka

In [217]:
# Read the whole text file and run it through the pipeline in a single call.
# NOTE(review): open() is called without an explicit encoding, so the
# platform default is used - confirm it matches the file's encoding.
with open(text_file, "r") as f:
    doc = nlp(f.read())

25062 words in document
7812 words in vocabulary
1873 words in vocabulary without duplicates


In [224]:
df = df_from_doc(doc)

In [225]:
df.sort_values(by=['overall_ranking'], ascending=False).head(50)

Unnamed: 0,token,lemma,appearance,difficulty,relative freqency,keyword score,is keyword,difficulty_rank,keyword_rank,overall_ranking
1335,charwoman,charwoman,8,0.807,1.0,0,False,7.0,937.0,1.807
1166,startlement,startlement,1,0.889,0.053115,0,False,3.0,937.0,0.942115
1358,teaboy,teaboy,1,0.889,0.053115,0,False,3.0,937.0,0.942115
497,fretsaw,fretsaw,1,0.889,0.053115,0,False,3.0,937.0,0.942115
396,swinging,swinge,1,0.889,0.053115,0,False,3.0,937.0,0.942115
655,soughing,soughing,1,0.889,0.053115,0,False,3.0,937.0,0.942115
1181,denuded,denude,1,0.836,0.03754,0,False,6.0,937.0,0.87354
1804,unenforceability,unenforceability,1,0.769,0.053115,0,False,11.0,937.0,0.822115
1215,entombed,entomb,1,0.8,0.017971,0,False,8.0,937.0,0.817971
1194,glowering,glower,1,0.796,0.016374,0,False,9.0,937.0,0.812374


### Wordclusters

In [226]:
doc = nlp("Let's take a ride in my new car. I will drive you home in my car. I love cars, which drive very fast and loud. Trucks and vans are the best. You can also cook in the back of my car. Food, meals and drinks are very tasty. My mother is all in to cooking good meals, foods and healthy dishes. This keeps me healthy while. I love the pasta and pies she bakes")

74 words in document
32 words in vocabulary
22 words in vocabulary without duplicates


In [227]:
df = df_from_doc(doc)

In [228]:
df.sort_values(by=['overall_ranking'], ascending=False).head(50)

Unnamed: 0,token,lemma,appearance,difficulty,relative freqency,keyword score,is keyword,difficulty_rank,keyword_rank,overall_ranking
14,tasty,tasty,1,0.551,1.0,0,False,1.0,11.5,1.551
21,bakes,bake,1,0.519,0.912728,0,False,3.0,11.5,1.431728
19,pasta,pasta,1,0.543,0.852215,0,False,2.0,11.5,1.395215
12,meals,meal,2,0.448,0.836605,0,False,6.0,11.5,1.284605
10,cook,cook,2,0.43,0.577998,0,False,10.0,11.5,1.007998
17,dishes,dish,1,0.482,0.425516,0,False,4.0,11.5,0.907516
20,pies,pie,1,0.48,0.408622,0,False,5.0,11.5,0.888622
16,healthy,healthy,2,0.441,0.409298,0,False,8.0,11.5,0.850298
3,car,car,4,0.342,0.375376,0,False,16.5,11.5,0.717376
6,loud,loud,1,0.443,0.190425,0,False,7.0,11.5,0.633425


### Wikipedia

In [229]:
text = '''Deep learning (also known as deep structured learning) is part of a broader family 
of machine learning methods based on artificial neural networks with representation 
        learning. Learning can be supervised, semi-supervised or unsupervised.[1][2][3]Deep-learning 
        architectures such as deep neural networks, deep belief networks, recurrent neural networks 
        and convolutional neural networks have been applied to fields including computer vision, 
        machine vision, speech recognition, natural language processing, audio recognition, social 
        network filtering, machine translation, bioinformatics, drug design, medical image analysis, 
        material inspection and board game programs, where they have produced results comparable 
        to and in some cases surpassing human expert performance.[4][5][6]Artificial neural networks 
        (ANNs) were inspired by information processing and distributed communication nodes in biological 
        systems. ANNs have various differences from biological brains. Specifically, neural networks tend 
        to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) 
        and analogue.[7][8][9]The adjective "deep" in deep learning refers to the use of multiple layers in 
        the network. Early work showed that a linear perceptron cannot be a universal classifier, and then 
        that a network with a nonpolynomial activation function with one hidden layer of unbounded width 
        can on the other hand so be. Deep learning is a modern variation which is concerned with an unbounded
        number of layers of bounded size, which permits practical application and optimized implementation, 
        while retaining theoretical universality under mild conditions. In deep learning the layers are 
        also permitted to be heterogeneous and to deviate widely from biologically informed connectionist 
        models, for the sake of efficiency, trainability and understandability, whence the "structured" part.'''

In [230]:
doc = nlp(text)

257 words in document
152 words in vocabulary
110 words in vocabulary without duplicates


In [231]:
df = df_from_doc(doc)

In [232]:
df.sort_values(by=['overall_ranking'], ascending=False).head(50)

Unnamed: 0,token,lemma,appearance,difficulty,relative freqency,keyword score,is keyword,difficulty_rank,keyword_rank,overall_ranking
78,nonpolynomial,nonpolynomial,1,0.889,1.0,0,False,1.0,55.5,1.889
33,bioinformatics,bioinformatic,1,0.738,1.0,0,False,2.0,55.5,1.738
108,trainability,trainability,1,0.734,0.934694,0,False,3.0,55.5,1.668694
104,connectionist,connectionist,1,0.723,0.741101,0,False,4.0,55.5,1.464101
109,understandability,understandability,1,0.722,0.724685,0,False,5.0,55.5,1.446685
19,convolutional,convolutional,1,0.703,0.489316,0,False,7.0,55.5,1.192316
82,unbounded,unbounded,2,0.705,0.270206,0,False,6.0,55.5,0.975206
10,neural,neural,6,0.57,0.197697,0,False,21.0,55.5,0.767697
77,classifier,classifier,1,0.63,0.107009,0,False,11.0,55.5,0.737009
98,universality,universality,1,0.675,0.028168,0,False,8.0,55.5,0.703168
