# First Project Pipeline

## Setup

In [1]:
import spacy
from spacy.tokens import Token
from spacy.tokens import Doc
from spacy.language import Language

In [2]:
from gensim.summarization import keywords

In [3]:
from wordfreq import zipf_frequency
from wordfreq import word_frequency

In [4]:
import numpy as np
import pandas as pd
import regex as re

In [5]:
import nltk
#nltk.download('cmudict')

In [6]:
from nltk.corpus import cmudict

phoneme_dict = dict(cmudict.entries())

def syllable_counter(word):
    '''Count the syllables of a word via the CMU pronouncing dictionary.

    The word is looked up in `phoneme_dict` as given first; if that fails,
    the lower-cased form is tried, so capitalised or upper-case spellings
    are no longer silently reported as having no syllables.

    Parameters:
        word (str): the word (typically a lemma) to look up.

    Returns:
        int: number of phonemes that end in a digit, or 0 when the word is
        not in `phoneme_dict`.
    '''
    phonemes = phoneme_dict.get(word)
    if phonemes is None:
        # NOTE(review): the CMU dictionary shipped with NLTK appears to store
        # its headwords in lower case -- confirm against the corpus docs.
        phonemes = phoneme_dict.get(word.lower())
    if phonemes is None:
        return 0

    # Only phonemes with a trailing digit are counted; each one is treated
    # as the nucleus of one syllable.
    return sum(1 for phoneme in phonemes if phoneme[-1].isdigit())

In [7]:
!python --version

Python 3.8.0


## Data

In [8]:
text_file = '../data/kafka.txt'

In [9]:
#print(open(text_file, "r").read())

# Pipeline Components

## Preprocessing

### Wordcount

In [10]:
def wordcount(doc):
    '''Pipeline component: count the alphabetic tokens of the document.

    Every token whose `is_alpha` flag is set increases `doc._.wordcount`
    by one. The resulting total is printed and the document is returned.
    '''
    alphabetic_tokens = (tok for tok in doc if tok.is_alpha)
    for _ in alphabetic_tokens:
        doc._.wordcount += 1

    print(f'{doc._.wordcount} words in document')

    return doc

### Filter

In [11]:
def filter_tokens(doc):
    '''Pipeline component: mark every token that is not vocabulary material.

    A token stays included only if it is alphabetic, is not a stop word,
    is tagged as NOUN, VERB or ADJ, and has no entity type (`ent_type == 0`).
    All other tokens get `token._.is_excluded = True`. Each included token
    increases `doc._.included_wordcount` by one; the total is printed.
    '''
    content_pos = ['NOUN', 'VERB', 'ADJ']

    for tok in doc:
        is_vocabulary = (
            tok.is_alpha
            and not tok.is_stop
            and tok.pos_ in content_pos
            and tok.ent_type == 0
        )
        if is_vocabulary:
            doc._.included_wordcount += 1
        else:
            tok._.is_excluded = True

    print(f'{doc._.included_wordcount} words in vocabulary')

    return doc

### Eliminating Duplicates

In [12]:
def elim_dup(doc):
    '''Pipeline component: keep one token per lemma and count its occurrences.

    The first included token of a lemma stays included and starts with
    `appearance = 1`. Every later included token with the same lemma bumps
    that first token's `appearance`, is marked as excluded and lowers
    `doc._.included_wordcount` by one. The remaining count is printed.
    '''
    first_by_lemma = {}

    for tok in doc:
        if tok._.is_excluded:
            continue

        first = first_by_lemma.get(tok.lemma_)
        if first is None:
            # First time this lemma shows up: this token represents it.
            tok._.appearance = 1
            first_by_lemma[tok.lemma_] = tok
            continue

        # Repeat of a known lemma: credit the representative, drop this one.
        first._.appearance += 1
        tok._.is_excluded = True
        doc._.included_wordcount -= 1

    print(f'{doc._.included_wordcount} words in vocabulary without duplicates')

    return doc

## Word Difficulty

### Rating Difficulty

In [28]:
def syl_weight(n):
    '''Return 0.5 + 0.25 + ... + 0.5**n, the weight for a word of n syllables.

    Each additional syllable contributes half as much as the previous one;
    for n = 0 the result is 0.
    '''
    total = 0
    term = 0.5
    for _ in range(n):
        total += term
        term *= 0.5  # next syllable counts half as much
    return total

In [47]:
def get_difficulty(doc):
    '''Pipeline component: store a difficulty score on every included token.

    The score combines how rare the lemma is (8 minus its English Zipf
    frequency) with a syllable-based weight from `syl_weight`. The sum is
    divided by 9 and rounded to three decimals; the original author intends
    this as a normalisation to the range 0-1.
    '''
    for tok in doc:
        if tok._.is_excluded:
            continue

        lemma = tok.lemma_
        rarity = 8 - zipf_frequency(lemma, 'en')  # intended range 0-8
        rarity += syl_weight(syllable_counter(lemma))  # intended range 0-9
        tok._.difficulty = round(rarity / 9, 3)

    return doc

## Relative Frequency

In [13]:
def calculate_relativ_freq(doc):
    '''Pipeline component: relate a word's in-text frequency to general English.

    For every included token the appearance count (capped at 20) is divided
    by `doc._.wordcount`, squared, and divided by the lemma's overall English
    word frequency. The rounded result is stored in `token._.relativ_freq`.
    Tokens whose overall frequency is 0 keep the extension's default value.
    '''
    appearance_cap = 20  # maximal appearance weight

    for tok in doc:
        if tok._.is_excluded:
            continue

        capped_appearance = min(appearance_cap, tok._.appearance)
        in_text = capped_appearance / doc._.wordcount
        in_language = word_frequency(tok.lemma_, 'en')
        if in_language == 0:
            # Unknown to wordfreq: the ratio would divide by zero, skip it.
            continue

        tok._.relativ_freq = round(in_text**2 / in_language, 3)

    return doc

## Word Relevance

### Keywords Extraction

In [15]:
def check_keyphrases(doc):
    '''Pipeline component: flag keyword tokens and score similarity to them.

    Keywords are extracted from `doc.text` with gensim's `keywords`. Every
    token whose text equals one of the extracted keywords gets
    `token._.is_keyword = True`; the first token of each distinct keyword
    text is collected in `doc._.keywords`. Afterwards each token's
    `keyword_score` is raised to its highest similarity with any of the
    collected keyword tokens.

    Returns:
        The same `doc`, with the keyword extensions filled in.
    '''
    # TODO: decide what to do about tokens that have no word vector.

    # Sets give constant-time membership tests; the original used lists,
    # which made the scan quadratic in the number of keywords.
    extracted = set(keywords(doc.text, split=True))

    keyword_tokens = []
    seen_texts = set()
    for token in doc:
        if token.text in extracted:
            token._.is_keyword = True
            if token.text not in seen_texts:
                keyword_tokens.append(token)
                seen_texts.add(token.text)
    doc._.keywords = keyword_tokens

    print(f'{len(keyword_tokens)} words are keywords')

    # Keep, per token, the best similarity to any keyword token. The loop
    # variable has its own name so it no longer shadows the keyword list.
    for token in doc:
        for keyword_token in keyword_tokens:
            kw_score = token.similarity(keyword_token)
            if kw_score > token._.keyword_score:
                token._.keyword_score = kw_score

    return doc

# Creating the Pipeline

In [20]:
def create_pipeline():
    '''Load the large English spaCy model and attach the custom components.

    The parser is disabled. All custom `Doc`/`Token` extensions used by the
    components are registered (with `force=True`, so re-running the cell
    overwrites earlier registrations), then the components are appended in
    processing order.

    Returns:
        The configured `nlp` object.
    '''
    nlp = spacy.load('en_core_web_lg', disable = ['parser'])

    # Extension name -> default value, grouped by the object they live on.
    doc_extensions = {
        'wordcount': 0,
        'included_wordcount': 0,
        'keywords': [],
    }
    token_extensions = {
        'is_excluded': False,
        'appearance': np.nan,
        'difficulty': 0,
        'relativ_freq': np.nan,
        'is_keyword': False,
        'keyword_score': 0,
    }
    for name, default in doc_extensions.items():
        Doc.set_extension(name, default=default, force=True)
    for name, default in token_extensions.items():
        Token.set_extension(name, default=default, force=True)

    # Order matters: preprocessing first, then the word-difficulty measures.
    components = (
        wordcount,
        filter_tokens,
        elim_dup,
        get_difficulty,
        calculate_relativ_freq,
    )
    for component in components:
        nlp.add_pipe(component)

    # The keyword component is currently switched off; its extensions are
    # still registered above, so their defaults remain readable.
    # nlp.add_pipe(check_keyphrases)

    return nlp

In [48]:
nlp = create_pipeline()
nlp.pipe_names

['tagger',
 'ner',
 'wordcount',
 'filter_tokens',
 'elim_dup',
 'get_difficulty',
 'calculate_relativ_freq']

# Test Processing

In [49]:
def df_from_doc(doc):
    '''Collect the custom attributes of all included tokens in a DataFrame.

    One row is produced per token that is not excluded. Besides the raw
    attributes, descending ranks for difficulty and keyword score are added,
    and `overall_ranking` is currently a plain copy of `difficulty`.
    '''
    column_names = ['token', 'lemma', 'appearance', 'difficulty', 'relativ freqency', 'keyword score', 'is keyword']
    rows = [
        (
            tok,
            tok.lemma_,
            tok._.appearance,
            tok._.difficulty,
            tok._.relativ_freq,
            tok._.keyword_score,
            tok._.is_keyword,
        )
        for tok in doc
        if not tok._.is_excluded
    ]
    df = pd.DataFrame(rows, columns=column_names)

    # Rank 1 is the highest value in each of the two score columns.
    ranks = df[['difficulty', 'keyword score']].rank(ascending=False)
    df[['difficulty_rank', 'keyword_rank']] = ranks

    df['overall_ranking'] = df['difficulty']

    return df

### Test sentence

In [50]:
doc = nlp("Hi I'm a little little boy. Get me a piece of cake or I'll killed your mother. I am the mother of your mother")

24 words in document
9 words in vocabulary
6 words in vocabulary without duplicates


In [51]:
df = df_from_doc(doc)

In [52]:
df.sort_values(by=['overall_ranking'], ascending=False)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency,keyword score,is keyword,difficulty_rank,keyword_rank,overall_ranking
3,cake,cake,1,0.452,64.539,0,False,1.0,3.5,0.452
2,piece,piece,1,0.392,18.608,0,False,2.0,3.5,0.392
5,mother,mother,3,0.386,81.806,0,False,3.0,3.5,0.386
4,killed,kill,1,0.379,14.115,0,False,4.0,3.5,0.379
1,boy,boy,1,0.367,10.988,0,False,5.0,3.5,0.367
0,little,little,2,0.333,12.357,0,False,6.0,3.5,0.333


________________________________

### Kafka

In [53]:
# Run the whole pipeline on the text stored at `text_file`.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches the encoding of the text file.
with open(text_file, "r") as f:
    doc = nlp(f.read())

25062 words in document
7812 words in vocabulary
1873 words in vocabulary without duplicates


In [54]:
df = df_from_doc(doc)

In [60]:
df.sort_values(by=['overall_ranking'], ascending=False)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency,keyword score,is keyword,difficulty_rank,keyword_rank,overall_ranking
1358,teaboy,teaboy,1,0.889,,0,False,3.0,937.0,0.889
1166,startlement,startlement,1,0.889,,0,False,3.0,937.0,0.889
655,soughing,soughing,1,0.889,,0,False,3.0,937.0,0.889
396,swinging,swinge,1,0.889,,0,False,3.0,937.0,0.889
497,fretsaw,fretsaw,1,0.889,,0,False,3.0,937.0,0.889
...,...,...,...,...,...,...,...,...,...,...
117,times,time,74,0.247,0.0,0,False,1869.0,937.0,0.247
32,like,like,12,0.234,0.0,0,False,1870.0,937.0,0.234
1133,wo,will,1,0.228,0.0,0,False,1871.0,937.0,0.228
1296,ones,one,1,0.224,0.0,0,False,1872.0,937.0,0.224


In [268]:
df.sort_values(by=['difficulty'], ascending=False).head(20)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency,cluster
302,swinging,swinge,1,8.0,,6
827,saddened,sadden,1,5.97,0.014879,4
858,assailed,assail,1,5.91,0.012944,4
203,hearer,hearer,1,5.9,0.012636,4
1146,rumination,rumination,1,5.82,0.010544,4
735,incarcerated,incarcerate,1,5.81,0.010272,4
846,perversity,perversity,1,5.76,0.00915,4
711,construed,construe,1,5.76,0.00915,7
582,unthinking,unthinking,1,5.74,0.008748,4
923,protrusions,protrusion,1,5.68,0.007618,6


In [269]:
df.sort_values(by=['relativ freqency'], ascending=False).head(30)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency,cluster
1208,distribute,distribute,17,4.21,0.074573,7
461,locksmith,locksmith,4,5.35,0.056988,7
1297,donations,donation,16,4.03,0.043684,7
780,violin,violin,12,4.28,0.043669,6
10,COPYRIGHTED,copyright,21,3.76,0.0366,9
656,couch,couch,17,3.89,0.035668,6
1214,trademark,trademark,11,4.14,0.026608,7
777,clothes,clothe,3,5.23,0.024327,6
554,protruding,protrude,2,5.57,0.023674,6
1238,refund,refund,9,4.22,0.021386,7


In [217]:
df.sort_values(by=['cluster'], ascending=True).head(50)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency,cluster
0,Translated,translate,3,4.03,0.001536,0
941,events,event,1,2.85,1.1e-05,0
959,normally,normally,3,3.4,0.00036,0
966,write,write,8,2.99,0.000999,0
967,reading,reading,1,2.92,1.3e-05,0
968,recent,recent,1,2.97,1.5e-05,0
972,wealth,wealth,1,3.48,4.8e-05,0
983,arrange,arrange,3,3.99,0.001405,0
990,ease,ease,1,3.66,7.3e-05,0
994,frequently,frequently,2,3.43,0.000171,0


### Wordclusters

In [242]:
doc = nlp("Let's take a ride in my new car. I will drive you home in my car. I love cars, which drive very fast and loud. Trucks and vans are the best. You can also cook in the back of my car. Food, meals and drinks are very tasty. My mother is all in to cooking good meals, foods and healthy dishes. This keeps me healthy while. I love the pasta and pies she bakes")

74 overall words
34 words included
24 words without duplicate included
Keywords: 3


In [148]:
doc[8]._.is_keyword

True

In [62]:
# Lemmas of all tokens in the current `doc` that are flagged as keywords.
words = [token.lemma_ for token in doc if token._.is_keyword]
print(words)

[]


In [263]:
# Tabulate the included tokens of the current `doc`, including their cluster.
# NOTE(review): `token._.cluster` is not registered by the `create_pipeline`
# shown in this notebook -- presumably this cell relies on an earlier pipeline
# version that added a clustering component; confirm before re-running.
data = []
for token in doc:
    if not token._.is_excluded:
        data.append((token, token.lemma_, token._.appearance, token._.difficulty, token._.relativ_freq, token._.cluster))
df = pd.DataFrame(data, columns=['token', 'lemma', 'appearance', 'difficulty', 'relativ freqency', 'cluster'])
df.head(100)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency,cluster
0,cars,car,1,2.58,2.261916,3
1,drive,drive,1,2.91,4.836455,3
2,fast,fast,1,2.91,4.836455,3
3,Trucks,truck,1,3.45,16.757296,3
4,vans,van,1,3.2,9.427639,3
5,best,good,2,1.88,1.802679,3
6,cooking,cook,1,3.37,13.93171,1
7,meals,meal,1,3.53,20.165559,1
8,foods,food,1,2.58,2.261916,1
9,healthy,healthy,2,3.22,39.461625,1


In [264]:
doc._.cluster_sizes

Counter({0: 3, 3: 6, 1: 8, 2: 1})

### Wikipedia

In [44]:
text = '''Deep learning (also known as deep structured learning) is part of a broader family 
of machine learning methods based on artificial neural networks with representation 
        learning. Learning can be supervised, semi-supervised or unsupervised.[1][2][3]Deep-learning 
        architectures such as deep neural networks, deep belief networks, recurrent neural networks 
        and convolutional neural networks have been applied to fields including computer vision, 
        machine vision, speech recognition, natural language processing, audio recognition, social 
        network filtering, machine translation, bioinformatics, drug design, medical image analysis, 
        material inspection and board game programs, where they have produced results comparable 
        to and in some cases surpassing human expert performance.[4][5][6]Artificial neural networks 
        (ANNs) were inspired by information processing and distributed communication nodes in biological 
        systems. ANNs have various differences from biological brains. Specifically, neural networks tend 
        to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) 
        and analogue.[7][8][9]The adjective "deep" in deep learning refers to the use of multiple layers in 
        the network. Early work showed that a linear perceptron cannot be a universal classifier, and then 
        that a network with a nonpolynomial activation function with one hidden layer of unbounded width 
        can on the other hand so be. Deep learning is a modern variation which is concerned with an unbounded
        number of layers of bounded size, which permits practical application and optimized implementation, 
        while retaining theoretical universality under mild conditions. In deep learning the layers are 
        also permitted to be heterogeneous and to deviate widely from biologically informed connectionist 
        models, for the sake of efficiency, trainability and understandability, whence the "structured" part.'''

In [79]:
doc = nlp(text)

257 overall words
156 words included
114 words without duplicate included
17 keywords found


  kw_score = token.similarity(kw)


In [84]:
# Print the keyword score of every token that was not excluded.
for token in doc:
    if not token._.is_excluded:
        print(token._.keyword_score)

0.3938454
0.5744245
0.33924755
0.47624484
0.43483993
0.3270241
1.0
0.52090365
0.4847936
0.46523783
0.49933636
0.8762495
0.44071323
0.2955146
0.25625694
0.2955146
0.5744245
0.40140018
0.5100599
0.31690955
0.22550173
0.44786528
0.38716203
0.4775482
0.53911096
0.41549888
0.4988885
1.0
0.5007288
0.4610773
1.0
0.40415663
0.5314103
0.37051588
0.40864474
0.38324967
0.48531094
0.45266506
0.38082403
0.50242513
1.0
0.40681517
0.31428733
1.0
0.43199307
0.4897565
1.0
0.4878846
0.36730403
1.0
0.59545803
0.45384392
0.36053327
1.0
0.46185136
1.0
0.43217084
1.0
0.52928066
0.4549633
0.3777898
0.43885174
0.35806483
0.34795207
0.43768883
0.35460824
0.6406614
0.4451714
0.47006604
0.26633573
0.36964554
0.47620687
0.45806268
1.0
0.3019013
0.4692436
0.44344023
0.34928936
1.0
0.31417662
0.0
1.0
0.48357797
0.31554163
1.0
0.40788516
0.48470917
0.5270915
0.38272974
0.47252935
0.50294
0.49841097
0.34320146
0.44353783
1.0
0.5282285
0.4437383
0.45886028
0.42962694
0.59137577
0.49776393
0.23172696
0.38636324
0.43876