# First Project Pipeline

## Setup

In [12]:
import spacy
from spacy.tokens import Token
from spacy.tokens import Doc

In [13]:
from wordfreq import zipf_frequency
from wordfreq import word_frequency

In [14]:
import numpy as np
import pandas as pd
import regex as re

In [15]:
#import nltk
#nltk.download('cmudict')

In [16]:
from nltk.corpus import cmudict

# Word -> phoneme-sequence lookup built from the CMU Pronouncing Dictionary entries.
# dict() keeps only the last entry when the same word is listed more than once.
phoneme_dict = dict(cmudict.entries())

def syllable_counter(word):
    '''Count the syllables of a word using the pronunciation lookup.

    The word is lower-cased before the lookup so that "Mother" and "mother"
    give the same result (assumes the keys of ``phoneme_dict`` are
    lower-case, as in NLTK's cmudict).

    Args:
        word: the word to look up.

    Returns:
        The number of phonemes ending in a digit (one per syllable),
        or 0 if the word is not in ``phoneme_dict``.
    '''
    phonemes = phoneme_dict.get(word.lower())
    if phonemes is None:
        return 0
    # Only vowel phonemes carry a trailing stress digit, so counting the
    # phonemes that end in a digit counts the syllables.
    return sum(1 for phoneme in phonemes if phoneme[-1].isdigit())

In [17]:
def get_entities(doc):
    '''Collect the words that make up the named entities of a doc.

    Each entity text is split into its runs of ASCII letters, so a
    multi-word entity contributes every one of its words separately.

    Args:
        doc: object whose ``ents`` yields entities with a ``text`` attribute.

    Returns:
        A flat list of the words in entity order; duplicates are kept.
    '''
    return [
        word
        for ent in doc.ents
        for word in re.findall(r'[a-zA-Z]+', ent.text)
    ]

## Data

In [18]:
# Relative path of the text that is read and processed in the "Kafka" section.
text_file = '../data/kafka.txt'

In [20]:
#print(open(text_file, "r").read())

# Create Pipeline

### Base Pipeline

In [255]:
# Load the large English model with the dependency parser disabled;
# the tagger and the entity recogniser stay active.
nlp = spacy.load('en_core_web_lg', disable = ['parser'])

In [22]:
# Show the components that are active in the base pipeline.
nlp.pipe_names

['tagger', 'ner']

### Wordcount

In [23]:
# Custom Doc attribute (doc._.wordcount) for the number of alphabetic tokens; starts at 0.
# force=True overwrites an earlier registration, so the cell can be re-run.
Doc.set_extension('wordcount', default=0, force=True)

In [24]:
def wordcount(doc):
    '''Pipeline component: record how many alphabetic tokens the doc has.

    The total is stored in ``doc._.wordcount`` and printed; the doc itself
    is returned so the next component can continue with it.
    '''
    total = sum(1 for token in doc if token.is_alpha)
    doc._.wordcount = total
    print(f'{total} overall words')
    return doc

In [25]:
# Register the word counter as a pipeline component and show the resulting order.
nlp.add_pipe(wordcount)
nlp.pipe_names

['tagger', 'ner', 'wordcount']

### Filter

In [26]:
# Custom Token flag (token._.is_excluded): True once a component has ruled the token out.
# Every token starts as included (default False).
Token.set_extension('is_excluded', default=False, force=True)

In [27]:
def filter_tokens(doc):
    '''Pipeline component: mark every token that should not count as vocabulary.

    A token gets ``token._.is_excluded = True`` when at least one of these
    holds:
      * it is not purely alphabetic, or it is a stop word;
      * its part of speech is not NOUN, VERB, ADJ or ADV;
      * its text equals one of the words of a named entity in the doc.

    Tokens that pass all checks keep whatever flag they already have.

    Args:
        doc: the document being processed.

    Returns:
        The same doc, with the flags updated in place.
    '''
    # Build the lookup once as a set: membership is tested for every token,
    # and scanning a list each time would cost O(tokens * entity words).
    entity_words = set(get_entities(doc))
    # The original author was unsure whether ADV belongs in this list.
    content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}

    for token in doc:
        if (
            not token.is_alpha
            or token.is_stop
            or token.pos_ not in content_pos
            or token.text in entity_words
        ):
            token._.is_excluded = True

    return doc

In [28]:
def count_words(doc):
    '''Pipeline component: print how many tokens are still included.

    Counts the tokens whose ``_.is_excluded`` flag is not set and returns
    the doc without modifying it.
    '''
    included = sum(1 for token in doc if not token._.is_excluded)
    print(f'{included} words included')
    return doc

In [29]:
# Add the filter first and the counter after it, so the printed count reflects the filtering.
nlp.add_pipe(filter_tokens)
nlp.add_pipe(count_words)
nlp.pipe_names

['tagger', 'ner', 'wordcount', 'filter_tokens', 'count_words']

### Eliminating Duplicates

In [30]:
# Custom Token attribute (token._.appearance): how often the token's lemma occurs among
# the included tokens. Stays NaN for tokens that are never assigned a count.
Token.set_extension('appearance', default=np.nan, force=True)

In [31]:
def elim_dup(doc):
    '''Pipeline component: keep one token per lemma and record its count.

    Among the included tokens, the first occurrence of each lemma stays
    included and every later occurrence is marked as excluded. Each token
    that remains included then gets the total number of occurrences of its
    lemma in ``token._.appearance``.
    '''
    occurrences = {}

    # First pass: tally lemmas and drop every repeat.
    for token in doc:
        if token._.is_excluded:
            continue
        lemma = token.lemma_
        is_repeat = lemma in occurrences
        occurrences[lemma] = occurrences.get(lemma, 0) + 1
        if is_repeat:
            token._.is_excluded = True

    # Second pass: the tallies are complete now, so attach them to the survivors.
    for token in doc:
        if token._.is_excluded:
            continue
        token._.appearance = occurrences[token.lemma_]

    return doc

In [32]:
def count_voc(doc):
    '''Pipeline component: print how many tokens are still included.

    Intended to run after duplicate elimination, so the number printed is
    the size of the remaining vocabulary. Returns the doc unmodified.
    '''
    remaining = sum(1 for token in doc if not token._.is_excluded)
    print(f'{remaining} words without duplicate included')
    return doc

In [33]:
# De-duplicate first, then print the size of the remaining vocabulary.
nlp.add_pipe(elim_dup)
nlp.add_pipe(count_voc)
nlp.pipe_names

['tagger',
 'ner',
 'wordcount',
 'filter_tokens',
 'count_words',
 'elim_dup',
 'count_voc']

### Relative Frequency

In [34]:
# Custom Token attribute (token._.relativ_freq) for the relative-frequency score.
# Stays NaN for tokens that are never assigned a score.
Token.set_extension('relativ_freq', default=np.nan, force=True)

In [35]:
def calculate_relativ_freq(doc):
    '''Pipeline component: score how over-represented each included word is.

    For every included token the in-text frequency (appearance count, capped
    at 20, divided by ``doc._.wordcount``) is squared and divided by the
    general English frequency of the lemma as returned by
    ``word_frequency``. The score is stored in ``token._.relativ_freq``.
    Tokens whose lemma has a general frequency of 0 are left untouched.
    '''
    appearance_cap = 20  # counts above this are treated as this value

    for token in doc:
        if token._.is_excluded:
            continue

        capped_count = min(appearance_cap, token._.appearance)
        text_freq = capped_count / doc._.wordcount
        overall_freq = word_frequency(token.lemma_, 'en')

        # Skip unknown words: dividing by a zero frequency is not possible.
        if overall_freq == 0:
            continue
        token._.relativ_freq = text_freq**2 / overall_freq

    return doc

In [36]:
# Added after elim_dup and wordcount because it reads token._.appearance and doc._.wordcount.
nlp.add_pipe(calculate_relativ_freq)
nlp.pipe_names

['tagger',
 'ner',
 'wordcount',
 'filter_tokens',
 'count_words',
 'elim_dup',
 'count_voc',
 'calculate_relativ_freq']

### Rating Difficulty

In [37]:
# Custom Token attribute (token._.difficulty) for the difficulty rating; 0 until a rating is assigned.
Token.set_extension('difficulty', default=0, force=True)

In [38]:
def get_difficulty(doc):
    '''Pipeline component: rate how hard each included word is.

    The rating is ``8 - zipf_frequency(lemma, 'en')`` rounded to two
    decimals, so rarer words get higher values; a lemma with a Zipf
    frequency of 0 is rated 8. The rating is stored in
    ``token._.difficulty``. Excluded tokens keep their current value.

    Args:
        doc: the document being processed.

    Returns:
        The same doc, with the ratings set in place.
    '''
    # NOTE: an earlier draft also looked up the syllable count here, but the
    # adjustment it fed was a no-op (it subtracted 0), so the rating depends
    # on the Zipf frequency alone and the lookup has been dropped.
    for token in doc:
        if token._.is_excluded:
            continue
        readability = zipf_frequency(token.lemma_, 'en')
        token._.difficulty = round(8 - readability, 2)

    return doc

In [39]:
# Last component: rates the tokens that are still included after all filters.
nlp.add_pipe(get_difficulty)
nlp.pipe_names

['tagger',
 'ner',
 'wordcount',
 'filter_tokens',
 'count_words',
 'elim_dup',
 'count_voc',
 'calculate_relativ_freq',
 'get_difficulty']

# Test Processing

### Test sentence

In [40]:
# Run the full pipeline on a short sample; the counting components print their totals as a side effect.
doc = nlp("Hi I'm a little little boy. Get me a piece of cake or I'll killed your mother. I am the mother of your mother")

24 overall words
9 words included
6 words without duplicate included


In [42]:
# List each named entity with its character offsets and label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [30]:
# Collect one row per token that is still included after the pipeline ran.
data = []
for token in doc:
    if not token._.is_excluded:
        data.append((token, token.lemma_, token._.appearance, token._.difficulty, token._.relativ_freq))
# The 'relativ freqency' label is spelled this way on purpose: the sort below refers to it by this name.
df = pd.DataFrame(data, columns=['token', 'lemma', 'appearance', 'difficulty', 'relativ freqency'])
df.head(100)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency
0,little,little,2,2.25,12.356663
1,boy,boy,1,2.8,10.988045
2,piece,piece,1,3.03,18.607836
3,cake,cake,1,3.57,64.539447
4,killed,kill,1,2.91,14.114724
5,mother,mother,3,2.72,81.806283


In [31]:
# Highest difficulty rating first.
df.sort_values(by=['difficulty'], ascending=False)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency
3,cake,cake,1,3.57,64.539447
2,piece,piece,1,3.03,18.607836
4,killed,kill,1,2.91,14.114724
1,boy,boy,1,2.8,10.988045
5,mother,mother,3,2.72,81.806283
0,little,little,2,2.25,12.356663


In [32]:
# Highest relative-frequency score first.
df.sort_values(by=['relativ freqency'], ascending=False)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency
5,mother,mother,3,2.72,81.806283
3,cake,cake,1,3.57,64.539447
2,piece,piece,1,3.03,18.607836
4,killed,kill,1,2.91,14.114724
0,little,little,2,2.25,12.356663
1,boy,boy,1,2.8,10.988045


________________________________

### Kafka

In [33]:
# Run the whole pipeline over the complete text file.
# NOTE(review): open() is called without an encoding, so the platform default applies —
# confirm it matches the encoding of the file.
with open(text_file, "r") as f:
    doc = nlp(f.read())

25062 overall words
8282 words included
2029 words without duplicate included


In [34]:
# Collect one row per token that is still included after the pipeline ran.
data = []
for token in doc:
    if not token._.is_excluded:
        data.append((token, token.lemma_, token._.appearance, token._.difficulty, token._.relativ_freq))
# The 'relativ freqency' label is spelled this way on purpose: a later sort refers to it by this name.
df = pd.DataFrame(data, columns=['token', 'lemma', 'appearance', 'difficulty', 'relativ freqency'])
df

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency
0,Translated,translate,3,4.03,0.001536
1,use,use,21,2.17,0.000942
2,cost,cost,6,2.79,0.000354
3,restrictions,restriction,2,4.18,0.000963
4,whatsoever,whatsoever,3,4.01,0.001467
...,...,...,...,...,...
2024,PG,pg,1,4.25,0.000283
2025,search,search,1,2.98,0.000015
2026,facility,facility,1,3.46,0.000046
2027,subscribe,subscribe,1,4.11,0.000205


In [35]:
# The 20 rows with the highest difficulty rating.
df.sort_values(by=['difficulty'], ascending=False).head(20)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency
716,soughing,soughing,1,8.0,
1754,excusal,excusal,1,8.0,
440,swinging,swinge,1,8.0,
24,METAMORPHOSIS,metamorphosi,2,8.0,
1836,PGLAF,pglaf,1,8.0,
849,outstretched,outstretche,2,8.0,
1999,outdated,outdate,1,8.0,
723,dishevelled,dishevel,1,8.0,
1478,teaboy,teaboy,1,8.0,
542,fretsaw,fretsaw,2,8.0,


In [36]:
# The 30 rows with the highest relative-frequency score.
df.sort_values(by=['relativ freqency'], ascending=False).head(30)

Unnamed: 0,token,lemma,appearance,difficulty,relativ freqency
1455,charwoman,charwoman,9,6.39,3.16854
735,leant,leant,8,5.61,0.415894
464,swang,swang,2,6.42,0.167589
279,hurriedly,hurriedly,7,5.29,0.152071
1955,unenforceability,unenforceability,1,6.92,0.132674
931,palely,palely,1,6.87,0.117933
1051,unselfconsciously,unselfconsciously,1,6.77,0.093653
1290,denuded,denude,1,6.77,0.093653
268,workshy,workshy,1,6.76,0.0915
962,crawled,crawl,17,4.28,0.087641


### Wordclusters

In [210]:
from gensim.models import Word2Vec

In [249]:
# Small sample that mixes two topics (vehicles and food) to try word clustering on.
doc = nlp("I love cars, which drive very fast and loud. Trucks and vans are the best. My mother is all in to cooking good meals, foods and healthy dishes. This keeps me healthy while. I love the pasta and pies she bakes")

41 overall words
21 words included
18 words without duplicate included


In [250]:
# Lemmas of the included tokens, wrapped in an outer list so the result is a list of token lists
# (presumably the corpus shape Word2Vec expects — TODO confirm).
words = [[token.lemma_ for token in doc if not token._.is_excluded]]
print(words)

[['love', 'car', 'drive', 'fast', 'loud', 'truck', 'van', 'good', 'mother', 'cook', 'meal', 'food', 'healthy', 'dish', 'keep', 'pasta', 'pie', 'bake']]
