In [22]:
import nltk # language processing
import math
from nltk.tokenize import sent_tokenize, word_tokenize

# Read the whole article into `string`. The `with` block closes the file when
# it exits, so no explicit close() call is needed inside it.
with open("/home/retkowski/data/bbc/business/Great Western rail modernisation costs rocket, says NAO.txt") as f:
    string = f.read()

In [23]:
# Tokenise the article once and derive both counts from the same token list.
doc_tokens = word_tokenize(string)
# Distinct token strings (each spelling counted once).
print("Number of Tokens: ", len(set(doc_tokens)))
# All tokens, repeats included.
print("Number of Occurences: ", len(doc_tokens))

Number of Tokens:  342
Number of Occurences:  782


In [24]:
def ttr(text):
    """Return the type-token ratio of ``text``.

    The ratio is the number of distinct tokens divided by the total number of
    tokens, where tokens are produced by ``word_tokenize``.

    Args:
        text: The string to analyse.

    Returns:
        The ratio as a float.

    Raises:
        ZeroDivisionError: If ``text`` yields no tokens.
    """
    # Tokenise the argument itself (not the notebook-level document), once.
    tokens = word_tokenize(text)
    return len(set(tokens)) / len(tokens)

def aq(text):
    """Return the action quotient of ``text``: verbs per adjective.

    Tokens are produced by ``word_tokenize`` and tagged with ``nltk.pos_tag``.
    Tokens tagged JJ/JJR/JJS count as adjectives; tokens tagged
    VB/VBD/VBG/VBN/VBP/VBZ count as verbs.

    Args:
        text: The string to analyse.

    Returns:
        Number of verbs divided by number of adjectives, as a float.

    Raises:
        ZeroDivisionError: If ``text`` contains no adjective-tagged token.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    # Tag the argument itself rather than the notebook-level document.
    tagged = nltk.pos_tag(word_tokenize(text))

    # Only the counts matter, so no intermediate word lists are built.
    adjective_count = sum(1 for _, tag in tagged if tag in adjective_tags)
    verb_count = sum(1 for _, tag in tagged if tag in verb_tags)
    return verb_count / adjective_count

def naq(text):
    """Return the normalised action quotient of ``text``.

    This is the share of verbs among all verbs and adjectives. Tokens are
    produced by ``word_tokenize`` and tagged with ``nltk.pos_tag``; tags
    JJ/JJR/JJS count as adjectives and VB/VBD/VBG/VBN/VBP/VBZ as verbs.

    Args:
        text: The string to analyse.

    Returns:
        ``verbs / (adjectives + verbs)`` as a float.

    Raises:
        ZeroDivisionError: If ``text`` contains neither verbs nor adjectives.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    # Tag the argument itself rather than the notebook-level document.
    tagged = nltk.pos_tag(word_tokenize(text))

    adjective_count = sum(1 for _, tag in tagged if tag in adjective_tags)
    verb_count = sum(1 for _, tag in tagged if tag in verb_tags)
    return verb_count / (adjective_count + verb_count)

def hl(text):
    """Return the hapax legomenon index of ``text``.

    The index is the number of tokens that occur exactly once (as reported by
    ``nltk.FreqDist.hapaxes``) divided by the total number of tokens.

    Args:
        text: The string to analyse.

    Returns:
        The ratio as a float.

    Raises:
        ZeroDivisionError: If ``text`` yields no tokens.
    """
    # Tokenise the argument itself rather than the notebook-level document.
    words = word_tokenize(text)
    hapaxes = nltk.FreqDist(words).hapaxes()
    return len(hapaxes) / len(words)

def koi(text, n):
    """Return the concentration index of ``text``.

    The index is the share of all tokens that is covered by the ``n`` most
    frequent tokens.

    Args:
        text: The string to analyse; tokenised with ``word_tokenize``.
        n: How many of the most frequent tokens to take into account.

    Returns:
        Summed count of the ``n`` most common tokens divided by the total
        number of tokens, as a float.
    """
    tokens = word_tokenize(text)
    top_counts = [count for _token, count in nltk.FreqDist(tokens).most_common(n)]
    return sum(top_counts) / len(tokens)

def nkoi(text, n, m):
    """Return the normalised concentration index of ``text``.

    The token list is cut into consecutive windows of ``m`` tokens; the
    concentration index of each full window is computed with ``nkoi_i`` and
    the results are averaged. Tokens left over after the last full window are
    ignored.

    Args:
        text: The string to analyse; tokenised with ``word_tokenize``.
        n: How many of the most frequent tokens to consider per window.
        m: Window size in tokens.

    Returns:
        The mean per-window concentration index as a float.

    Raises:
        ZeroDivisionError: If ``m`` is 0 or ``text`` has fewer than ``m``
            tokens (no full window exists).
    """
    words = word_tokenize(text)
    window_count = math.floor(len(words) / m)
    total = 0
    for i in range(window_count):
        # Slice by the requested window size, not a fixed 100 tokens.
        total += nkoi_i(words[i * m:(i + 1) * m], n)
    return total / window_count
        
def nkoi_i(words, n):
    """Return the concentration index of an already tokenised window.

    Args:
        words: Sequence of tokens.
        n: How many of the most frequent tokens to take into account.

    Returns:
        Summed count of the ``n`` most common tokens divided by
        ``len(words)``, as a float.
    """
    counts = nltk.FreqDist(words)
    covered = sum(freq for _word, freq in counts.most_common(n))
    return covered / len(words)

# Quantitative Kennzahlen aus der Linguistik

In [25]:
# Print every metric for the loaded article. The labels are German; a trailing
# "\n" argument adds a blank line after a group of related metrics.
# Type-token ratio.
print("Type-Token-Relation: ", ttr(string), "\n")
# Action quotient (verbs per adjective) and its normalised variant.
print("Aktionsquotient: ", aq(string))
print("Aktionsquotient (normalisiert): ", naq(string), "\n")
# Hapax legomenon index (share of tokens that occur exactly once).
print("Einmaligkeitsindex (Hapax legomenon): ", hl(string), "\n")
# Concentration index over the 10 most common tokens, then the normalised
# variant called with n=10 and m=100.
print("Konzentrationsindex: ", koi(string, 10))
print("Konzentrationsindex (normalisiert): ", nkoi(string, 10, 100))

Type-Token-Relation:  0.4373401534526854 

Aktionsquotient:  2.180327868852459
Aktionsquotient (normalisiert):  0.6855670103092784 

Einmaligkeitsindex (Hapax legomenon):  0.289002557544757 

Konzentrationsindex:  0.2659846547314578
Konzentrationsindex (normalisiert):  0.33142857142857146


# Name Corpus

In [26]:
from nltk.corpus import names
words = word_tokenize(string)
# Build the lookup set once: calling names.words() inside the comprehension
# would re-evaluate it, and scan the result linearly, for every token.
known_names = set(names.words())
# Every token of the article that appears in the names corpus, in document
# order and with repeats kept.
namesInText = [word for word in words if word in known_names]
namesInText

['Morse', 'Richard', 'Morse', 'Meg']

# Named Entity (NE) – Chunking

In [27]:
def get_human_names(text, known_names=None):
    """Return the distinct PERSON entities of ``text`` that contain a known name.

    The text is tokenised, POS-tagged and chunked with ``nltk.ne_chunk``. A
    chunk labelled PERSON is kept only if at least one of its tokens is in
    ``known_names``; its tokens are then joined with single spaces.

    Args:
        text: The string to analyse.
        known_names: Optional collection of name tokens used to validate a
            PERSON chunk. When omitted, the NLTK ``names`` corpus is used, so
            the function does not depend on the notebook-level
            ``namesInText`` list having been computed for the same text.

    Returns:
        A list of name strings in order of first appearance, without
        duplicates.
    """
    if known_names is None:
        # Imported locally so the function works regardless of which other
        # notebook cells have been run.
        from nltk.corpus import names
        known_names = set(names.words())

    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary=False)

    person_list = []
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
        # Each leaf is a (token, tag) pair; only the token is needed.
        parts = [leaf[0] for leaf in subtree.leaves()]
        if not any(part in known_names for part in parts):
            continue
        full_name = ' '.join(parts)
        if full_name not in person_list:
            person_list.append(full_name)

    return person_list

# Display the person names found in the loaded article.
get_human_names(string)

['Amyas Morse', 'Richard Westcott', 'Meg Hillier MP']

In [28]:
def get_entities(text,label):
    """Return the distinct named-entity chunks of ``text`` carrying ``label``.

    The text is tokenised, POS-tagged and chunked with ``nltk.ne_chunk``; the
    tokens of every chunk whose label equals ``label`` are joined with single
    spaces.

    Args:
        text: The string to analyse.
        label: Chunk label to select, compared with ``==`` against each
            subtree's ``label()``.

    Returns:
        A list of entity strings in order of first appearance, without
        duplicates.
    """
    tagged = nltk.pos_tag(nltk.tokenize.word_tokenize(text))
    tree = nltk.ne_chunk(tagged, binary=False)

    found = []
    for chunk in tree.subtrees(filter=lambda node: node.label() == label):
        # Each leaf is indexable; element 0 is the token text.
        phrase = ' '.join(leaf[0] for leaf in chunk.leaves())
        if phrase not in found:
            found.append(phrase)
    return found

In [29]:
# Display the distinct chunks labelled ORGANIZATION in the loaded article.
get_entities(string,"ORGANIZATION")

['Great Western',
 'National Audit',
 'NAO',
 'Department',
 'Paddington',
 'Maidenhead',
 'FirstGroup',
 'Transport']

In [30]:
# Display the distinct chunks labelled GPE in the loaded article.
get_entities(string,"GPE")

['Image', 'England', 'London', 'South Wales', 'Scotland', 'Cardiff']