In [6]:
import nltk
import numpy as np

## build score dictionary

In [7]:
f = open("./data/newsela/lemma.num")

word_scores = {}

for line in f:
    word = line.split()[2]
    rank = int(line.split()[0])
    
    # words may occur multiple times (for example "to" as preposition and infitive marker)
    # in these cases, keep only lowest score
    if not word in word_scores:
        word_scores[word] = np.log(rank)

f.close()

In [26]:
# use .get() to return default value if key not in dictionary
for w in ["the", "of", "it", "she", "far", "severe", "fiscal", "linen", "antidisestablishmentarianism"]:
    print("{}: {}".format(w, word_scores.get(w, 9.)))

the: 0.0
of: 1.0986122886681098
it: 2.1972245773362196
she: 3.332204510175204
far: 5.84354441703136
severe: 7.578656850594762
fiscal: 8.456806041401142
linen: 8.625329850020815
antidisestablishmentarianism: 9.0


## score newsela article

In [54]:
newsela_dir = "./data/newsela/42157-"
newsela_levels = ["560", "830", "920", "1020", "max"]

wnl = nltk.stem.WordNetLemmatizer()
punc = [".", ",", "!", "?", "(", ")", "``", "''"]

In [86]:
print("average word difficulty scores:")
for level in newsela_levels:
    f = open(newsela_dir + level + ".txt")
    score = 0
    num_words = 0
    unknown = 0
    
    for line in f:
        # tokenize and POS-tag every line in the file
        for w, t in nltk.pos_tag(nltk.word_tokenize(line)):
            # ignore punctuation
            if w not in punc:
                # extract POS for correct lemmatization
                tag = t[0].lower()
                tag = tag if tag in ["n", "v", "j", "r"] else None
                tag = "a" if tag == "j" else tag
                
                if not tag:
                    w = lem.lemmatize(w.lower())
                else:
                    w = lem.lemmatize(w.lower(), tag)
                
                # add word difficulty score to total
                num_words += 1
                if w not in word_scores:
                    unknown += 1
                score += word_scores.get(w, 9)
                
                #print unknown words:
                #if word_scores.get(w, 9.) == 9.:
                #    print("{:15} {:4} {} {:.4f}".format(w, t, " " if not tag else tag, word_scores.get(w, 9.)))
    
    # calculate average difficulty score and percentage of unknown words
    score = score / num_words
    unknown = unknown / num_words
    
    print("level {:>4}:  {:.4f} ({:.3f}% of words unknown)".format(level, score, unknown))
    
    f.close()

average word difficulty scores:
level  560:  4.7132 (0.124% of words unknown)
level  830:  4.8617 (0.139% of words unknown)
level  920:  4.8521 (0.130% of words unknown)
level 1020:  4.9233 (0.147% of words unknown)
level  max:  5.0566 (0.179% of words unknown)


### unknown words for ...560.txt:

In [None]:
's              POS    9.0000
deeper          NN   n 9.0000
darker          NN   n 9.0000
colder          VB   v 9.0000
others          NNS  n 9.0000
john            NNP  n 9.0000
spark           NNP  n 9.0000
american        NNP  n 9.0000
york            NNP  n 9.0000
glue            NNP  n 9.0000
jellyfish       NNP  n 9.0000
kakani          NNP  n 9.0000
katija          NNP  n 9.0000
monterey        NNP  n 9.0000
california      NNP  n 9.0000
jellyfish       NN   n 9.0000
katija          NNP  n 9.0000
tag             NNS  n 9.0000
tag             NNS  n 9.0000
tag             NNS  n 9.0000
jellyfish       NN   n 9.0000
tag             NNS  n 9.0000
tag             NNS  n 9.0000
tag             NNS  n 9.0000
tag             JJ   a 9.0000
katija          NNP  n 9.0000
others          NNS  n 9.0000
tag             NN   n 9.0000
katija          NNP  n 9.0000
others          NNS  n 9.0000
katija          NNP  n 9.0000
's              POS    9.0000
backpack        NN   n 9.0000
glue            NN   n 9.0000
hike            VBG  v 9.0000
glue            NN   n 9.0000
tag             NNS  n 9.0000
jellyfish       NN   n 9.0000
glue            NN   n 9.0000
tag             NN   n 9.0000
mini            NNP  n 9.0000
robot           NNP  n 9.0000
plankton        NNP  n 9.0000
plankton        NNP  n 9.0000
jules           NNS  n 9.0000
jaffe           NNP  n 9.0000
scripps         NNP  n 9.0000
oceanography    NNP  n 9.0000
san             NNP  n 9.0000
diego           NNP  n 9.0000
california      NNP  n 9.0000
plankton        JJ   a 9.0000
jaffe           NNP  n 9.0000
mini-underwater JJ   a 9.0000
robot           NNS  n 9.0000
plankton        NN   n 9.0000
two             CD     9.0000
jaffe           NNP  n 9.0000
two             CD     9.0000
jeremy          NNP  n 9.0000
goldbogen       NNP  n 9.0000
stanford        NNP  n 9.0000
's              POS    9.0000
hopkins         NNP  n 9.0000
california      NNP  n 9.0000
tag             NNS  n 9.0000
tag             NNS  n 9.0000
tag             NNS  n 9.0000
suction         NN   n 9.0000
tag             NNS  n 9.0000
underwater      JJ   a 9.0000

proper nouns (katija, california, monterey), wrongly tagged (deeper, darker as NN; colder as VB), number words, possessive 's 