<a href="https://colab.research.google.com/github/ShinAsakawa/2019cnps/blob/master/notebooks/2019cnps_arpabet_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ARPABET の活用

もっとも簡単には `nltk` パッケージを活用して以下のようになります。

In [0]:
import nltk  # NLTK ships the reader for the CMU Pronouncing Dictionary

# Fetch the CMU dictionary; on a cloud runtime this must be repeated for
# every fresh session.
nltk.download('cmudict')

# Build the lookup table from the downloaded corpus.
cmu = nltk.corpus.cmudict.dict()

# Look up how 'apple' is pronounced.
apple_pron = cmu['apple']
print(apple_pron)

In [0]:
# Keep working with the cmu dictionary built earlier.
sentence = [token.lower() for token in 'Neural networks are awesome'.split()]
print(sentence)  # show the input sentence first
for token in sentence:
    # one line per word: the word followed by what the dictionary stores for it
    print(token, ':--->', cmu[token])

In [0]:
# 以下は `nltk` からの蘊蓄です。

# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
ftp://ftp.cs.cmu.edu/project/speech/dict/
Copyright 1998 Carnegie Mellon University

File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription.  Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries.  Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations.  Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

Phoneme Example Translation    Phoneme Example Translation
------- ------- -----------    ------- ------- -----------
AA      odd     AA D           AE      at      AE T
AH      hut     HH AH T        AO      ought   AO T
AW      cow     K AW           AY      hide    HH AY D
B       be      B IY           CH      cheese  CH IY Z
D       dee     D IY           DH      thee    DH IY
EH      Ed      EH D           ER      hurt    HH ER T
EY      ate     EY T           F       fee     F IY
G       green   G R IY N       HH      he      HH IY
IH      it      IH T           IY      eat     IY T
JH      gee     JH IY          K       key     K IY
L       lee     L IY           M       me      M IY
N       knee    N IY           NG      ping    P IH NG
OW      oat     OW T           OY      toy     T OY
P       pee     P IY           R       read    R IY D
S       sea     S IY           SH      she     SH IY
T       tea     T IY           TH      theta   TH EY T AH
UH      hood    HH UH D        UW      two     T UW
V       vee     V IY           W       we      W IY
Y       yield   Y IY L D       Z       zee     Z IY
ZH      seizure S IY ZH ER
"""


もう少し精緻な処理をしてみます。

In [0]:
# source: http://compling.hss.ntu.edu.sg/courses/hg2051/code/wk5b.py
import nltk
from nltk.corpus import stopwords

nltk.download('gutenberg')
nltk.download('stopwords')

# Task: find the 50 most frequent bigrams in Jane Austen's Emma, then the
# 50 most frequent bigrams that do not include a stopword.
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

print("Find the 50 most frequent bigrams in emma")
print([b[0] for b in nltk.FreqDist(nltk.bigrams(emma)).most_common(50)])

print("Find the 50 most frequent bigrams in emma without stopwords")

stopen = stopwords.words('english')
# Set copy for O(1) membership tests; the list would otherwise be rescanned
# for every token of the novel.
stopset = frozenset(stopen)

# NLTK's English stopword list is lowercase, so tokens are lowercased before
# the lookup; otherwise capitalised stopwords such as 'The' or 'I' would slip
# through the filter.

# Variant 1: drop stopwords and non-alphabetic tokens first, then pair up
# whatever is left (so words separated by a removed token become neighbours).
print(nltk.FreqDist(nltk.bigrams(w for w in emma
                                 if w.isalpha()
                                 and w.lower() not in stopset)).most_common(50))

# Variant 2: build the bigrams over alphabetic tokens first, then discard
# every bigram in which either member is a stopword.
print("Find the 50 most frequent bigrams without stopwords in emma")
print(nltk.FreqDist((w1, w2)
                    for (w1, w2) in nltk.bigrams(w for w in emma
                                                 if w.isalpha())
                    if w1.lower() not in stopset
                    and w2.lower() not in stopset).most_common(50))

In [0]:
# Bare expression: when run as a notebook cell, the value stored in the cmu
# dictionary for 'awesome' is shown as the cell's result.
cmu['awesome']

In [0]:
###
### A Pronouncing Dictionary
###
# The dictionary as a flat sequence of (word, pronunciation) pairs.
pron = nltk.corpus.cmudict.entries()

print("First ten entries", pron[:10])

# Collect every pronunciation listed under 'python' and show it.
python_pron = [phones for entry, phones in pron if entry == 'python']
print("Python is pronounced", python_pron)

# Same lookup for 'marathon'.
marathon_pron = [phones for entry, phones in pron if entry == 'marathon']
print("Marathon is pronounced", marathon_pron)

# 続いて音素関係の処理と表示です

In [0]:
# For 'python' and 'marathon' in turn, list the dictionary entries whose
# final three phonemes equal those of the target's first pronunciation.
for word in ['python', 'marathon']:
    wp = [phones for entry, phones in pron if entry == word]
    print("{} is pronounced {}".format(word, wp))
    print("These words rhyme with it (assuming last three phonemes are ok):")
    # Ending of the first listed pronunciation; identical for every entry
    # compared below, so it is taken once.
    tail = wp[0][-3:]
    matches = [
        (entry, phones[-3:])
        for entry, phones in pron
        if len(phones) > 2 and tail == phones[-3:]
    ]
    print(matches)

## A better definition of "rhyme" is 
### "identical in pronunciation from the main-stressed vowel to the end",
### See: http://languagelog.ldc.upenn.edu/nll/?p=1946

# WordNet を使った類義語の処理と表示

In [0]:
nltk.download('wordnet')
nltk.download('omw')

### And of course: http://en.wiktionary.org/wiki/Rhymes:English

# ★ Exercise: write a function that converts ARPABET to IPA and use it to
# print the pronunciations of python and marathon.

# Make WordNet available under the short name `wn`.
from nltk.corpus import wordnet as wn
print("\nWordnet\n")

# Every synset listed for 'bird'.
print("Synsets for bird:")
bird_synsets = wn.synsets('bird')
print(bird_synsets)

# Number of senses.
print("# of senses for bird:", len(bird_synsets))

# Name, minimum depth in the hierarchy, and definition of each sense.
for synset in bird_synsets:
    print(synset.name(), synset.min_depth(), synset.definition())


print("\nWhich  languages have lemmas for 'bird.n.01'?")
bird = wn.synset('bird.n.01')
for lang in wn.langs():
    # NOTE: an earlier version skipped 'hrv' and 'bul' here because of a
    # reported bug with Croatian and Bulgarian; that skip is disabled.
    if not bird.lemmas(lang=lang):
        continue  # no lemmas for this language
    # language code, then its comma-separated lemma names
    print(lang, ",".join(bird.lemma_names(lang=lang)), sep=': ')

In [0]:
# For each synset of 'bird', print the synset followed by every lemma and its
# frequency (hint: the frequency of a lemma is given by lemma.count()).
bird_synsets = wn.synsets('bird')
for synset in bird_synsets:
    print(synset)
    for lemma in synset.lemmas():
        print(lemma.name(), lemma.count())
    print()

# Total frequency of each synset: the sum of its lemmas' counts.
for synset in bird_synsets:
    total = sum(lemma.count() for lemma in synset.lemmas())
    print(synset.name(), total, synset.definition())
    

# ★ Tabulate the average polysemy per word length for all words in wordnet, and then separately for each part of speech. (Hint: polysemy is number of synsets/word; you can get all words by [w for w in wn.all_lemma_names()]; for just nouns you can do: [w for w in wn.all_lemma_names('n')].)

In [0]:
def lastsyllable(pron):
    """Return the rhyming tail of a cmudict pronunciation.

    Vowels are recognised by their trailing stress digit (e.g. 'AA0', 'AY1').
    The phonemes are scanned from the end; the scan stops at the nearest
    vowel that is not the final phoneme, and everything from that vowel to
    the end is returned. If that vowel is the only vowel in the result and a
    phoneme precedes it, the preceding phoneme is included as well:

        ['P', 'AY1', 'TH', 'AA0', 'N'] -> ['TH', 'AA0', 'N']
        ['HH', 'AE1', 'M', 'ER0']      -> ['AE1', 'M', 'ER0']

    Args:
        pron: list of phoneme strings for one pronunciation.

    Returns:
        A list of phonemes. A single-phoneme pronunciation (e.g. 'I') is
        returned unchanged. When no vowel other than a final one exists
        (e.g. ['HH', 'IY1']), the whole pronunciation is returned as a new
        list instead of None.
    """
    if len(pron) == 1:
        ## 'I'
        return pron
    total = len(pron)
    end = []
    vowels_seen = 0
    # enumerate() gives the true position; looking the phoneme up by value
    # would find an identical earlier occurrence (e.g. the final vowel) and
    # wrongly skip this one.
    for pos, phoneme in enumerate(reversed(pron)):
        end.insert(0, phoneme)
        if not phoneme[-1].isdigit():
            continue  # consonant: keep scanning towards the front
        vowels_seen += 1
        if pos == 0:
            # word-final vowel: keep going back to the previous vowel
            continue
        ## add the preceding phoneme as well if there is one;
        ## don't do this if it is the second vowel ('Hammer')
        if len(end) < total and vowels_seen < 2:
            end.insert(0, pron[total - 1 - len(end)])
        return end
    # No stopping vowel found: the entire pronunciation is the tail.
    return end

def rhymes(word, pdict):
    """Print every entry of *pdict* whose pronunciation ends like *word*.

    For each pronunciation of *word*, a header line is printed with the word,
    the pronunciation and the tail used for matching (from lastsyllable()),
    followed by one line per other word and pronunciation ending in that
    tail, and then a blank line.

    Args:
        word: key to look up in *pdict*.
        pdict: mapping from word to a list of pronunciations, each a list of
            phoneme strings (e.g. the cmudict dictionary).

    Returns:
        None; the results are only printed.

    Raises:
        KeyError: if *word* is not a key of *pdict*.
    """
    for pron in pdict[word]:
        # lastsyllable() can yield nothing usable (None) when it finds no
        # vowel to stop at; fall back to the whole pronunciation so that the
        # len() below cannot fail. Computed once and reused for the header.
        end = lastsyllable(pron) or pron
        print("Rhymes for:", word, pron, end)
        tail_len = len(end)
        for w, prons in pdict.items():
            if w == word:
                continue  # a word does not count as its own rhyme
            for p in prons:
                if p[-tail_len:] == end:
                    print(w, p)
        print()

# Print the rhymes for each word of the example sentence, in order.
for demo_word in ('neural', 'networks', 'are', 'awesome'):
    rhymes(demo_word, cmu)

In [0]:
# Second example sentence, already lowercase, as a list of words.
sentence = 'cognitive neuro psychology is great as well'.split()
print(sentence)

In [0]:
# rhymes() prints its findings itself and returns nothing, so it is called
# directly; wrapping the call in print() would add a stray "None" line after
# every word.
for word in sentence:
    rhymes(word, cmu)