### 1. Import data

In [252]:
import akl
import math
import operator
from pprint import pprint
from collections import defaultdict

In [253]:
# Keep only the AKL headwords. The first run rebinds the name `akl` from the
# imported module to a plain list, so the guard keeps this cell re-runnable.
if not isinstance(akl, list):
    akl = list(akl.akl.keys())

# One entry per line of prons.txt; the context manager closes the file promptly.
with open('prons.txt') as prons_file:
    PRONS = {line.strip('\n') for line in prons_file}

# Only the FIRST line of HiFreWords is read; it is split on tabs.
# The trailing newline is removed first so the last word on the line is stored
# as 'word' rather than 'word\n' (which could never match a sentence token).
with open('HiFreWords') as f:
    HiFreWords = set(f.readline().rstrip('\n').split('\t'))

### 2. Data preprocessing

In [254]:
def create_sentence_pattern_list(input_pat):
    """Split a flat sequence of lines into groups separated by empty strings.

    Every '' item in ``input_pat`` closes the current group and opens a new
    one; all other items are appended to the current group in order.

    Returns a list of lists. The result always contains at least one group,
    and groups may be empty (e.g. for consecutive '' items or empty input).
    """
    groups = [[]]
    for line in input_pat:
        if line == '':
            # Separator: start collecting a fresh group.
            groups.append([])
        else:
            groups[-1].append(line)
    return groups

In [255]:
# Corpus: read the whole file, drop leading/trailing newlines, split into lines,
# then group the lines into blocks separated by blank lines.
# The context manager makes sure the file handle is closed after reading.
with open('corpus_adj_n3.txt', 'r') as corpus_file:
    corpus = create_sentence_pattern_list(corpus_file.read().strip('\n').split('\n'))

### 3. Extract patterns from corpus

In [302]:
def build_pattern_dict(corpus_groups=None):
    """Index extracted patterns and their source sentences by term and grammar.

    Each group is a list whose first item is a sentence and whose remaining
    items are tab-separated ``term<TAB>grammar<TAB>pattern`` records.

    Args:
        corpus_groups: iterable of such groups. Defaults to the module-level
            ``corpus`` when omitted, which preserves the original call form.

    Returns:
        A ``(pattern_dict, example_sentences)`` tuple of nested defaultdicts:
        ``pattern_dict[term][grammar]`` is the list of pattern strings and
        ``example_sentences[term][grammar]`` is the parallel list of the
        whitespace-tokenised source sentence for each pattern.

    Raises:
        ValueError: if a record does not split into exactly three tab fields.
    """
    if corpus_groups is None:
        corpus_groups = corpus

    pattern_dict = defaultdict(lambda: defaultdict(list))
    example_sentences = defaultdict(lambda: defaultdict(list))
    for group in corpus_groups:
        # Consecutive blank lines (or an empty corpus) yield empty groups;
        # they carry no sentence, so skip them instead of failing on group[0].
        if not group:
            continue
        sent = group[0]
        for record in group[1:]:
            term, grammar, pattern = record.split('\t')
            pattern_dict[term][grammar].append(pattern)
            # Split per record so every entry owns its own token list.
            example_sentences[term][grammar].append(sent.split())

    return pattern_dict, example_sentences

In [303]:
# Build the term -> grammar -> patterns index and the parallel index of example sentences.
pattern_dict, example_sentences = build_pattern_dict()

### 4. Check extracted patterns

In [291]:
def check_pattern_dict(patterns=None):
    """Print extracted pattern counts next to hand-labelled reference counts.

    For a fixed set of terms and grammars, prints one row per grammar in the
    form ``<grammar> <tabs>(pd:<extracted count>, label:<reference count>)``,
    grouped under a ``<TERM> -<POS>`` header with a blank line between groups.

    Args:
        patterns: mapping ``term -> grammar -> list of patterns``. Defaults to
            the module-level ``pattern_dict`` when omitted.

    Lookups use ``.get`` so that probing a missing term or grammar does not
    insert empty entries into a defaultdict-based index.
    """
    if patterns is None:
        patterns = pattern_dict

    # (term, part of speech, [(grammar, reference count), ...])
    reference = [
        ('ABILITY', 'N', [('N to v', 468)]),
        ('VALUE', 'N', [('N to v', 16)]),
        ('DISCUSS', 'V', [('V in n', 47), ('V n', 270), ('V wh to v', 15)]),
        ('FAVOUR', 'V', [('V n', 26), ('V by n', 5)]),
        ('CLASSIFY', 'V', [('V into n', 8), ('V as n', 12)]),
        ('USEFUL', 'ADJ', [('ADJ to v', 30), ('ADJ for n', 20)]),
        ('CERTAIN', 'ADJ', [('ADJ of n', 23)]),
    ]

    for index, (term, pos, rows) in enumerate(reference):
        if index:
            # Blank line separating this group from the previous one.
            print()
        print('%s -%s' % (term, pos))
        grammars = patterns.get(term, {})
        for grammar, expected in rows:
            # Short grammar names need two tabs to line up with the longer ones.
            tabs = '\t\t' if len(grammar) + 1 < 8 else '\t'
            found = len(grammars.get(grammar, ()))
            print('%s %s(pd:%3d, label:%3d)' % (grammar, tabs, found, expected))

In [292]:
# Print the extracted counts ("pd") beside the reference counts ("label") for the sample terms.
check_pattern_dict()

ABILITY -N
N to v 		(pd:468, label:468)

VALUE -N
N to v 		(pd: 16, label: 16)

DISCUSS -V
V in n 		(pd: 57, label: 47)
V n 		(pd:270, label:270)
V wh to v 	(pd: 15, label: 15)

FAVOUR -V
V n 		(pd: 26, label: 26)
V by n 		(pd:  5, label:  5)

CLASSIFY -V
V into n 	(pd:  8, label:  8)
V as n 		(pd: 12, label: 12)

USEFUL -ADJ
ADJ to v 	(pd: 30, label: 30)
ADJ for n 	(pd: 20, label: 20)

CERTAIN -ADJ
ADJ of n 	(pd: 23, label: 23)


### 5. Testing

In [293]:
def computeScore(word, sent, prons=None, hi_freq_words=None):
    """Score a candidate example string for a headword (higher is better).

    Both ``word`` and ``sent`` are lower-cased and ``sent`` is split on
    whitespace. The score is::

        position of word in sent (or -1 if absent)
        - number of tokens NOT in the high-frequency word set
        - number of tokens that are in the pronoun set

    Args:
        word: the headword to locate.
        sent: the candidate string.
        prons: optional pronoun collection; defaults to the module-level PRONS.
        hi_freq_words: optional high-frequency word collection; defaults to
            the module-level HiFreWords.

    Returns:
        The integer score.
    """
    if prons is None:
        prons = PRONS
    if hi_freq_words is None:
        hi_freq_words = HiFreWords

    target = word.lower()
    tokens = sent.lower().split()

    # Index of the first occurrence of the headword, -1 when it is missing.
    location = tokens.index(target) if target in tokens else -1
    rare_word_count = sum(1 for token in tokens if token not in hi_freq_words)
    pronoun_count = sum(1 for token in tokens if token in prons)

    return location - rare_word_count - pronoun_count

In [348]:
def get_best_pattern(word):
    """Print the strong grammars of ``word`` with the best-scoring example each.

    The word is upper-cased and printed. Then, over every example sentence
    recorded for the word, the mean and (population) standard deviation of the
    sentence length in tokens are computed. A grammar is reported when
    ``(number of patterns for the grammar - mean) / stddev`` exceeds ``k0``;
    for each reported grammar the pattern with the highest ``computeScore``
    is printed as ``<grammar> (<count>) <pattern>``.

    Prints ``NO RESULT`` and returns early when the word has no patterns or
    when the standard deviation is zero. Always returns None.

    NOTE(review): the strength compares a grammar's pattern COUNT against the
    mean/stddev of sentence LENGTHS. The original author's note says the
    statistics are taken over sentence lengths, so this is kept as-is, but it
    looks unusual - confirm that this is the intended formula.
    """
    k0 = 1  # a grammar's strength must exceed this threshold to be reported

    word = word.upper()
    print(word)

    # .get() avoids inserting an empty entry into the defaultdicts when the
    # word is unknown.
    grammars = pattern_dict.get(word, {})
    examples = example_sentences.get(word, {})

    # Total number of extracted patterns across every grammar of this word.
    N = sum(len(patterns) for patterns in grammars.values())
    if N == 0:
        # Unknown word: previously this fell through to a division by zero.
        print('NO RESULT\n')
        return

    # Mean and standard deviation of the example sentence lengths.
    lengths = [len(tokens)
               for grammar in grammars
               for tokens in examples.get(grammar, [])]
    avg = sum(lengths) / N
    stddev = math.sqrt(sum((length - avg) ** 2 for length in lengths) / N)

    if stddev == 0:
        print('NO RESULT\n')
        return

    # Keep only the grammars whose strength passes the threshold.
    for grammar, patterns in grammars.items():
        freqi = len(patterns)
        strength = (freqi - avg) / stddev
        if not strength > k0:
            continue

        # Pick the best example among THIS grammar's patterns only, so that a
        # high score from an earlier grammar cannot leak into this one.
        best_score = float('-inf')
        best_sentence = ''
        for pattern in patterns:
            score = computeScore(word, pattern)
            if score > best_score:
                best_score = score
                best_sentence = pattern

        print('%s (%d) %s' % (grammar, freqi, best_sentence))
    print()

### 6. Run test cases

In [349]:
def test_case():
    """Run get_best_pattern on the fixed list of sample headwords, in order."""
    sample_words = (
        'ability',
        'value',
        'discuss',
        'favour',
        'classify',
        'useful',
        'certain',
    )
    for headword in sample_words:
        get_best_pattern(headword)

In [350]:
# Print the selected grammars and best example for each sample headword.
test_case()

ABILITY
N to v (468) its bulk and ability to fly

VALUE

DISCUSS
V in n (57) may discuss in person
V n (270) concerned may have and discuss them

FAVOUR

CLASSIFY

USEFUL

CERTAIN
ADJ that (50) much less certain that

