In [171]:
from __future__ import division
import nltk
import random
import re
import string
import operator
import time
from pprint import pprint
from collections import defaultdict, Counter
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import DictionaryProbDist as D
from nltk.classify import SklearnClassifier
from sklearn.linear_model import LogisticRegression

In [172]:
with open('stopwords.txt', 'r') as f:
    stopwords = [line.strip() for line in f]

In [173]:
lmtzr = WordNetLemmatizer()
def words(text):
    return re.findall(r'\w+', text.lower())

In [174]:
wordnet_data = [line.strip().split('\t') for line in open('wn.in.evp.cat.txt', 'r') if line.strip() != '']

In [175]:
def split_train_test(data):
    random.shuffle(data)
    split_point = len(data) * 9 // 10
    train_set, test_set = data[:split_point], data[split_point:]
    return train_set, test_set

In [176]:
train_set, test_set = split_train_test(wordnet_data)

In [177]:
def wordnet_features(sentence):
    sentence = sentence.replace('||', ' ').replace('; ', ' ')
    features = {}
    for word in sentence.lower().split():
        if word not in stopwords:
            if word not in features:
                features[word] = 1
            else:
                features[word] += 1
    return features

In [178]:
def feature_engineering(train_set):
    org_word = []
    label = []
    features = []
    candidates = []
    for train in train_set:
        c = []
        org_word += [''.join(train[:][0])]
        label += [''.join(train[:][1])]
        features += [wordnet_features(''.join(train[:][2]))]
        for candidate in eval(train[:][3]).values():
            c += [candidate]
        candidates += [c]
    return org_word, label, features, candidates

In [179]:
train_org_word, train_label, train_features, train_candidates = feature_engineering(train_set)
test_org_word, test_label, test_features, test_candidates = feature_engineering(test_set)

In [180]:
label_for_testing = ''.join(train_label[0])
features_for_testing = train_features[0]
candidates_for_testing = train_candidates[0]

In [53]:
print(candidates_for_testing)

['cognition.n.01', 'representation.n.02']


In [72]:
def sk_training_for_testing(train_features, train_label, test_features, test_label, test_candidates):
    print('== SkLearn MaxEnt ==')
    final_result = []
    train_set = []
    test_set = []
    for X, y in zip(train_features, train_label):
        train_set.append((X, y))
        
    for X, y in zip(test_features, test_label):
        test_set.append((X, y))
        
    sklearn_classifier = SklearnClassifier(LogisticRegression(C=10e5)).train(train_set[:10])
    
    for feature in test_features[:1]:
        prediction = sklearn_classifier.prob_classify(features_for_testing)._prob_dict
        pprint(sorted(prediction.items(), key=operator.itemgetter(1), reverse=True))
        for label, prob in sorted(prediction.items(), key=operator.itemgetter(1), reverse=True):
            if label in candidates_for_testing:
                final_result.append(label)
        
    return final_result

In [73]:
print("Testing: counterpart-n-1\nAnswer : cognition.n.01\n")
sk_training_for_testing(train_features, train_label, test_features, test_label, test_candidates)

Testing: counterpart-n-1
Answer : cognition.n.01

== SkLearn MaxEnt ==
[('cognition.n.01', 0.99995799601992952),
 ('lost.a.01', 6.5572203758608934e-06),
 ('fast.a.01', 6.1243954694721308e-06),
 ('constitute.v.01', 6.0927873943277612e-06),
 ('produce.v.02', 5.5419524422340959e-06),
 ('energetic.a.01', 5.309539213976842e-06),
 ('belief.n.01', 5.3024468754546519e-06),
 ('activity.n.01', 2.3634005482595415e-06),
 ('commerce.n.01', 2.3634005482595415e-06),
 ('stimulating.a.01', 2.3488372023547602e-06)]


['cognition.n.01']

In [169]:
def sk_training(train_features, train_label, test_features, test_label, test_candidates):
    print('== SkLearn MaxEnt ==')
    final_result = []
    train_set = []
    test_set = []
    correct = 0
    total = 0
    for X, y in zip(train_features, train_label):
        train_set.append((X, y))
        
    for X, y in zip(test_features, test_label):
        test_set.append((X, y))

    sklearn_classifier = SklearnClassifier(LogisticRegression(C=10e5)).train(train_set[:1000])
    
    for feature, candidate in zip(test_features[:5], test_candidates[:5]):
        prediction = sklearn_classifier.prob_classify(wordnet_features(''.join(feature)))._prob_dict
        sorted_pred = sorted(prediction.items(), key=operator.itemgetter(1), reverse=True)
        for label, prob in sorted_pred:
            if label in candidate:
                final_result.append(label)
                has_ans = True
                break
#         final_result += sklearn_classifier.prob_classify(wordnet_features(''.join(feature)))._prob_dict
        if not has_ans:
            final_result.append(sorted_pred[0][0])
        has_ans = False
        total += 1
        
    print('ANS:')
    print(test_label[:5])
    print('final_result:')
    print(final_result)
    for i in range(len(final_result)):
        if test_label[i] == final_result[i]:
            correct += 1
    
    print('correct = %d, total = %d' %(correct, total))
    print(nltk.classify.accuracy(sklearn_classifier, test_set[:5]))
    return final_result

In [170]:
final_r = sk_training(train_features, train_label, test_org_word, test_features, test_label, test_candidates)

== SkLearn MaxEnt ==
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3


IndexError: list index out of range

In [152]:
def sk_training_all(train_features, train_label, test_features, test_label, test_candidates):
    print('== SkLearn MaxEnt ==')
    final_result = []
    train_set = []
    test_set = []
    correct = 0
    total = 0
    for X, y in zip(train_features, train_label):
        train_set.append((X, y))
        
    for X, y in zip(test_features, test_label):
        test_set.append((X, y))

    sklearn_classifier = SklearnClassifier(LogisticRegression(C=10e5)).train(train_set)
    
    for feature, candidate in zip(test_features, test_candidates):
        prediction = sklearn_classifier.prob_classify(wordnet_features(''.join(feature)))._prob_dict
        sorted_pred = sorted(prediction.items(), key=operator.itemgetter(1), reverse=True)
        for label, prob in sorted_pred:
            if label in candidate:
                final_result.append(label)
                has_ans = True
                break
        if not has_ans:
            final_result.append(sorted_pred[0][0])
        has_ans = False
        total += 1
        
#     print('ANS:')
#     print(test_label)
#     print('final_result:')
#     print(final_result)
    for i in range(len(final_result)):
        if test_label[i] == final_result[i]:
            correct += 1
    
    print('correct = %d, total = %d' %(correct, total))
    print(nltk.classify.accuracy(sklearn_classifier, test_set))

    return final_result

In [153]:
start_time = time.time()
final_r_all = sk_training_all(train_features, train_label, test_features, test_label, test_candidates)
elapsed_time = time.time() - start_time

== SkLearn MaxEnt ==
correct = 825, total = 2320
0.49267241379310345


In [154]:
elapsed_time

476.90998911857605

In [181]:
%save lab05_word_sense_representation_and_disambiguation.py 171-180

The following commands were written to file `lab05_word_sense_representation_and_disambiguation.py`:
from __future__ import division
import nltk
import random
import re
import string
import operator
import time
from pprint import pprint
from collections import defaultdict, Counter
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import DictionaryProbDist as D
from nltk.classify import SklearnClassifier
from sklearn.linear_model import LogisticRegression
with open('stopwords.txt', 'r') as f:
    stopwords = [line.strip() for line in f]
lmtzr = WordNetLemmatizer()
def words(text):
    return re.findall(r'\w+', text.lower())
wordnet_data = [line.strip().split('\t') for line in open('wn.in.evp.cat.txt', 'r') if line.strip() != '']
def split_train_test(data):
    random.shuffle(data)
    split_point = len(data) * 9 // 10
    train_set, test_set = data[:split_point], data[split_point:]
    return train_set, test_set
train_set, test_set = split_train_test(wordnet_data)
de