In [1]:
from __future__ import division
import nltk
import random
import re
import string
import operator
import time
from pprint import pprint
from collections import defaultdict, Counter
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import DictionaryProbDist as D
from nltk.classify import SklearnClassifier
from sklearn.linear_model import LogisticRegression

### Read files

In [2]:
with open('stopwords.txt', 'r') as f:
    stopwords = [line.strip() for line in f]

In [3]:
lmtzr = WordNetLemmatizer()
def words(text):
    return re.findall(r'\w+', text.lower())

In [4]:
# documents
wordnet_data = [line.strip().split('\t') for line in open('wn.in.evp.cat.txt', 'r') if line.strip() != '']

In [5]:
# training = 90% / testing = 10%
def split_train_test(data):
    random.shuffle(data)
    split_point = len(data) * 9 // 10
    train_set, test_set = data[:split_point], data[split_point:]
    return train_set, test_set

In [6]:
train_set, test_set = split_train_test(wordnet_data)

### Features

In [7]:
def wordnet_features(sentence):
    sentence = sentence.replace('||', ' ').replace('; ', ' ')
    features = {}
    for word in sentence.lower().split():
        if word not in stopwords:
            features[word] = True
            if word not in features:
                features[word] = 1
            else:
                features[word] += 1
    return features

In [8]:
def feature_engineering(train_set):
    org_word = []
    label = []
    features = []
    candidates = []
    for train in train_set:
        c = []
        org_word += [''.join(train[:][0])]
        label += [''.join(train[:][1])]
        features += [wordnet_features(''.join(train[:][2]))]
        for candidate in eval(train[:][3]).values():
            c += [candidate]
        candidates += [c]
    return org_word, label, features, candidates

In [9]:
train_org_word, train_label, train_features, train_candidates = feature_engineering(train_set)
test_org_word, test_label, test_features, test_candidates = feature_engineering(test_set)

------------------------------

### Test all data with all training data

In [10]:
def sk_training_all(train_features, train_label, test_features, test_label, test_candidates):
    global sklearn_classifier_all
    print('== SkLearn MaxEnt ==')
    output_candidates = []
    test_set = []
    correct = 0
    N = len(test_label)
    
    for X, y in zip(test_features, test_label):
        test_set.append((X, y))
        
    for i in range(N):
        output_candidates.clear()
        
        feature = test_features[i]
        candidate = test_candidates[i]
        label = test_label[i]
        
        prediction = sklearn_classifier_all.prob_classify(feature)._prob_dict
        sorted_pred = sorted(prediction.items(), key=operator.itemgetter(1), reverse=True)
        
        for result, prob in sorted_pred:
            if result in candidate:
                output_candidates.append((result, prob))
                
        if not output_candidates: 
            continue
            
        top_output_candidate = sorted(output_candidates, key=lambda x: x[1], reverse=True)[0][0]
        
        if top_output_candidate == label:
            correct += 1

    print('hand acc = %.4f' % (correct / N))
    print('nltk acc = %.4f' % nltk.classify.accuracy(sklearn_classifier_all, test_set))

In [11]:
%%time
train_set = []
for X, y in zip(train_features, train_label):
    train_set.append((X, y))
    
sklearn_classifier_all = SklearnClassifier(LogisticRegression(C=10e5)).train(train_set)

CPU times: user 6min 56s, sys: 2.84 s, total: 6min 59s
Wall time: 7min 5s


In [12]:
sk_training_all(train_features, train_label, test_features, test_label, test_candidates)

== SkLearn MaxEnt ==
hand acc = 0.6599
nltk acc = 0.5043


In [13]:
%save lab05_word_sense_representation_and_disambiguation.py 10-12

The following commands were written to file `lab05_word_sense_representation_and_disambiguation.py`:
def sk_training_all(train_features, train_label, test_features, test_label, test_candidates):
    global sklearn_classifier_all
    print('== SkLearn MaxEnt ==')
    output_candidates = []
    test_set = []
    correct = 0
    N = len(test_label)
    
    for X, y in zip(test_features, test_label):
        test_set.append((X, y))
        
    for i in range(N):
        output_candidates.clear()
        
        feature = test_features[i]
        candidate = test_candidates[i]
        label = test_label[i]
        
        prediction = sklearn_classifier_all.prob_classify(feature)._prob_dict
        sorted_pred = sorted(prediction.items(), key=operator.itemgetter(1), reverse=True)
        
        for result, prob in sorted_pred:
            if result in candidate:
                output_candidates.append((result, prob))
                
        if not output_candidates: 
            con