In [None]:
import nltk
import numpy as np
# Fetch the NLTK corpora read later in the notebook: the SENSEVAL instances
# and the stopword lists.
nltk.download('senseval')
nltk.download('stopwords')
from nltk.corpus import senseval
import random
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from nltk.classify import accuracy, NaiveBayesClassifier, MaxentClassifier
from collections import defaultdict

[nltk_data] Downloading package senseval to /root/nltk_data...
[nltk_data]   Unzipping corpora/senseval.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def extract_vocabulary_frequency(data, stopwords=(), n=300):
    """Rank context words by the number of instances they appear in.

    Args:
        data: Iterable of instances exposing ``word`` (the target lemma with a
            part-of-speech suffix, e.g. ``'line-n'``) and ``context`` (a
            sequence whose items hold the token text at index 0).
        stopwords: Words to leave out of the vocabulary.
        n: Maximum number of vocabulary entries to return.

    Returns:
        A list of at most ``n`` ``(word, instance_count)`` tuples, highest
        count first. Ties are broken alphabetically so the result does not
        depend on set iteration order (which varies with the hash seed).
    """
    excluded = set(stopwords)
    instance_counts = defaultdict(int)
    for instance in data:
        # ``instance.word`` looks like 'line-n'. Drop the '-<pos>' suffix so
        # the target word and its inflections ('line', 'lines') are actually
        # filtered out; comparing against the raw 'line-n' never matched.
        target = instance.word.rsplit('-', 1)[0]
        words = {token[0] for token in instance.context if not token[0].startswith(target)}
        # A set counts each word at most once per instance.
        for word in words - excluded:
            instance_counts[word] += 1
    ranked = sorted(instance_counts.items(), key=lambda item: (-item[1], item[0]))
    # Exactly ``n`` entries (the previous ``n + 1`` slice returned one extra).
    return ranked[:max(n, 0)]

In [None]:
def context_features(instance, vocab, distance=3):
    """Collect the words around the target token as boolean features.

    Looks at up to ``distance`` tokens on each side of ``instance.position``
    within ``instance.context`` (the token at that position is skipped) and
    maps the text of each one to ``True``.

    Args:
        instance: Object with ``position`` (index of the target token) and
            ``context`` (indexable sequence whose items hold the token text
            at index 0).
        vocab: Unused by this function.
        distance: Window radius, in tokens.

    Returns:
        dict mapping each neighbouring word to ``True``.
    """
    target = instance.position
    tokens = instance.context
    # Index ranges to the left and to the right of the target, clipped to the
    # bounds of the context.
    left = range(max(0, target - distance), target)
    right = range(target + 1, min(target + distance + 1, len(tokens)))
    return {tokens[i][0]: True for side in (left, right) for i in side}


In [None]:
def train_naive_bayes(data, stopwords_list, training_data, test_data, unique_labels, vocab_size, window_size):
    """Fit per-label word statistics on ``training_data`` and label ``test_data``.

    The score of a label for a test instance is its relative frequency in the
    training split multiplied by ``theta[label][word]`` for every vocabulary
    word found in the instance's context window, where::

        theta[label][word] = (1 + n(word, label)) / (len(unique_labels) + n(word))

    ``n(word, label)`` is the number of training instances with that label
    whose window contains ``word``; ``n(word)`` is the same count over all
    training instances.

    NOTE(review): this ratio normalises by the word's total count rather than
    by the label's count, so it resembles a smoothed P(label | word) more than
    the usual P(word | label) -- confirm this is the intended estimator.

    Args:
        data: Instances used to build the vocabulary.
        stopwords_list: Words excluded from the vocabulary.
        training_data: Sequence of ``(instance, label)`` pairs.
        test_data: Iterable of ``(instance, label)`` pairs to classify.
        unique_labels: Sequence of candidate labels; on ties the earliest one
            wins (``np.argmax`` semantics).
        vocab_size: Passed to ``extract_vocabulary_frequency`` as ``n``.
        window_size: Context radius passed to ``context_features``.

    Returns:
        ``(predicted_labels, true_labels)``, two lists aligned with
        ``test_data``.
    """
    vocabulary = [word for word, _ in extract_vocabulary_frequency(data, stopwords=stopwords_list, n=vocab_size)]
    vocab_lookup = set(vocabulary)

    # Count everything in one pass over the training split. These tables do
    # not depend on the test instance, so they are built once up front
    # instead of once per (test instance, label) pair.
    total_occurrence = defaultdict(int)
    label_occurrence = {label: defaultdict(int) for label in unique_labels}
    label_totals = dict.fromkeys(unique_labels, 0)
    for train_instance in training_data:
        label = train_instance[1]
        words = [
            word
            for word in context_features(train_instance[0], vocabulary, window_size)
            if word in vocab_lookup
        ]
        for word in words:
            total_occurrence[word] += 1
        if label in label_occurrence:
            label_totals[label] += 1
            per_label = label_occurrence[label]
            for word in words:
                per_label[word] += 1

    # Prior: share of the training split carrying each label.
    label_distribution = {label: label_totals[label] / len(training_data) for label in unique_labels}

    n_labels = len(unique_labels)
    theta = {
        label: {
            word: (1 + label_occurrence[label].get(word, 0)) / (n_labels + total_occurrence.get(word, 0))
            for word in vocabulary
        }
        for label in unique_labels
    }

    predicted_labels = []
    true_labels = []
    for instance in test_data:
        features = context_features(instance[0], vocabulary, window_size)
        probabilities = []
        for label in unique_labels:
            label_theta = theta[label]
            p_c_s = label_distribution[label]
            # Out-of-vocabulary context words contribute nothing.
            for feature in features:
                if feature in label_theta:
                    p_c_s *= label_theta[feature]
            probabilities.append(p_c_s)

        predicted_labels.append(unique_labels[np.argmax(probabilities)])
        true_labels.append(instance[1])

    return predicted_labels, true_labels


In [None]:
def validate_parameters(instances, stopwords_list, training_data, validation_data, unique_labels, window_sizes,
                        vocab_sizes):
    """Grid-search window size and vocabulary size on the validation split.

    Every combination of ``window_sizes`` x ``vocab_sizes`` is scored with
    ``train_naive_bayes`` on ``validation_data`` and printed as a table row.
    The first combination with the strictly highest accuracy wins; if no
    combination scores above 0 the defaults (window 2, vocabulary 200) are
    returned.

    Returns:
        ``(best_window_size, best_vocab_size)``.
    """
    row_template = "{:<15} {:<15} {:.4f}"
    top_score, top_window, top_vocab = 0, 2, 200

    print("=== Validation ===")
    print("{:<15} {:<15} {:<20}".format("Window size", "Vocabulary size", "Validation Accuracy"))
    print("=" * 45)

    for window in window_sizes:
        for vocab in vocab_sizes:
            guesses, gold = train_naive_bayes(
                instances, stopwords_list, training_data, validation_data, unique_labels, vocab, window
            )
            score = accuracy_score(gold, guesses)
            print(row_template.format(window, vocab, score))

            # Strict comparison: earlier combinations win ties.
            if score > top_score:
                top_score, top_window, top_vocab = score, window, vocab

    print("\nBest parameters - Window size: {}, Vocabulary size: {}".format(top_window, top_vocab))
    return top_window, top_vocab


In [None]:
def test_model(instances, stopwords_list, training_data, test_data, unique_labels, best_vocab_size, best_window_size):
    """Score the chosen configuration on ``test_data`` and print a report.

    Prints the accuracy, the confusion matrix, and up to ten misclassified
    test instances together with their true and predicted labels.
    """
    print("\n=== Testing ===")
    guesses, gold = train_naive_bayes(
        instances, stopwords_list, training_data, test_data, unique_labels, best_vocab_size, best_window_size
    )

    print("Accuracy: {:.4f}".format(accuracy_score(gold, guesses)))
    print("\nConfusion matrix:")
    print(confusion_matrix(gold, guesses))

    print("\nExamples of errors:")
    print("===================")
    shown = 0
    for idx, (expected, got) in enumerate(zip(gold, guesses)):
        if expected == got:
            continue
        print("Test instance:")
        print(test_data[idx][0])
        print("True label: {}, Predicted label: {}".format(expected, got))
        print("-------------------")
        shown += 1
        # Cap the listing at ten misclassifications.
        if shown == 10:
            break

In [None]:
def model(word):
    """Run the validate-then-test experiment for one SENSEVAL file.

    Loads every instance of ``word`` (e.g. ``'line.pos'``), labels each with
    its first listed sense, shuffles with a fixed seed, splits 70% / 5% / 25%
    into training / validation / test, picks the window and vocabulary size
    on the validation split, and prints the test report.

    Args:
        word: File id passed to ``nltk.corpus.senseval.instances``.
    """
    senses = []
    instances = []

    for instance in nltk.corpus.senseval.instances(word):
        # Only the first sense of each instance is used as its label.
        senses.append((instance, instance.senses[0]))
        instances.append(instance)

    stopwords_list = nltk.corpus.stopwords.words('english')
    # Sorted rather than raw set order: the label order decides argmax
    # tie-breaks in the classifier, and set iteration order of strings changes
    # between interpreter sessions, which made results non-reproducible.
    unique_labels = sorted({label for (_, label) in senses})

    random.seed(100)
    random.shuffle(senses)
    n = len(senses)
    training_data = senses[:int(0.7 * n)]
    validation_data = senses[int(0.7 * n):int(0.75 * n)]
    test_data = senses[int(0.75 * n):n]

    window_sizes = [1, 2, 3, 4]
    vocab_sizes = [100, 200, 300]

    # NOTE(review): ``instances`` holds every instance, so the vocabulary is
    # selected using validation and test contexts as well -- confirm whether
    # it should be restricted to the training split.
    best_window_size, best_vocab_size = validate_parameters(
        instances, stopwords_list, training_data, validation_data, unique_labels, window_sizes, vocab_sizes
    )

    test_model(instances, stopwords_list, training_data, test_data, unique_labels, best_vocab_size, best_window_size)


In [None]:
# Run the validation/test experiment on the 'line.pos' SENSEVAL file.
model('line.pos')

=== Validation ===
Window size     Vocabulary size Validation Accuracy 
1               100             0.5894
1               200             0.5942
1               300             0.5990
2               100             0.5797
2               200             0.5845
2               300             0.5942
3               100             0.5652
3               200             0.5700
3               300             0.5749
4               100             0.5604
4               200             0.5652
4               300             0.5749

Best parameters - Window size: 1, Vocabulary size: 300

=== Testing ===
Accuracy: 0.6152

Confusion matrix:
[[  0   0   1   0  95   0]
 [  0  15   0   0  72   0]
 [  0   0  13   0  66   0]
 [  0   0   0  36  65   0]
 [  0   0   1   0 570   0]
 [  0   1   1   0  97   4]]

Examples of errors:
Test instance:
SensevalInstance(word='line-n', position=28, context=[('overall', 'RB'), ('they', 'PRP'), ('increased', 'VBD'), ('2', 'CD'), ('.', '.'), ('8', 'CD'), ('

In [None]:
# Run the validation/test experiment on the 'hard.pos' SENSEVAL file.
model('hard.pos')

=== Validation ===
Window size     Vocabulary size Validation Accuracy 
1               100             0.8750
1               200             0.8889
1               300             0.9074
2               100             0.8565
2               200             0.8657
2               300             0.8750
3               100             0.8611
3               200             0.8796
3               300             0.8935
4               100             0.8519
4               200             0.8704
4               300             0.8796

Best parameters - Window size: 1, Vocabulary size: 300

=== Testing ===
Accuracy: 0.8755

Confusion matrix:
[[872   5   1]
 [ 54  71   0]
 [ 75   0   6]]

Examples of errors:
Test instance:
SensevalInstance(word='hard-a', position=1, context=[('but', 'CC'), ('hard', 'JJ'), ('sided', 'VBD'), ('luggage', 'NN'), (',', ','), ('he', 'PRP'), ('says', 'VBZ'), (',', ','), ('is', 'VBZ'), ('the', 'DT'), ('elegant', 'JJ'), ('type', 'NN'), ('that', 'IN'), ('people', 

In [None]:
# Run the validation/test experiment on the 'serve.pos' SENSEVAL file.
model('serve.pos')

=== Validation ===
Window size     Vocabulary size Validation Accuracy 
1               100             0.4612
1               200             0.4703
1               300             0.4703
2               100             0.5297
2               200             0.5616
2               300             0.5708
3               100             0.5890
3               200             0.6073
3               300             0.6347
4               100             0.6393
4               200             0.6712
4               300             0.7078

Best parameters - Window size: 4, Vocabulary size: 300

=== Testing ===
Accuracy: 0.6393

Confusion matrix:
[[437  19   0   1]
 [127 196   3   1]
 [125  26  57   2]
 [ 58  32   1  10]]

Examples of errors:
Test instance:
SensevalInstance(word='serve-v', position=19, context=[('his', 'PRP$'), ('son', 'NN'), ('had', 'VBD'), ('entered', 'VBN'), ('the', 'DT'), ('army', 'NN'), (':', ':'), ('and', 'CC'), ('young', 'JJ'), ('osborne', 'NNP'), ('followed', 'VBD'),

In [None]:
# Run the validation/test experiment on the 'interest.pos' SENSEVAL file.
model('interest.pos')

=== Validation ===
Window size     Vocabulary size Validation Accuracy 
1               100             0.5042
1               200             0.5294
1               300             0.5462
2               100             0.5546
2               200             0.5966
2               300             0.6134
3               100             0.5546
3               200             0.5966
3               300             0.6134
4               100             0.5042
4               200             0.5462
4               300             0.5546

Best parameters - Window size: 2, Vocabulary size: 300

=== Testing ===
Accuracy: 0.6791

Confusion matrix:
[[ 23   0   0   1   1  78]
 [  0   0   0   0   0   2]
 [  0   0   0   0   0  17]
 [  0   0   0   7   0  36]
 [  0   0   0   0  63  54]
 [  0   0   0   0   1 309]]

Examples of errors:
Test instance:
SensevalInstance(word='interest-n', position=11, context=[('the', 'DT'), ('meeting', 'NN'), ('offered', 'VBD'), ('stark', 'JJ'), ('evidence', 'NN'), ('o