In [1]:
import nltk
import math
from itertools import combinations
from nltk.corpus import stopwords
import string

In [2]:
# preprocessing function
def preprocessing(filename):
    """Read a labelled corpus file into ``[label, cleaned_text]`` pairs.

    Every non-blank line is split on whitespace. The first token is kept
    verbatim as the label; the remaining tokens are re-joined with single
    spaces, lower-cased, and every ``string.punctuation`` character is
    replaced by a space.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list holding one two-element list ``[label, text]`` per non-blank
        line, in file order.
    """
    # Build the translation table once instead of once per line.
    table = str.maketrans(string.punctuation, " " * len(string.punctuation))
    # The context manager closes the file even if reading raises.
    with open(filename) as f:
        lines = f.read().splitlines()
    res = []
    for line in lines:
        tokens = line.split()
        if not tokens:
            # A blank / whitespace-only line has no label; indexing
            # tokens[0] would raise IndexError, so skip it.
            continue
        text = " ".join(tokens[1:]).lower().translate(table)
        res.append([tokens[0], text])
    return res

In [3]:
# init files
# Load the train/test split of each ambiguous word; every entry is a
# [label, cleaned sentence] pair produced by preprocessing().
basstrain, basstest, saketrain, saketest = (
    preprocessing(path)
    for path in (
        "bass_sake_train_test/bass.trn",
        "bass_sake_train_test/bass.tst",
        "bass_sake_train_test/sake.trn",
        "bass_sake_train_test/sake.tst",
    )
)

In [4]:
# step 1: find how often each sense is used in the training corpus
# step 2: for each sense, collect the sentences (surrounding words) it occurs in
#
# NOTE: this cell rebinds basstrain/saketrain from [label, sentence] pairs to
# plain sentence strings, so it must not be re-run without first re-running
# the loading cell.

# corpus sizes
basstotal = len(basstrain)
saketotal = len(saketrain)

# sort through bass: entries whose label contains '*' go to bassfish
bassfish = [entry[1] for entry in basstrain if '*' in entry[0]]
bassmusic = [entry[1] for entry in basstrain if '*' not in entry[0]]

# sort through sake: entries whose label contains '*' go to sakealch
sakealch = [entry[1] for entry in saketrain if '*' in entry[0]]
sakecause = [entry[1] for entry in saketrain if '*' not in entry[0]]

# the labels are no longer needed, so keep only the sentence strings
basstrain = [entry[1] for entry in basstrain]
saketrain = [entry[1] for entry in saketrain]

# per-sense counts
numbassfish = len(bassfish)
numbassmusic = len(bassmusic)
numsakealch = len(sakealch)
numsakecause = len(sakecause)

# per-sense share of the training corpus
percentfish = numbassfish / basstotal
percentmusic = numbassmusic / basstotal
percentalch = numsakealch / saketotal
percentcause = numsakecause / saketotal

print(sakecause)

['of the society for the sake of their re employment at an', 'sacrifice their souls for the sake of their country    doc id  afe19960518 0073 ', 'however  making controversy for controversy s sake isn t what he s about  he', 'branches of government for the sake of suing    cardinale said    and', 'hurt taking experience for the sake of taking experience    despite that', 'commitment to art for art s sake and a commitment to the', 'to make  sacrifices  for the sake economic stability  she said  she', 'stay together indefinitely for the sake of the kids  in a', 'to killing prostitutes for   the sake of god    the killings have', 'to change them for the sake of pragmatic political gains  what', 'we must play for the sake of the game   kottan said ', 'a major concession for the sake of stability  dropping a demand', 'of history for the future sake of the world  the forces', 'sacrificing your life for the sake or work  or they believe', 'moral minded will take for the sake of survival    so

In [5]:
# step 3: collect features
# feature collecting functions

# create collection of nearby words
def nearby_words(dataset, targetword):
    """Count every word that appears alongside ``targetword`` in ``dataset``.

    Args:
        dataset: Iterable of sentence strings; each one is split on
            whitespace.
        targetword: The ambiguous word itself, which is excluded from the
            counts.

    Returns:
        dict mapping each other word to its total number of occurrences
        across all sentences.
    """
    frequencydict = {}
    for line in dataset:
        for word in line.split():
            # Compare against the parameter, not the string literal
            # 'targetword', so the target word really is left out.
            if word != targetword:
                frequencydict[word] = frequencydict.get(word, 0) + 1
    return frequencydict

# create collection of words immediately to the left
def left_words(dataset, targetword):
    """Count the words found immediately to the left of ``targetword``.

    Args:
        dataset: Iterable of sentence strings; each one is split on
            whitespace.
        targetword: Word whose left-hand neighbours are counted.

    Returns:
        dict mapping each left neighbour to the number of times it directly
        precedes ``targetword``. An occurrence of ``targetword`` at the very
        start of a sentence has no left neighbour and contributes nothing.
    """
    frequencydict = {}
    for line in dataset:
        line_list = line.split()
        # Start at index 1: at index 0, line_list[i - 1] would be
        # line_list[-1] and wrongly count the LAST word of the sentence.
        for i in range(1, len(line_list)):
            if line_list[i] == targetword:
                left = line_list[i - 1]
                frequencydict[left] = frequencydict.get(left, 0) + 1
    return frequencydict

# create collection of words immediately to the right
def right_words(dataset, targetword):
    """Count the words found immediately to the right of ``targetword``.

    Args:
        dataset: Iterable of sentence strings; each one is split on
            whitespace.
        targetword: Word whose right-hand neighbours are counted.

    Returns:
        dict mapping each right neighbour to the number of times it directly
        follows ``targetword``. An occurrence at the end of a sentence has no
        right neighbour and contributes nothing.
    """
    counts = {}
    for sentence in dataset:
        tokens = sentence.split()
        # Pair each token with its successor; the final token has no
        # successor, so it never shows up as `current`.
        for current, following in zip(tokens, tokens[1:]):
            if current != targetword:
                continue
            counts[following] = counts.get(following, 0) + 1
    return counts

In [6]:
# make freq dicts for each meaning and then for both meanings (need the latter
# for probabilities later)
# one frequency dict per sense, plus one over the whole training set per word
(fishfreq, musicfreq, bassfreq,
 alchfreq, causefreq, sakefreq) = (
    nearby_words(sentences, word)
    for sentences, word in (
        (bassfish, 'bass'),
        (bassmusic, 'bass'),
        (basstrain, 'bass'),
        (sakealch, 'sake'),
        (sakecause, 'sake'),
        (saketrain, 'sake'),
    )
)

In [7]:
# make dicts of words to the left and right
# left-neighbour counts: per sense, then over the whole training set
(fishleft, musicleft, bassleft,
 alchleft, causeleft, sakeleft) = (
    left_words(sentences, word)
    for sentences, word in (
        (bassfish, 'bass'),
        (bassmusic, 'bass'),
        (basstrain, 'bass'),
        (sakealch, 'sake'),
        (sakecause, 'sake'),
        (saketrain, 'sake'),
    )
)

# right-neighbour counts: per sense, then over the whole training set
(fishright, musicright, bassright,
 alchright, causeright, sakeright) = (
    right_words(sentences, word)
    for sentences, word in (
        (bassfish, 'bass'),
        (bassmusic, 'bass'),
        (basstrain, 'bass'),
        (sakealch, 'sake'),
        (sakecause, 'sake'),
        (saketrain, 'sake'),
    )
)

In [8]:
# step 4: make log-likelihood decision lists

# function to find P(word has certain meaning|feature present)
def prob_meaning_given_feature(desiredmeaningfeaturefreq, totalfeaturefreq):
    """Estimate P(sense | feature) for every feature seen in the corpus.

    Args:
        desiredmeaningfeaturefreq: dict of feature counts for the sense of
            interest.
        totalfeaturefreq: dict of feature counts over all senses.

    Returns:
        dict keyed like ``totalfeaturefreq``; each value is the sense count
        divided by the total count, or 0 when the feature never occurs with
        the sense of interest.
    """
    return {
        feature: (
            desiredmeaningfeaturefreq[feature] / total
            if feature in desiredmeaningfeaturefreq
            else 0
        )
        for feature, total in totalfeaturefreq.items()
    }

# probability smoother, using Laplacian Smoothing
def smooth_probabilities(probdict, totaltokens, alpha):
    """Apply additive smoothing to every probability in ``probdict``.

    Each probability p becomes
    ``(p * totaltokens + alpha) / (totaltokens + alpha * V)``
    where V is the number of entries in ``probdict``. This is the usual
    ``(m + alpha) / (M + alpha * V)`` form with the count m recovered as
    ``p * M``.

    Args:
        probdict: dict mapping feature -> unsmoothed probability.
        totaltokens: the M term of the formula.
        alpha: the additive smoothing constant.

    Returns:
        A new dict with the same keys and the smoothed probabilities.
    """
    # The denominator does not depend on the entry, so compute it once.
    denominator = totaltokens + alpha * len(probdict)
    return {
        feature: (raw * totaltokens + alpha) / denominator
        for feature, raw in probdict.items()
    }
    
# function for computing the log-likelihood of many features
def log_likelyhood(probdic1, probdic2):
    """Compute ``abs(log(p1 / p2))`` for every feature in ``probdic1``.

    Args:
        probdic1: dict mapping feature -> probability under the first sense.
        probdic2: dict mapping feature -> probability under the second sense;
            it must contain every key of ``probdic1``.

    Returns:
        dict mapping each feature of ``probdic1`` to the absolute value of the
        natural log of the ratio of its two probabilities.
    """
    return {
        word: abs(math.log(numerator / probdic2[word]))
        for word, numerator in probdic1.items()
    }

# function for finding features with the highest log-likelyhoods
def highest_likelyhoods(listoflogdicts, numvalues):
    """Return the ``numvalues`` features with the largest log-likelihoods.

    Args:
        listoflogdicts: Sequence of ``{word: log-likelihood}`` dicts, given in
            the order nearby words, left words, right words. The position
            decides the prefix put in front of each word ('nearby words ',
            'left ', 'right '); dicts beyond the third reuse 'right '.
        numvalues: Maximum number of features to return.

    Returns:
        A list of ``[feature_name, log_likelihood]`` pairs in descending
        order of log-likelihood, at most ``numvalues`` long. Features with
        equal scores keep the order in which they were encountered. Features
        whose score is not strictly positive are never returned, and no
        placeholder entry is added when fewer than ``numvalues`` qualify.
    """
    labels = ('nearby words ', 'left ', 'right ')
    candidates = []
    for position, dic in enumerate(listoflogdicts):
        # Choose the label from the dict's position in the list.
        ftstring = labels[min(position, len(labels) - 1)]
        for word, score in dic.items():
            # Scores of zero or less are skipped.
            if score > 0:
                candidates.append([ftstring + word, score])
    # list.sort is stable, also with reverse=True, so ties stay in
    # encounter order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:max(numvalues, 0)]

In [9]:
# find the smoothed probability of each feature under each sense
# (alpha is the additive-smoothing constant, currently 0.5; it was picked
# after trying a few values to see how alpha changes the probabilities)
alpha = .5

# nearby words
fishnearbywordsprobs = smooth_probabilities(
    prob_meaning_given_feature(fishfreq, bassfreq), basstotal, alpha)
musicnearbywordsprobs = smooth_probabilities(
    prob_meaning_given_feature(musicfreq, bassfreq), basstotal, alpha)
alchnearbywordsprobs = smooth_probabilities(
    prob_meaning_given_feature(alchfreq, sakefreq), saketotal, alpha)
causenearbywordsprobs = smooth_probabilities(
    prob_meaning_given_feature(causefreq, sakefreq), saketotal, alpha)

# word to the left
fishleftprobs = smooth_probabilities(
    prob_meaning_given_feature(fishleft, bassleft), basstotal, alpha)
musicleftprobs = smooth_probabilities(
    prob_meaning_given_feature(musicleft, bassleft), basstotal, alpha)
alchleftprobs = smooth_probabilities(
    prob_meaning_given_feature(alchleft, sakeleft), saketotal, alpha)
causeleftprobs = smooth_probabilities(
    prob_meaning_given_feature(causeleft, sakeleft), saketotal, alpha)

# word to the right
fishrightprobs = smooth_probabilities(
    prob_meaning_given_feature(fishright, bassright), basstotal, alpha)
musicrightprobs = smooth_probabilities(
    prob_meaning_given_feature(musicright, bassright), basstotal, alpha)
alchrightprobs = smooth_probabilities(
    prob_meaning_given_feature(alchright, sakeright), saketotal, alpha)
causerightprobs = smooth_probabilities(
    prob_meaning_given_feature(causeright, sakeright), saketotal, alpha)

In [10]:
# compute log likelyhood of features

# nearby words
logoffishnearby, logofmusicnearby, logofalchnearby, logofcausenearby = (
    log_likelyhood(fishnearbywordsprobs, musicnearbywordsprobs),
    log_likelyhood(musicnearbywordsprobs, fishnearbywordsprobs),
    log_likelyhood(alchnearbywordsprobs, causenearbywordsprobs),
    log_likelyhood(causenearbywordsprobs, alchnearbywordsprobs),
)

# word to the left
logoffishleft, logofmusicleft, logofalchleft, logofcauseleft = (
    log_likelyhood(fishleftprobs, musicleftprobs),
    log_likelyhood(musicleftprobs, fishleftprobs),
    log_likelyhood(alchleftprobs, causeleftprobs),
    log_likelyhood(causeleftprobs, alchleftprobs),
)

# word to the right
logoffishright, logofmusicright, logofalchright, logofcauseright = (
    log_likelyhood(fishrightprobs, musicrightprobs),
    log_likelyhood(musicrightprobs, fishrightprobs),
    log_likelyhood(alchrightprobs, causerightprobs),
    log_likelyhood(causerightprobs, alchrightprobs),
)

# note: because log_likelyhood takes the absolute value of the log ratio,
# log_likelyhood(a, b) = log_likelyhood(b, a); I only realized this later

In [11]:
# find the best features for identifying each type of the word
# top 10 features per word, drawn from the nearby / left / right dicts
bassbestfeatures, sakebestfeatures = (
    highest_likelyhoods(logdicts, 10)
    for logdicts in (
        [logoffishnearby, logoffishleft, logoffishright],
        [logofalchnearby, logofalchleft, logofalchright],
    )
)
print(bassbestfeatures, '\n', sakebestfeatures)

[['nearby words stephan', 7.496097345175956], ['nearby words weidner', 7.496097345175956], ['nearby words composer', 7.496097345175956], ['nearby words player', 7.496097345175956], ['nearby words boehse', 7.496097345175956], ['nearby words onkelz', 7.496097345175956], ['nearby words valued', 7.496097345175956], ['nearby words 250', 7.496097345175956], ['nearby words trapped', 7.496097345175956], ['nearby words room', 7.496097345175956]] 
 [['nearby words society', 7.496097345175956], ['nearby words their', 7.496097345175956], ['nearby words employment', 7.496097345175956], ['nearby words an', 7.496097345175956], ['nearby words sacrifice', 7.496097345175956], ['nearby words souls', 7.496097345175956], ['nearby words doc', 7.496097345175956], ['nearby words id', 7.496097345175956], ['nearby words afe19960518', 7.496097345175956], ['nearby words 0073', 7.496097345175956]]


In [12]:
# sense proportions in the test data, used for model comparison
# (a '*' in the label is counted as the fish / alch sense, as in training)

# bass
totalbasstst = len(basstest)
percentfishtst = sum('*' in item[0] for item in basstest) / totalbasstst
percentmusictst = sum('*' not in item[0] for item in basstest) / totalbasstst

# sake
totalsaketst = len(saketest)
percentalchtst = sum('*' in item[0] for item in saketest) / totalsaketst
percentcausetst = sum('*' not in item[0] for item in saketest) / totalsaketst

print(percentfishtst, percentmusictst, percentalchtst, percentcausetst)

0.44 0.56 0.06 0.94


In [14]:
# baseline test: label every example with the sense that is most frequent
# in the training data, and score that choice on the test data
basssenselabel, bassbaseaccuracy = (
    ('fish', percentfishtst)
    if percentfish > percentmusic
    else ('music', percentmusictst)
)

sakesenselabel, sakebaseaccuracy = (
    ('beer', percentalchtst)
    if percentalch > percentcause
    else ('cause', percentcausetst)
)

print("Sense label for bass:", basssenselabel)
print("Sense label for sake:", sakesenselabel)
print("Baseline accuracy for bass:", bassbaseaccuracy)
print("Baseline accuracy for sake:", sakebaseaccuracy)

Sense label for bass: music
Sense label for sake: cause
Baseline accuracy for bass: 0.56
Baseline accuracy for sake: 0.94


In [16]:
# use the decision list generated above to determine the meaning of the word
# NOTE: these word sets are hard-coded from the best-features output printed
# earlier, so they must be updated by hand if that output changes
fishwords = {'valued', '250', 'trapped'}
musicwords = {'stephan', 'weidner', 'composer', 'player', 'boehse', 'onkelz', 'room'}
alchwords = set()  # there were legitimately none, all the features came from cause
causewords = {'society', 'their', 'employment', 'an', 'sacrifice', 'souls', 'doc', 'id', 'afe19960518', '0073'}

basscorrectguesses = 0
for item in basstest:
    # predict fish when any fish word is present; otherwise fall back to the
    # sensible label (music words are not consulted)
    predicted_fish = not fishwords.isdisjoint(item[1].split())
    actually_fish = '*' in item[0]
    if predicted_fish == actually_fish:
        basscorrectguesses += 1
bassaccuracy = basscorrectguesses / totalbasstst

# since alch has no words, this method is exactly the same as just using the
# sensible label: the algorithm is 1) check whether any word in the sentence
# belongs to the less common meaning's word set, 2) if so, use the less common
# meaning; if not, use the sensible label
sakeaccuracy = sakebaseaccuracy

print("Accuracy for bass:", bassaccuracy)
print("Accuracy for sake:", sakeaccuracy)

Accuracy for bass: 0.56
Accuracy for sake: 0.94
