In [1]:
import nltk
import math
from itertools import combinations
from nltk.corpus import stopwords
import string

In [2]:
# preprocessing function
def preprocessing(filename):
    """Read a labelled corpus file and normalise the text on each line.

    Every non-blank line is split on whitespace.  The first token is kept
    unchanged as the line's identifier; the remaining tokens are re-joined
    with single spaces, lower-cased, and every character listed in
    ``string.punctuation`` is replaced by a space.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of ``[identifier, cleaned_text]`` pairs, one per non-blank
        line, in file order.  Blank or whitespace-only lines are skipped
        because they have no identifier token.
    """
    # Build the punctuation -> space table once rather than once per line.
    table = str.maketrans(string.punctuation, " " * len(string.punctuation))

    # ``with`` guarantees the handle is closed even if reading raises.
    with open(filename) as f:
        lines = f.read().splitlines()

    res = []
    for line in lines:
        tokens = line.split()
        if not tokens:
            # Nothing to index as an identifier; avoids an IndexError.
            continue
        val = " ".join(tokens[1:]).lower().translate(table)
        res.append([tokens[0], val])
    return res

In [3]:
# load and preprocess the train/test files for both ambiguous words
basstrain, basstest, saketrain, saketest = map(
    preprocessing,
    (
        "bass_sake_train_test/bass.trn",
        "bass_sake_train_test/bass.tst",
        "bass_sake_train_test/sake.trn",
        "bass_sake_train_test/sake.tst",
    ),
)

In [4]:
# step 1: find how often each usage occurs in the corpus
# step 2: for each usage, collect the sentences (surrounding words) it appears in

# corpus sizes, taken while every entry is still an [identifier, sentence] pair
basstotal = len(basstrain)
saketotal = len(saketrain)

# split bass: entries whose identifier contains '*' go to bassfish,
# all the others go to bassmusic
bassfish = [entry[1] for entry in basstrain if '*' in entry[0]]
bassmusic = [entry[1] for entry in basstrain if '*' not in entry[0]]

# split sake the same way: '*' in the identifier -> sakealch, otherwise sakecause
sakealch = [entry[1] for entry in saketrain if '*' in entry[0]]
sakecause = [entry[1] for entry in saketrain if '*' not in entry[0]]

# the identifiers are no longer needed, so keep only the sentence strings
basstrain = [entry[1] for entry in basstrain]
saketrain = [entry[1] for entry in saketrain]

# per-usage counts and their share of the training data
numbassfish, numbassmusic = len(bassfish), len(bassmusic)
percentfish = numbassfish / basstotal
percentmusic = numbassmusic / basstotal

numsakealch, numsakecause = len(sakealch), len(sakecause)
percentalch = numsakealch / saketotal
percentcause = numsakecause / saketotal

In [5]:
# step 3: collect features
# feature collecting functions

# create collection of nearby words
def nearby_words(dataset, targetword):
    """Count every word that co-occurs with the target word.

    Args:
        dataset: iterable of sentence strings; each is split on whitespace.
        targetword: the ambiguous word itself, which is left out of the
            counts.

    Returns:
        A dict mapping each word other than ``targetword`` to the number of
        times it appears across all sentences in ``dataset``.
    """
    frequencydict = {}
    for line in dataset:
        for word in line.split():
            # Compare against the parameter, not the literal string
            # 'targetword', so the ambiguous word really is excluded.
            if word != targetword:
                frequencydict[word] = frequencydict.get(word, 0) + 1
    return frequencydict

# create collection of words immediately to the left
def left_words(dataset, targetword):
    """Count the words that appear immediately to the left of the target.

    Args:
        dataset: iterable of sentence strings; each is split on whitespace.
        targetword: the word whose left-hand neighbours are counted.

    Returns:
        A dict mapping each left neighbour to the number of times it directly
        precedes ``targetword``.  An occurrence at the very start of a
        sentence has no left neighbour and contributes nothing.
    """
    frequencydict = {}
    for line in dataset:
        line_list = line.split()
        # Start at index 1: at index 0, ``line_list[i - 1]`` would wrap
        # around to the last word of the sentence.
        for i in range(1, len(line_list)):
            if line_list[i] == targetword:
                neighbour = line_list[i - 1]
                frequencydict[neighbour] = frequencydict.get(neighbour, 0) + 1
    return frequencydict

# create collection of words immediately to the right
def right_words(dataset, targetword):
    """Count the words that appear immediately to the right of the target.

    Args:
        dataset: iterable of sentence strings; each is split on whitespace.
        targetword: the word whose right-hand neighbours are counted.

    Returns:
        A dict mapping each right neighbour to the number of times it directly
        follows ``targetword``.  An occurrence at the end of a sentence has no
        right neighbour and is ignored.
    """
    counts = {}
    for sentence in dataset:
        tokens = sentence.split()
        # Pairing each token with its successor leaves the final token
        # without a pair, so no explicit bounds check is needed.
        for current, following in zip(tokens, tokens[1:]):
            if current != targetword:
                continue
            counts[following] = counts.get(following, 0) + 1
    return counts

In [6]:
# word-frequency dicts: one per meaning, plus one over the whole training set
# for each word (the combined dict is needed for the probabilities later)
fishfreq, musicfreq, bassfreq = (
    nearby_words(sentences, 'bass')
    for sentences in (bassfish, bassmusic, basstrain)
)
alchfreq, causefreq, sakefreq = (
    nearby_words(sentences, 'sake')
    for sentences in (sakealch, sakecause, saketrain)
)

In [7]:
# words immediately to the left of the target, per meaning and overall
fishleft, musicleft, bassleft = (
    left_words(sentences, 'bass')
    for sentences in (bassfish, bassmusic, basstrain)
)
alchleft, causeleft, sakeleft = (
    left_words(sentences, 'sake')
    for sentences in (sakealch, sakecause, saketrain)
)

# words immediately to the right of the target, per meaning and overall
fishright, musicright, bassright = (
    right_words(sentences, 'bass')
    for sentences in (bassfish, bassmusic, basstrain)
)
alchright, causeright, sakeright = (
    right_words(sentences, 'sake')
    for sentences in (sakealch, sakecause, saketrain)
)

In [8]:
# step 4: make log-likelihood decision lists

# function to find P(word has certain meaning|feature present)
def prob_meaning_given_feature(desiredmeaningfeaturefreq, totalfeaturefreq):
    """Estimate P(meaning | feature) for every feature in the total counts.

    Args:
        desiredmeaningfeaturefreq: dict of feature -> count observed with the
            meaning of interest.
        totalfeaturefreq: dict of feature -> count observed over all meanings.

    Returns:
        A dict keyed like ``totalfeaturefreq``.  Each value is the meaning
        count divided by the total count, or ``0`` when the feature is absent
        from ``desiredmeaningfeaturefreq``.
    """
    return {
        feature: (
            desiredmeaningfeaturefreq[feature] / total
            if feature in desiredmeaningfeaturefreq
            else 0
        )
        for feature, total in totalfeaturefreq.items()
    }

# probability smoother, using Laplacian Smoothing
def smooth_probabilities(probdict, totaltokens, alpha):
    """Apply additive (add-alpha) smoothing to a dict of probabilities.

    Each probability ``p`` is turned back into a pseudo-count ``p * totaltokens``
    and smoothed as ``(p * totaltokens + alpha) / (totaltokens + alpha * V)``,
    where ``V`` is the number of entries in ``probdict``.

    Args:
        probdict: dict of feature -> unsmoothed probability.
        totaltokens: the count ``M`` used to rescale each probability.
        alpha: the additive smoothing constant.

    Returns:
        A new dict with the same keys and the smoothed values.
    """
    # The denominator is identical for every entry, so compute it once.
    denominator = totaltokens + alpha * len(probdict)
    return {
        feature: (probability * totaltokens + alpha) / denominator
        for feature, probability in probdict.items()
    }
    
# function for computing the log-likelihood of many features
def log_likelyhood(probdic1, probdic2):
    """Compute the absolute log-likelihood ratio for every feature.

    Args:
        probdic1: dict of feature -> probability under the first meaning.
        probdic2: dict of feature -> probability under the second meaning;
            it must contain every key of ``probdic1``.

    Returns:
        A dict mapping each key of ``probdic1`` to
        ``abs(log(probdic1[key] / probdic2[key]))``.
    """
    return {
        feature: abs(math.log(first / probdic2[feature]))
        for feature, first in probdic1.items()
    }

# function for finding features with the highest log-likelyhoods
def highest_likelyhoods(listoflogdicts, numvalues):
    """Pick the features with the largest log-likelihood scores.

    Args:
        listoflogdicts: list of dicts of word -> score.  The dicts must be
            given in the order nearby words, left words, right words.  The
            first dict is labelled ``'nearby words '``, the second
            ``'left '``, and the third (and any further ones) ``'right '``.
        numvalues: maximum number of features to return.

    Returns:
        A list of ``[label + word, score]`` pairs sorted by descending score.
        Only strictly positive scores are considered.  Ties keep the order in
        which they were encountered.  At most ``numvalues`` pairs are
        returned, and no placeholder entry is added when fewer qualify.
    """
    labels = ('nearby words ', 'left ', 'right ')
    scored = []
    for position, dic in enumerate(listoflogdicts):
        # Label by the dict's position in the list; anything past the third
        # dict falls back to the last label.
        ftstring = labels[min(position, len(labels) - 1)]
        for word, score in dic.items():
            if score > 0:
                scored.append([ftstring + word, score])
    # list.sort is stable, including with reverse=True, so equal scores keep
    # their encounter order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:max(numvalues, 0)]

In [9]:
# find the smoothed probability of each feature
# NOTE(review): the original note here said alpha = .1 was chosen after
# testing a few values, but the code sets alpha to 1 -- confirm which value
# is intended
alpha = 1

# nearby words
fishnearbywordsprobs = smooth_probabilities(
    prob_meaning_given_feature(fishfreq, bassfreq), basstotal, alpha)
musicnearbywordsprobs = smooth_probabilities(
    prob_meaning_given_feature(musicfreq, bassfreq), basstotal, alpha)
alchnearbywordsprobs = smooth_probabilities(
    prob_meaning_given_feature(alchfreq, sakefreq), saketotal, alpha)
causenearbywordsprobs = smooth_probabilities(
    prob_meaning_given_feature(causefreq, sakefreq), saketotal, alpha)

# word to the left
fishleftprobs = smooth_probabilities(
    prob_meaning_given_feature(fishleft, bassleft), basstotal, alpha)
musicleftprobs = smooth_probabilities(
    prob_meaning_given_feature(musicleft, bassleft), basstotal, alpha)
alchleftprobs = smooth_probabilities(
    prob_meaning_given_feature(alchleft, sakeleft), saketotal, alpha)
causeleftprobs = smooth_probabilities(
    prob_meaning_given_feature(causeleft, sakeleft), saketotal, alpha)

# word to the right
fishrightprobs = smooth_probabilities(
    prob_meaning_given_feature(fishright, bassright), basstotal, alpha)
musicrightprobs = smooth_probabilities(
    prob_meaning_given_feature(musicright, bassright), basstotal, alpha)
alchrightprobs = smooth_probabilities(
    prob_meaning_given_feature(alchright, sakeright), saketotal, alpha)
causerightprobs = smooth_probabilities(
    prob_meaning_given_feature(causeright, sakeright), saketotal, alpha)

In [10]:
# compute the log-likelihood of every feature, once per meaning
# note: log_likelyhood takes an absolute value, so swapping its two arguments
# gives the same magnitudes (up to floating-point rounding); each pair below
# is therefore effectively the same table computed twice

# nearby words
logoffishnearby, logofmusicnearby = (
    log_likelyhood(fishnearbywordsprobs, musicnearbywordsprobs),
    log_likelyhood(musicnearbywordsprobs, fishnearbywordsprobs),
)
logofalchnearby, logofcausenearby = (
    log_likelyhood(alchnearbywordsprobs, causenearbywordsprobs),
    log_likelyhood(causenearbywordsprobs, alchnearbywordsprobs),
)

# word to the left
logoffishleft, logofmusicleft = (
    log_likelyhood(fishleftprobs, musicleftprobs),
    log_likelyhood(musicleftprobs, fishleftprobs),
)
logofalchleft, logofcauseleft = (
    log_likelyhood(alchleftprobs, causeleftprobs),
    log_likelyhood(causeleftprobs, alchleftprobs),
)

# word to the right
logoffishright, logofmusicright = (
    log_likelyhood(fishrightprobs, musicrightprobs),
    log_likelyhood(musicrightprobs, fishrightprobs),
)
logofalchright, logofcauseright = (
    log_likelyhood(alchrightprobs, causerightprobs),
    log_likelyhood(causerightprobs, alchrightprobs),
)

In [13]:
# rank the ten highest-scoring features for each ambiguous word
bassbestfeatures = highest_likelyhoods(
    [logoffishnearby, logoffishleft, logoffishright],
    10,
)
sakebestfeatures = highest_likelyhoods(
    [logofalchnearby, logofalchleft, logofalchright],
    10,
)
print(bassbestfeatures, sakebestfeatures, sep=' \n ')

[['nearby words stephan', 6.803505257608338], ['nearby words weidner', 6.803505257608338], ['nearby words composer', 6.803505257608338], ['nearby words player', 6.803505257608338], ['nearby words boehse', 6.803505257608338], ['nearby words onkelz', 6.803505257608338], ['nearby words valued', 6.803505257608338], ['nearby words 250', 6.803505257608338], ['nearby words trapped', 6.803505257608338], ['nearby words room', 6.803505257608338]] 
 [['nearby words society', 6.803505257608338], ['nearby words their', 6.803505257608338], ['nearby words employment', 6.803505257608338], ['nearby words an', 6.803505257608338], ['nearby words sacrifice', 6.803505257608338], ['nearby words souls', 6.803505257608338], ['nearby words doc', 6.803505257608338], ['nearby words id', 6.803505257608338], ['nearby words afe19960518', 6.803505257608338], ['nearby words 0073', 6.803505257608338]]
