In [136]:
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

# Number of start markers prepended to every sentence and size of the
# context window used by the model code below.
N = 2
# True: unseen test words are mapped to the UNK id.
# False: unseen test words are mapped to a randomly drawn word id.
UNK_TOKEN = True

# Read both corpora as lists of raw lines (trailing newlines kept; they are
# stripped when the sentences are tokenised). The context managers close the
# file handles as soon as the lines are in memory.
with open("en-de/train.en-de.low.filt.en", "r", encoding="UTF-8") as train_file:
    train = train_file.readlines()
with open("en-de/test.en-de.low.en", "r", encoding="UTF-8") as test_file:
    test = test_file.readlines()

In [9]:
# Word -> id table. Looking up an unseen key assigns it the next free id
# (the current size of the table) and stores it.
w2i = defaultdict(lambda: len(w2i))

# Reserve the first three ids, in this order, for the special tokens.
S = w2i["<s>"]
END = w2i["</s>"]
UNK = w2i["UNK"]

# Encode each training sentence as: N start markers, the word ids, end marker.
padded_train = []
for sentence in train:
    word_ids = [w2i[word] for word in sentence.strip().split()]
    padded_train.append([S]*N + word_ids + [END])

# Snapshot of the training vocabulary (special tokens included) and the
# reverse id -> word table.
VOCAB = set(w2i)
i2w = {index: word for word, index in w2i.items()}

In [34]:
padded_test = []

# Switch the vocabulary's behaviour for words that were not seen in training.
# Note that a defaultdict stores whatever the factory returns, so an unknown
# word keeps the id it was given on its first lookup.
if UNK_TOKEN:
    # Every unseen word maps to the UNK id.
    w2i.default_factory = lambda: UNK
else:
    # Every unseen word maps to a uniformly drawn regular word id. Ids 0-2 are
    # the special tokens, and the largest valid id is len(VOCAB) - 1, so the
    # draw is over the half-open interval [3, len(VOCAB)).
    w2i.default_factory = lambda: np.random.randint(3, len(VOCAB))

# Encode each test sentence the same way as the training data:
# N start markers, the word ids, then the end marker.
for sentence in test:
    word_ids = [w2i[word] for word in sentence.strip().split()]
    padded_test.append([S]*N + word_ids + [END])

In [186]:
def feat_func(sentence, n=None, vocab_size=None):
    """Build the feature vector for predicting the last token of `sentence`.

    The features are the concatenated one-hot encodings of the `n` token ids
    that immediately precede the final token (the final token itself is the
    prediction target and is not encoded).

    Parameters
    ----------
    sentence : sequence of int
        Token ids; must contain at least `n` context ids plus the target.
    n : int, optional
        Context size. Defaults to the notebook-level constant `N`.
    vocab_size : int, optional
        Width of each one-hot segment. Defaults to `len(VOCAB)`.

    Returns
    -------
    numpy.ndarray
        1-D float array of length `n * vocab_size` with exactly one 1 per
        context position.

    Raises
    ------
    ValueError
        If `sentence` has fewer than `n + 1` tokens.
    """
    n = N if n is None else n
    vocab_size = len(VOCAB) if vocab_size is None else vocab_size

    # The n tokens before the target. Slicing with [-n:-1] would yield only
    # n - 1 tokens and a vector too short for a weight matrix with
    # n * vocab_size columns.
    context = np.asarray(sentence[-(n + 1):-1], dtype=int)
    if context.size != n:
        raise ValueError(
            "sentence needs at least %d tokens (context + target), got %d"
            % (n + 1, len(sentence))
        )

    # One row per context position, then flattened so that position k
    # occupies the slice [k * vocab_size, (k + 1) * vocab_size).
    features = np.zeros((n, vocab_size))
    features[np.arange(n), context] = 1
    return features.ravel()

def forward(x, W, b):
    """Return the output distribution softmax(W @ x + b).

    Parameters
    ----------
    x : array-like, 1-D
        Feature vector with one entry per column of `W`.
    W : numpy.ndarray, 2-D
        Weight matrix; one row per output class.
    b : numpy.ndarray, 1-D
        Bias, one entry per row of `W`.

    Returns
    -------
    The result of `softmax` applied to the scores.
    """
    x = np.asarray(x)
    # Only the columns with a non-zero feature contribute to W @ x, so the
    # product is restricted to them instead of touching the whole matrix.
    active = np.flatnonzero(x)
    # The bias is added exactly once, regardless of how many features fire.
    scores = W[:, active] @ x[active] + b
    return softmax(scores)

def LL_loss(probs):
    """Return the sum of the natural logarithms of the entries of `probs`."""
    log_probs = np.log(probs)
    return log_probs.sum()

def softmax(scores):
    """Normalise `scores` into a probability distribution.

    Computes exp(scores) / sum(exp(scores)) over all entries of `scores`.

    Parameters
    ----------
    scores : array-like
        Unnormalised scores.

    Returns
    -------
    numpy.ndarray
        Non-negative values of the same shape as `scores` that sum to 1.
        An empty input is returned as an empty array.
    """
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        return scores
    # Subtracting the maximum leaves the result unchanged (the common factor
    # cancels in the ratio) but keeps exp() from overflowing to inf, which
    # would otherwise turn the output into NaN for large scores.
    exp = np.exp(scores - np.max(scores))
    return exp / np.sum(exp)

def onehot(indices, size):
    """One-hot encode `indices` into rows of width `size`.

    `indices` may be a single integer or a sequence of integers. The result
    is a float array with one row per index (a single integer still yields
    a 2-D array with one row), containing a 1 in the column given by the
    index and 0 everywhere else.
    """
    idx = np.array(indices)
    count = idx.size
    encoded = np.zeros((count, size))
    rows = np.arange(count)
    encoded[rows, idx] = 1
    return encoded

def step(sentence, W, b, eta = 0.1):
    """Perform one gradient update on a single (context, target) example.

    The last token of `sentence` is the target; the features come from
    `feat_func(sentence)`.

    Parameters
    ----------
    sentence : sequence of int
        Token ids ending with the target token.
    W : numpy.ndarray, 2-D
        Weight matrix; updated in place.
    b : numpy.ndarray, 1-D
        Bias vector; updated in place.
    eta : float
        Learning rate.

    Returns
    -------
    (W, b)
        The same two arrays that were passed in, after the update.
    """
    x = feat_func(sentence)
    probs = forward(x, W, b)
    # Predicted distribution minus the one-hot target. Assuming `forward`
    # computes softmax(W @ x + b), this is the gradient of the negative
    # log-likelihood with respect to the scores, and hence to b.
    dldb = probs - onehot(sentence[-1], W.shape[0])[0]
    b -= eta*dldb
    # The gradient for column j of W is dldb * x[j], which is zero wherever
    # x[j] == 0. Updating only the non-zero columns gives the same result as
    # scanning every column, without a Python loop over all of them.
    active = np.flatnonzero(x)
    W[:, active] -= (eta*dldb)[:, None] * x[active]
    return W, b


In [195]:
# Dense parameters, all initialised to 1: W has one row per vocabulary entry
# and N * len(VOCAB) columns; b has one entry per vocabulary entry.
# NOTE(review): the recorded output of this cell is a MemoryError (25.8 GiB
# for shape (41590, 83180), float64), i.e. the dense W does not fit in memory
# at the full vocabulary size.
W = np.ones((len(VOCAB),len(VOCAB)*N))
b = np.ones(len(VOCAB))
# Train on growing prefixes of the first training sentence only; step() treats
# the last token of each prefix as the prediction target.
# NOTE(review): starting at N+3 means the shortest prefix already holds the N
# start markers plus three more tokens, so the first two tokens after the
# markers are never used as targets - confirm this is intended.
for i in range(N+3, len(padded_train[0])+1):
    W, b = step(padded_train[0][:i], W, b)

MemoryError: Unable to allocate 25.8 GiB for an array with shape (41590, 83180) and data type float64

In [193]:
# Display the current contents of the weight matrix W.
W

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       ...,
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [52]:
def _mean_sentence_scores(sentences):
    """Return one score per sentence: summed LM scores divided by its length.

    For every position i >= N the language model is queried with the N
    preceding token ids as context and the token at i as continuation; the
    results are summed and divided by len(sentence).
    NOTE(review): the divisor counts the N start markers, which are never
    scored themselves - confirm that len(sentence) (not len(sentence) - N)
    is the intended normaliser.
    """
    scores = []
    for sentence in sentences:
        total = sum(
            LM(sentence[i-N:i], [sentence[i]]) for i in range(N, len(sentence))
        )
        scores.append(total/len(sentence))
    return scores


# NOTE(review): `LM`, `padded_test_token` and `padded_test_uniform` are not
# defined anywhere in the visible notebook (it only builds `padded_test`), and
# the recorded output of this cell is "NameError: name 'LM' is not defined".
# They presumably come from an earlier kernel state and must be defined before
# this cell can run on a fresh kernel.
train_scores = _mean_sentence_scores(padded_train)
test_token_scores = _mean_sentence_scores(padded_test_token)
test_uniform_scores = _mean_sentence_scores(padded_test_uniform)

# Overlay the three score distributions as normalised histograms (100 bins).
_ = plt.hist(train_scores,100,density=True,alpha=0.75,label="Train")
_ = plt.hist(test_token_scores,100,density=True,alpha=0.75,label="Test Token")
_ = plt.hist(test_uniform_scores,100,density=True,alpha=0.75,label="Test Uniform")
plt.xlabel("Sentence score (summed LM scores / sentence length)")
plt.ylabel("Density")
plt.legend()
plt.show()

NameError: name 'LM' is not defined

In [46]:
def greedy_search_step(past):
    """Return the vocabulary word with the highest LM score after `past`.

    Parameters
    ----------
    past : list of str
        The words generated so far.

    Returns
    -------
    str
        The best-scoring word, or "" if the vocabulary is empty.
    """
    # Condition on the last N words (fewer if the history is shorter).
    # Slicing with past[N:] would instead drop the first N words and hand the
    # model an ever-growing context as the sentence gets longer.
    context = past[max(len(past) - N, 0):]
    best_word = ""
    # Start below every possible score so that a word is always selected,
    # even when no candidate scores above zero.
    best_score = float("-inf")
    # VOCAB is the vocabulary set built from the training data; the lowercase
    # name `vocab` is not defined in this notebook.
    for word in VOCAB:
        score = LM(context, [word])
        if score > best_score:
            best_score = score
            best_word = word
    return best_word

def random_search_step(past):
    """Sample the next word with probability proportional to its LM score.

    Parameters
    ----------
    past : list of str
        The words generated so far.

    Returns
    -------
    The sampled word, as returned by `np.random.choice`.

    Raises
    ------
    ValueError
        If the scores do not sum to a positive number, so no sampling
        distribution can be formed from them.
    """
    # Condition on the last N words (fewer if the history is shorter).
    # Slicing with past[N:] would instead drop the first N words.
    context = past[max(len(past) - N, 0):]
    # VOCAB is the vocabulary set built from the training data; the lowercase
    # name `vocab` is not defined in this notebook.
    words = list(VOCAB)
    scores = np.array([LM(context, [word]) for word in words], dtype=float)
    total = np.sum(scores)
    if not total > 0:
        # Dividing by a zero (or NaN) total would only produce NaN
        # probabilities and a less helpful error from np.random.choice.
        raise ValueError("LM scores do not sum to a positive value; cannot sample")
    probabilities = scores/total
    return np.random.choice(words, p = probabilities)

def search(seed, length, strat="greedy"):
    """Extend `seed` by up to `length` words using the chosen strategy.

    Generation stops early once the last word is the end marker "</s>".

    Parameters
    ----------
    seed : str
        Whitespace-separated starting words.
    length : int
        Maximum number of words to append.
    strat : str
        "greedy" (highest-scoring word each step) or "random" (sample each
        step).

    Returns
    -------
    str
        The seed words followed by the generated words, joined by spaces.

    Raises
    ------
    ValueError
        If `strat` is not one of the supported strategies.
    """
    # Reject unknown strategies up front; otherwise no branch below would
    # assign `prediction` and the loop would fail with an unbound variable.
    if strat not in ("greedy", "random"):
        raise ValueError("unknown search strategy: %r" % (strat,))

    sentence = seed.split()
    for _ in range(length):
        # `sentence` is empty when the seed is blank; guard the [-1] lookup.
        if sentence and sentence[-1] == "</s>":
            break
        if strat == "greedy":
            prediction = greedy_search_step(sentence)
        else:
            prediction = random_search_step(sentence)
        sentence.append(prediction)
    return " ".join(sentence)

In [58]:
# Generate up to `length` further words from the seed, once per strategy.
seed, length = "the little man had", 10
print(search(seed, length, strat="greedy"))
print(search(seed, length, strat="random"))

the little man had to be a little bit of a sudden , the
the little man had established toward greater complexity . </s>


In [59]:
# Same experiment with a different seed sentence.
seed, length = "the little woman had", 10
print(search(seed, length, strat="greedy"))
print(search(seed, length, strat="random"))

the little woman had to be a little bit of a sudden , the
the little woman had productivity go from 44,000 to 180,000 . '' </s>
