In [86]:
import numpy as np
import pandas as pd

from numpy import random as rd
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

# Function definitions

In [33]:
# We should train on a subset of the corpus, with 100 0000 tokens. Find how many lines this corresponds to

def find_n_lines(filename, encoding, n_tokens):
    
    line_counter = 0
    token_counter = 0

    with open(filename, encoding = encoding) as f:
        for line in f:

            line_counter += 1
            tokens = line.lower().split()

            for token in tokens:
                token_counter += 1

            if token_counter > n_tokens:
                break

    return line_counter

In [62]:
def count_word_frequencies(filename, encoding, n_lines):
    
    freqs = Counter()
    with open(filename, encoding = encoding) as f:
        for i, line in enumerate(f):
            
            tokens = line.lower().split()
            for token in tokens:
                freqs[token] += 1
                
            if i == n_lines:
                break
                
    return freqs

In [58]:
def create_integer_vocabulary(word_freqs, max_voc_size):
    
    """ 
    Create vocabulary where common words are matched to integers. 
    """
    
    word_list = []

    if len(word_freqs.most_common()) > max_voc_size:
        vocab = word_freqs.most_common()[0:max_voc_size]

    else:
        vocab = word_freqs.most_common()

    for i in range(len(vocab)):
        word_list.append(vocab[i][0])

    # Get pairs of elements    
    tmp = zip(word_list, range(1,max_voc_size+1))
    # Make pairs into a dictionary
    vocab = dict(tmp)

    # Create default dictionary - returns 0 if an undefined key is called
    vocab2 = defaultdict(int)
    vocab2.update(vocab)
    
    return vocab2

def find_batch_dimensions(batch_size, filename, ENCODING):
    """
    Find the length of the longest line in each batch.
    """
    
    counter = 0      # will end up being the number of lines in the document
    len_lines = []   # will contain maximum length of a line in each batch
    tmp_lines = []
    
    with open(filename, encoding=ENCODING) as f:
        for line in f:
            counter+=1
            tokens = line.lower().split()
            tmp_lines.append(len(tokens))

            if (counter % batch_size == 0):
                len_lines.append(max(tmp_lines))
                tmp_lines = []
                
        #This takes care of the last batch if number of lines is not an exact multiple of batch_size
        if (counter % batch_size != 0): 
            len_lines.append(max(tmp_lines)) # if at end of the file
            
    return counter, len_lines
    

def create_batches(batch_size, vocabulary, filename, ENCODING):
    """
    Splits the file into batches of a specified size, and transforms common words to integers.
    The batches are outputted in a numpy array padded with zeros. Words not in the vocabulary are set to -1.
    """
    
    counter, len_lines = find_batch_dimensions(batch_size, filename, ENCODING)
    
    with open(filename, encoding=ENCODING) as f:
        batches=[]
        batch_counter=0
        line_counter=0

        for line in f:
            #This creates a temporary array each time we start a new batch
            if line_counter % batch_size == 0:
                tmp_array=np.zeros(shape=(batch_size,len_lines[batch_counter])) #fill this temporary array

            tokens = line.lower().split()
            line_as_int = list(map(vocabulary.get, tokens))
            line_as_int = [-1 if x is None else x for x in line_as_int] # set None values to -1

            tmp_array[line_counter % batch_size,0:(len(line_as_int))]=line_as_int

            line_counter+=1 #when we done
            if line_counter % batch_size ==0:
                batches.append(tmp_array)
                batch_counter+=1

        # again this takes care of the final batch if number of lines is not multiple of batch_size
        if line_counter % batch_size != 0:
            tmp_array=tmp_array[0:(line_counter % batch_size),:]
            batches.append(tmp_array)
        
    return(counter, batches)

# 1: Write your own code for doing Gibbs sampling for LDA

In [67]:
beta = 0.1
alpha = 0.1
n_topics = 10
n_tokens = 10**5

# beta = 0.01
# alpha = 0.01
# n_topics = 50

filename = "books.txt" 
encoding = "ISO-8859-1"

# Find how many lines we need to read to get the desired number of tokens
n_docs = find_n_lines(filename, encoding, n_tokens)
print(n_lines)

# Count word frequencies in this subset of the file
word_frequencies = count_word_frequencies(filename, encoding, n_docs)
word_frequencies.most_common()[100:110]

582


[('been', 121),
 ('world', 120),
 ('best', 117),
 ('really', 116),
 ('work', 116),
 ('because', 115),
 ('even', 115),
 ('them', 114),
 ('these', 111),
 ('new', 106)]

In [119]:
# Create an integer vocabulary. Don't remove any words from the vocabulary.
voc_size = len(word_frequencies)
vocabulary = create_integer_vocabulary(word_frequencies, voc_size)

# Turn the document into batches
lines_in_doc, batches = create_batches(batch_size=n_docs, vocabulary = vocabulary, \
                                    filename = filename, ENCODING = encoding)

In [152]:
# Save only the first batch - this is what we'll analyse
int_matrix = batches[0].astype(int)
print(int_matrix.shape)

(582, 1602)


In [148]:
# Shouldn't these be the same? 
print(len(np.unique(int_matrix)))
print(voc_size)

11625
11628


In [134]:
def initialise_everything(n_docs, n_topics, voc_size):
    """
    Initialise some stuff!
    """

    # Number of times that we observe topic z in document d
    ndz = np.zeros((n_docs, n_topics))

    # Number of times that we observe word w in topic z
    nzw = np.zeros((n_topics, voc_size))

    # Counters for documents and topics
    nd = np.zeros(n_docs)
    nz = np.zeros(n_topics)

    # Create dictionary of topics
    topics = {}

    # iterate over documents 
    for d in range(n_docs):

        # i is the index of the word in the document
        # w is the numerical representation of the word
        for i, w in enumerate(int_matrix[d]):

            # Initialise with a random topic
            z = rd.randint(n_topics)
            topics[(d,i)] = z

            # Increase counters
            ndz[d, z] += 1
            nzw[z, w] += 1

            nd[d] += 1
            nz[z] += 1

    return topics, ndz, nzw, nd, nz

In [135]:
topics, ndz, nzw, nd, nz = initialise_everything(n_docs, n_topics, voc_size)

In [149]:
def cond_topic_prob(ndz, nzw, nz, nd, w, d, alpha, beta, n_topics):
    """
    Conditional probability of topics. Is this the same formula as in lecture notes?
    """

    left = (nzw[:,w] + beta) / (nz + beta * voc_size)
    right = (ndz[d,:] + alpha) / (nd[d] + alpha * n_topics)

    p_z = left * right
    p_z /= np.sum(p_z)
    
    return p_z

In [123]:
def loglikelihood()

(10, 11628)

In [151]:
p_z = cond_topic_prob(ndz, nzw, nz, nd, w, d, alpha, beta, n_topics)
p_z

array([0.10364167, 0.11112728, 0.10974766, 0.10035978, 0.09925849,
       0.09997278, 0.1036732 , 0.08001437, 0.10109396, 0.09111081])

In [53]:
# try ignoring the 20 most common words?
n_ignore = 20
tmp = word_frequencies.most_common()[0:n_ignore]
ignore_list = list(map(list, zip(*tmp)))[0]

vocab_size = len(word_frequencies) - n_ignore

# assign a random topic from 1:n_topics, for each word not on the ignore_list
# maybe save the key words for each line as well, to make it easier to step through

# I am the night -> night

Assess model performance in tables over the ten top words for the topics.

# Object oriented in Python

In [7]:
class Person:
    
    def __init__(self, name, age):
        self.name = name
        self.age = age
    
    def greeting(self):
        print("Hello, my name is " + self.name)

In [9]:
p1 = Person("Sara", 25)
p1.age

p1.greeting()

Hello, my name is Sara


# 2: Write your own code for doing Gibbs samling on Bigram LDA

as in H. M. Wallach: Topic modeling: beyond bag-of-words. ICML(2006) 977-984. http://dirichlet.net/pdf/wallach06topic.pdf

For corpus use the Amazon book reviews corpus that you also used in Assignment 1. You may have to use only a subset of the documents. A corpus of 100 000 tokens is sufficients size.

Run this for different hyperparameters. For LDA you can try α=β=0.1 and α=β=0.01. (A cross-validation search for optimal values will probably be too slow.) Run also for different numbers of topics, e.g. K=10 and K=50.

For the bigram model, see to that you use a larger hyperparameter value on the diagonal of the transition matrix over the topics. Since each document in this model has a transition matrix over topics rather than just a probability distribution, the number of topics cannot be as large as for LDA. Try K=5 and K=10.