In [1]:
import numpy as np
import pandas as pd
import nltk
import time
nltk.download('stopwords')

from numpy import random as rd
from scipy.special import gammaln
from nltk.corpus import stopwords
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Function definitions

## File related

In [2]:
def find_n_lines(filename, encoding, n_tokens):
    """
    Convert a token budget into a number of lines.

    The model is trained on a prefix of the corpus holding roughly `n_tokens`
    whitespace-separated tokens (e.g. 100 000). Lines are read from the top of
    `filename` until the running token total is strictly greater than
    `n_tokens`; the number of lines read so far is returned. If the whole file
    holds fewer tokens than that, the total number of lines is returned.
    """
    lines_read = 0
    tokens_seen = 0

    with open(filename, encoding=encoding) as handle:
        for text in handle:
            lines_read += 1
            tokens_seen += len(text.lower().split())

            # The line that crosses the budget is still included in the count.
            if tokens_seen > n_tokens:
                break

    return lines_read


def count_word_frequencies(filename, encoding, n_lines, ignore_list):
    """
    Count how often each token occurs at the top of a file.

    Lines are lower-cased and split on whitespace. Tokens contained in
    `ignore_list` are skipped; every other token is counted.

    Parameters
    ----------
    filename, encoding : passed to `open`.
    n_lines : index of the last line that is still counted (see note).
    ignore_list : iterable of tokens to leave out (e.g. stopwords).

    Returns
    -------
    collections.Counter mapping token -> number of occurrences.

    NOTE(review): the stop test runs after line index `n_lines` has been
    processed, so `n_lines + 1` lines are counted. This is kept as-is on
    purpose: the vocabulary built from these counts uses ids 1..V while the
    count matrices have V columns, so the extra line appears to be what keeps
    the highest id out of the training matrix. Confirm before changing it.
    """
    # Set membership is O(1); the stopword list is consulted for every token.
    ignored = frozenset(ignore_list)

    freqs = Counter()
    with open(filename, encoding=encoding) as f:
        for i, line in enumerate(f):
            freqs.update(tok for tok in line.lower().split() if tok not in ignored)

            if i == n_lines:
                break

    return freqs

In [3]:
def list_of_stopwords():
    """
    Build the list of tokens that are left out of the vocabulary.

    Returns the NLTK English stopword list followed by a handful of
    punctuation marks and contraction fragments ("n't", "'s", "'m").
    """
    punctuation_and_clitics = [",", ".", '"', "(", ")", "-", "'", "!", "?", ":", ";", "/", "n't", "'s", "'m"]

    ignore_words = stopwords.words('english')
    ignore_words.extend(punctuation_and_clitics)
    return ignore_words

## Batch related

In [4]:
def create_integer_vocabulary(word_freqs, max_voc_size):
    """
    Map the most frequent words to consecutive integer ids starting at 1.

    `word_freqs` is a Counter; at most `max_voc_size` of its most common words
    are kept, and the most common word receives id 1.

    The result is a `defaultdict(int)`: indexing with an unknown word
    (`vocab[word]`) yields 0 and inserts that key, whereas `vocab.get(word)`
    yields None and leaves the mapping untouched.
    """
    # One sort; slicing returns everything when there are fewer words than the cap.
    top_entries = word_freqs.most_common()[:max_voc_size]
    words = [word for word, _count in top_entries]
    ids = range(1, max_voc_size + 1)

    lookup = defaultdict(int)
    lookup.update(zip(words, ids))
    return lookup

def find_batch_dimensions(batch_size, filename, ENCODING):
    """
    Measure the width each batch needs.

    The file is read line by line and grouped into consecutive batches of
    `batch_size` lines. For every batch the token count of its longest line is
    recorded (tokens are whitespace-separated).

    Returns a pair `(n_lines, widths)`: the total number of lines in the file
    and the list of per-batch maximum line lengths. A final, incomplete batch
    gets its own entry.
    """
    widths = []
    n_lines = 0
    widest_in_batch = 0

    with open(filename, encoding=ENCODING) as handle:
        for text in handle:
            n_lines += 1
            widest_in_batch = max(widest_in_batch, len(text.lower().split()))

            # Batch complete: store its width and start measuring the next one.
            if n_lines % batch_size == 0:
                widths.append(widest_in_batch)
                widest_in_batch = 0

    # Lines left over after the last complete batch form a shorter final batch.
    if n_lines % batch_size != 0:
        widths.append(widest_in_batch)

    return n_lines, widths
    

def create_batches(batch_size, vocabulary, filename, ENCODING):
    """
    Turn a text file into a list of integer-coded batches.

    Every batch is a float numpy array with one row per line and as many
    columns as the longest line of that batch. Each token is replaced by
    `vocabulary.get(token)`; tokens for which that lookup gives None are
    stored as -1. Positions past the end of a line keep the initial value 0.
    The last batch is cut down to the number of lines it actually holds.

    Returns `(n_lines, batches)` where `n_lines` is the line count reported by
    `find_batch_dimensions`.
    """
    n_lines, widths = find_batch_dimensions(batch_size, filename, ENCODING)

    finished = []
    lines_done = 0

    with open(filename, encoding=ENCODING) as handle:
        for text in handle:
            row = lines_done % batch_size

            # First line of a batch: allocate a zero-filled array of the right width.
            if row == 0:
                current = np.zeros(shape=(batch_size, widths[len(finished)]))

            ids = []
            for token in text.lower().split():
                idx = vocabulary.get(token)
                ids.append(-1 if idx is None else idx)

            current[row, 0:len(ids)] = ids

            lines_done += 1
            if lines_done % batch_size == 0:
                finished.append(current)

    # A partially filled final batch is trimmed to its used rows.
    leftover = lines_done % batch_size
    if leftover != 0:
        finished.append(current[0:leftover, :])

    return (n_lines, finished)

def get_matrix(filename, encoding, n_tokens, ignore_words):
    """
    Prepare the training data for the sampler.

    Steps: translate the token budget `n_tokens` into a line count, count word
    frequencies (without `ignore_words`), give every counted word an integer
    id, and encode the file in batches of that many lines. Only the first
    batch is kept.

    Returns `(n_docs, matrix, vocabulary, voc_size)`:
    the number of lines used, the integer matrix of the first batch, the
    word -> id mapping and the number of distinct counted words.
    """
    # Lines needed to reach the token budget; each line is treated as a document.
    doc_count = find_n_lines(filename, encoding, n_tokens)

    counts = count_word_frequencies(filename, encoding, doc_count, ignore_words)

    # No frequency cut-off: the vocabulary cap equals the number of distinct words.
    n_words = len(counts)
    word_to_id = create_integer_vocabulary(counts, n_words)

    # With batch_size == doc_count the first batch is exactly the training subset.
    _, all_batches = create_batches(batch_size=doc_count, vocabulary=word_to_id,
                                    filename=filename, ENCODING=encoding)
    first_batch = all_batches[0].astype(int)

    return doc_count, first_batch, word_to_id, n_words

## LDA Related

In [5]:
def initialise_everything(n_docs, n_topics, voc_size, int_matrix):
    """
    Create the count tables for Gibbs sampling with random topic assignments.

    Every entry of the first `n_docs` rows of `int_matrix` is given a topic
    drawn uniformly from `range(n_topics)` and the counters are updated.

    Returns `(topics, ndz, nzw, nd, nz)`:
      topics : dict mapping (document, position) -> assigned topic
      ndz    : (n_docs, n_topics) array, tokens of document d in topic z
      nzw    : (n_topics, voc_size) array, occurrences of word w in topic z
      nd     : (n_docs,) array, tokens counted per document
      nz     : (n_topics,) array, tokens counted per topic

    Note: all entries of a row are counted, whatever their value. A value of
    -1 addresses the last column of `nzw` through numpy's negative indexing.
    """
    doc_topic = np.zeros((n_docs, n_topics))
    topic_word = np.zeros((n_topics, voc_size))
    doc_total = np.zeros(n_docs)
    topic_total = np.zeros(n_topics)
    assignment = {}

    for doc in range(n_docs):
        row = int_matrix[doc]
        for pos in range(len(row)):
            word = row[pos]

            # Uniform random starting topic for this position.
            topic = rd.randint(n_topics)
            assignment[(doc, pos)] = topic

            doc_topic[doc, topic] += 1
            topic_word[topic, word] += 1
            doc_total[doc] += 1
            topic_total[topic] += 1

    return assignment, doc_topic, topic_word, doc_total, topic_total

In [6]:
def cond_topic_prob(ndz, nzw, nz, nd, w, d, alpha, beta, n_topics):
    """
    Conditional distribution over topics for one token (collapsed Gibbs step).

    For every topic k the unnormalised weight is

        (nzw[k, w] + beta) / (nz[k] + beta * V)
        * (ndz[d, k] + alpha) / (nd[d] + alpha * n_topics)

    where V is the number of columns of `nzw`. The weights are normalised to
    sum to 1. The sampler calls this after removing the token's current
    assignment from the counts.

    Parameters
    ----------
    ndz, nzw, nz, nd : count arrays as produced by `initialise_everything`.
    w : column of `nzw` for the token's word.
    d : row of `ndz` for the token's document.
    alpha, beta : scalar Dirichlet hyperparameters.
    n_topics : number of topics.

    Returns
    -------
    1-D array of length `nzw.shape[0]` with the topic probabilities.
    """
    # Take the vocabulary size from the table itself rather than from a
    # notebook-level global, so the function works on any count matrix.
    n_words = nzw.shape[1]

    left = (nzw[:, w] + beta) / (nz + beta * n_words)
    right = (ndz[d, :] + alpha) / (nd[d] + alpha * n_topics)

    p_z = left * right
    p_z /= np.sum(p_z)

    return p_z

def log_multinomial_beta(alpha, K=None):
    """
    Logarithm of the multivariate Beta function.

    With `K=None`, `alpha` is treated as a vector and the result is
    sum(gammaln(alpha)) - gammaln(sum(alpha)).
    With `K` given, `alpha` is treated as a scalar repeated K times and the
    result is K * gammaln(alpha) - gammaln(K * alpha).
    """
    if K is None:
        # alpha is assumed to be a vector
        return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))
    else:
        # alpha is assumed to be a scalar
        return K * gammaln(alpha) - gammaln(K*alpha)

# This should increase as training progresses; it is printed after every sampling sweep.
def loglikelihood(n_topics, voc_size, alpha, beta, nzw, ndz):
    """
    Log-likelihood score of the current topic assignments.

    Sums, over topics, log B(nzw[z] + beta) - log B(beta) and, over documents,
    log B(ndz[d] + alpha) - log B(alpha), where B is the multivariate Beta
    function and the symmetric priors have `voc_size` and `n_topics`
    components respectively.

    Parameters
    ----------
    n_topics : number of topics (rows of `nzw`).
    voc_size : number of components of the symmetric word prior.
    alpha, beta : scalar Dirichlet hyperparameters.
    nzw : (n_topics, voc_size) topic-word counts.
    ndz : (n_docs, n_topics) document-topic counts.

    Returns
    -------
    float
    """
    likelihood = 0

    for z in range(n_topics):
        likelihood += log_multinomial_beta(nzw[z,:] + beta)
        likelihood -= log_multinomial_beta(beta, voc_size)

    # The number of documents comes from the table itself; the function no
    # longer depends on a notebook-level `n_docs` variable.
    for d in range(ndz.shape[0]):
        likelihood += log_multinomial_beta(ndz[d,:] + alpha)
        likelihood -= log_multinomial_beta(alpha, n_topics)

    return likelihood

In [7]:
def LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, max_iterations, alpha, beta):
    """
    Run collapsed Gibbs sampling for LDA on an integer-coded corpus.

    Starting from random topic assignments, `max_iterations` sweeps are made
    over every position of the first `n_docs` rows of `matrix`. For each
    position the current assignment is removed from the counts, a new topic is
    drawn from `cond_topic_prob`, and the counts are updated with it. After
    every sweep the iteration number and the log-likelihood are printed; the
    elapsed wall-clock time is printed at the end.

    Returns the (n_topics, voc_size) topic-word count array `nzw`.
    """
    t0 = time.time()
    assignment, ndz, nzw, nd, nz = initialise_everything(n_docs, n_topics, voc_size, matrix)

    for sweep in range(max_iterations):
        for doc in range(n_docs):
            for pos, word in enumerate(matrix[doc]):

                # Take the current assignment of this position out of the counts.
                old_topic = assignment[(doc, pos)]
                ndz[doc, old_topic] -= 1
                nzw[old_topic, word] -= 1
                nd[doc] -= 1
                nz[old_topic] -= 1

                # Draw a replacement topic given all other assignments.
                probs = cond_topic_prob(ndz, nzw, nz, nd, word, doc, alpha, beta, n_topics)
                new_topic = rd.multinomial(1, probs).argmax()

                # Put the position back with its new topic.
                ndz[doc, new_topic] += 1
                nzw[new_topic, word] += 1
                nd[doc] += 1
                nz[new_topic] += 1
                assignment[(doc, pos)] = new_topic

        print("Iteration", sweep)
        print("Likelihood", loglikelihood(n_topics, voc_size, alpha, beta, nzw, ndz))

    print("Elapsed time: ", time.time() - t0)

    return nzw

In [19]:
def show_words_by_topic(word_topic_prob, vocabulary, typical_len):
    """
    Print the most typical words of every topic.

    Parameters
    ----------
    word_topic_prob : 2-D array with one row per topic and one column per
        word id; larger values mean the word is more typical for the topic.
    vocabulary : mapping word -> integer id.
    typical_len : number of words to show per topic.

    For each topic the column indices are ordered by value. The two largest
    are skipped and the next `typical_len` are shown, highest first. Column 0
    and any id without a word in `vocabulary` are shown as an empty string, so
    every topic always contributes the same number of entries.

    NOTE(review): skipping the top two columns is inherited from the original
    slice; presumably those are the columns that collect padding (id 0) and
    unknown-word (-1) entries. Confirm against how the matrix is built.

    Prints an array of strings with one row per topic; returns None.
    """
    n_topics = word_topic_prob.shape[0]

    # Reverse lookup built once instead of scanning the vocabulary per id.
    # setdefault keeps the first word seen for an id, like the original scan.
    id_to_word = {}
    for word, idx in vocabulary.items():
        id_to_word.setdefault(idx, word)

    typical_words = []
    for i in range(n_topics):
        ranked = word_topic_prob[i, :].argsort()
        typical_ints = ranked[-typical_len-2:-2][::-1]

        for search_int in typical_ints:
            if search_int in (0, -1):
                typical_words.append("")
            else:
                # Unknown ids yield "" so the reshape below stays aligned.
                typical_words.append(id_to_word.get(search_int, ""))

    # Print the most common words in each topic
    typical_words = np.reshape(typical_words, [n_topics, -1])
    print(typical_words)


# 1: Write your own code for doing Gibbs sampling for LDA

In [9]:
# Corpus file (path relative to the notebook) and its text encoding.
filename = "books.txt" 
encoding = "ISO-8859-1"
# Token budget: lines are read from the top of the file until this many tokens are exceeded.
n_tokens = 8*10**4

# Tokens that are left out when the vocabulary is built.
ignore_words = list_of_stopwords()

# n_docs: number of lines used; matrix: integer-coded lines (one row per line);
# vocabulary: word -> id mapping; voc_size: number of distinct counted words.
n_docs, matrix, vocabulary, voc_size = get_matrix(filename, encoding, n_tokens, ignore_words)

In [10]:
# Number of words printed per topic by show_words_by_topic.
typical_len = 10
# Number of full sampling sweeps over the corpus in every run below.
max_iterations = 75

### 10 Topics, $\alpha = \beta = 0.1$

In [11]:
# Number of topics for this run.
n_topics = 10

# The sampler returns the topic-word count matrix (one row per topic);
# despite the variable name these are raw counts, not probabilities.
word_topic_prob_01_10 = LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, \
                                          max_iterations, alpha = 0.1, beta = 0.1)

Iteration 0
Likelihood -2351002.5387130915
Iteration 1
Likelihood -2335354.9581859065
Iteration 2
Likelihood -2323639.282311263
Iteration 3
Likelihood -2311577.561338965
Iteration 4
Likelihood -2296861.8465751046
Iteration 5
Likelihood -2276834.275514169
Iteration 6
Likelihood -2243067.8784760805
Iteration 7
Likelihood -2199891.9970693695
Iteration 8
Likelihood -2156739.2925651036
Iteration 9
Likelihood -2123050.5709214304
Iteration 10
Likelihood -2098328.7566538285
Iteration 11
Likelihood -2079706.214526914
Iteration 12
Likelihood -2065564.3258698783
Iteration 13
Likelihood -2052815.0407773168
Iteration 14
Likelihood -2041726.5628088245
Iteration 15
Likelihood -2028867.4095321808
Iteration 16
Likelihood -2017191.9350128223
Iteration 17
Likelihood -2005201.353564023
Iteration 18
Likelihood -1994140.9965495174
Iteration 19
Likelihood -1982265.8389743764
Iteration 20
Likelihood -1972151.617760897
Iteration 21
Likelihood -1961846.7944976867
Iteration 22
Likelihood -1952392.4721891806
Iter

In [20]:
# Print the typical words of each topic for the 10-topic, alpha = beta = 0.1 run.
show_words_by_topic(word_topic_prob_01_10, vocabulary, typical_len)

[['book' 'read' 'would' 'well' 'good' 'know' 'us' 'two' 'think' 'highly']
 ['read' 'one' 'books' 'people' 'world' 'man' 'works' 'time' 'new'
  'women']
 ['book' 'like' 'reading' 'would' 'new' 'much' 'lot' 'school' 'long'
  'ever']
 ['book' 'books' 'one' 'many' 'history' 'way' 'reading' 'life' 'people'
  'work']
 ['great' 'story' 'like' 'many' 'read' 'world' 'see' 'also' 'novel'
  'john']
 ['one' 'much' 'would' 'also' 'even' 'like' 'sam' 'get' 'time' 'could']
 ['great' 'reader' 'writing' 'love' 'must' 'different' 'information'
  'give' 'patterns' 'could']
 ['book' 'recommend' 'read' 'anyone' 'great' 'story' 'going'
  'interesting' 'information' 'would']
 ['book' 'one' 'good' 'first' 'really' 'best' 'time' 'find' 'work' 'get']
 ['life' 'war' 'new' 'family' 'us' '' 'without' 'theory' 'power'
  'american']]


### 50 Topics, $\alpha = \beta = 0.1$

In [21]:
# Number of topics for this run.
n_topics = 50

# The sampler returns the topic-word count matrix (one row per topic);
# despite the variable name these are raw counts, not probabilities.
word_topic_prob_01_50 = LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, \
                                          max_iterations, alpha = 0.1, beta = 0.1)

Iteration 0
Likelihood -3658563.936163828
Iteration 1
Likelihood -3608550.923346697
Iteration 2
Likelihood -3558537.080710233
Iteration 3
Likelihood -3508411.4883670104
Iteration 4
Likelihood -3449413.682229062
Iteration 5
Likelihood -3379673.1660486497
Iteration 6
Likelihood -3301858.0268891277
Iteration 7
Likelihood -3230342.4180519395
Iteration 8
Likelihood -3169399.4883624515
Iteration 9
Likelihood -3119861.807302928
Iteration 10
Likelihood -3077177.2423457718
Iteration 11
Likelihood -3035792.7022777093
Iteration 12
Likelihood -2998331.9405336394
Iteration 13
Likelihood -2963537.9445924326
Iteration 14
Likelihood -2929729.842392472
Iteration 15
Likelihood -2893592.647204069
Iteration 16
Likelihood -2859870.2313416135
Iteration 17
Likelihood -2825669.469561156
Iteration 18
Likelihood -2787576.176527688
Iteration 19
Likelihood -2750477.565235863
Iteration 20
Likelihood -2710736.847433333
Iteration 21
Likelihood -2672623.207860701
Iteration 22
Likelihood -2631332.23425144
Iteration 23

In [22]:
# Print the typical words of each topic for the 50-topic, alpha = beta = 0.1 run.
show_words_by_topic(word_topic_prob_01_50, vocabulary, typical_len)

[['monster' 'born' 'horrific' 'killed' 'fell' 'girls' 'memories' 'amelia'
  'telling' 'actual']
 ['cooke' 'biography' 'events' 'official' 'version' 'wolff' 'known'
  'greene' 'car' 'death']
 ['edition' 'thorough' 'girlfriend' 'nature' 'exactly' 'medieval' 'vast'
  'eugenics' 'exhaustive' 'analyzed']
 ['soul' 'human' 'xxiii' 'later' 'thomas' 'care' 'ladder' 'window'
  'monastic' 'grow']
 ['book' 'well' 'first' 'characters' 'good' 'story' 'read' 'writing'
  'enjoyed' 'amazing']
 ['politics' 'lbj' 'loved' 'mets' 'scientific' 'myron' 'bolitar' 'kathy'
  'years' 'career']
 ['one' 'like' 'would' 'also' 'read' 'many' 'people' 'life' 'much' 'time']
 ['energy' 'michelle' 'vital' 'mama' 'vampirism' 'age' 'peter' 'battle'
  'explosive' 'hearing']
 ['arguments' 'department' 'grisham' 'rhetorician' 'thousands' 'original'
  'creator' 'zen' 'motorcycle' 'relationship']
 ['reporter' 'reviewer' '...' 'universe' 'fox' 'mop' 'diverting'
  'plodding' 'time' 'background']
 ['reading' 'would' 'without' 'war

['horse' 'soul' 'path' 'eve' 'experience' 'ladder' 'monastic' 'plains'
  'carlin' 'mystical'] this topic seems to relate to fantasy or something?

['discworld' 'equal' 'rites' 'magic' 'bernard' 'opinions' 'witches'
  'aquinas' 'franciscan' 'become'] Terry Pratchett!

### 10 Topics, $\alpha = \beta = 0.01$

In [23]:
# Number of topics for this run.
n_topics = 10

# Same corpus and sweep count as before, with smaller Dirichlet hyperparameters.
# The result is the topic-word count matrix (raw counts, one row per topic).
word_topic_prob_001_10 = LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, max_iterations, alpha = 0.01, beta = 0.01)

Iteration 0
Likelihood -2343457.870990367
Iteration 1
Likelihood -2320749.8681018814
Iteration 2
Likelihood -2304569.705220574
Iteration 3
Likelihood -2290597.510808511
Iteration 4
Likelihood -2276621.381299429
Iteration 5
Likelihood -2260341.027312455
Iteration 6
Likelihood -2235869.9136667443
Iteration 7
Likelihood -2202960.071286286
Iteration 8
Likelihood -2165050.2412849767
Iteration 9
Likelihood -2129827.7747944077
Iteration 10
Likelihood -2101752.4814116503
Iteration 11
Likelihood -2080085.3136327907
Iteration 12
Likelihood -2063487.1144998441
Iteration 13
Likelihood -2045536.251464159
Iteration 14
Likelihood -2031064.2508593616
Iteration 15
Likelihood -2016179.4829783826
Iteration 16
Likelihood -2000026.3275540767
Iteration 17
Likelihood -1985799.924806189
Iteration 18
Likelihood -1974548.4329042924
Iteration 19
Likelihood -1960551.3750523948
Iteration 20
Likelihood -1949357.0448735512
Iteration 21
Likelihood -1937563.1301793077
Iteration 22
Likelihood -1923682.851831316
Iterati

In [24]:
# Print the typical words of each topic for the 10-topic, alpha = beta = 0.01 run.
show_words_by_topic(word_topic_prob_001_10, vocabulary, typical_len)

[['book' 'good' 'best' 'author' 'new' 'reader' 'every' 'well' 'like'
  'excellent']
 ['book' 'would' 'much' 'even' 'work' 'history' 'two' 'reading' 'like'
  'novel']
 ['work' 'love' 'first' 'wonderful' 'easy' 'real' 'lot' 'use' 'amazing'
  'patterns']
 ['one' 'book' 'characters' 'time' 'world' 'best' 'like' 'stories'
  'could' 'although']
 ['book' 'good' 'pages' 'interesting' 'woman' 'high' 'recommend' 'long'
  'still' 'stories']
 ['book' 'read' 'great' 'many' 'must' 'recommend' 'better' 'also'
  'series' 'students']
 ['books' 'time' 'readers' 'fiction' 'children' 'reader' 'need' 'says'
  'novels' 'american']
 ['book' 'written' 'information' 'world' 'worth' 'highly' 'novel'
  'anyone' 'page' 'complete']
 ['book' 'life' 'people' 'us' 'story' 'also' 'one' 'read' 'first' 'many']
 ['book' 'great' 'would' 'reading' 'years' 'way' 'well' 'writing' 'works'
  'time']]


### 50 Topics, $\alpha = \beta = 0.01$

In [25]:
# Number of topics for this run.
n_topics = 50

# Same corpus and sweep count as before, with smaller Dirichlet hyperparameters.
# The result is the topic-word count matrix (raw counts, one row per topic).
word_topic_prob_001_50 = LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, max_iterations, alpha = 0.01, beta = 0.01)

Iteration 0
Likelihood -3575222.7566558477
Iteration 1
Likelihood -3511858.157047361
Iteration 2
Likelihood -3453380.977278283
Iteration 3
Likelihood -3396604.8616934363
Iteration 4
Likelihood -3330508.159939607
Iteration 5
Likelihood -3249983.8052112306
Iteration 6
Likelihood -3174941.4953452684
Iteration 7
Likelihood -3108042.3533887807
Iteration 8
Likelihood -3052397.1337047815
Iteration 9
Likelihood -3003682.1830039476
Iteration 10
Likelihood -2959216.6975951935
Iteration 11
Likelihood -2920272.050352145
Iteration 12
Likelihood -2881800.5491413083
Iteration 13
Likelihood -2848358.745659724
Iteration 14
Likelihood -2816436.5785797634
Iteration 15
Likelihood -2786236.584752912
Iteration 16
Likelihood -2757118.728225909
Iteration 17
Likelihood -2726209.2886329475
Iteration 18
Likelihood -2695776.095584607
Iteration 19
Likelihood -2666513.283767589
Iteration 20
Likelihood -2639867.317621984
Iteration 21
Likelihood -2615079.749422195
Iteration 22
Likelihood -2591513.5442678286
Iteration

In [26]:
# Print the typical words of each topic for the 50-topic, alpha = beta = 0.01 run.
show_words_by_topic(word_topic_prob_001_50, vocabulary, typical_len)

[['great' 'much' 'way' 'must' 'long' 'series' 'esk' 'put' 'want' 'could']
 ['story' 'books' 'read' 'first' 'daughter' 'us' 'period' 'however'
  'times' 'writer']
 ['one' 'also' 'give' 'first' 'review' 'knowledge' 'need' 'business'
  'serious' 'bring']
 ['history' 'students' 'c' 'second' 'years' 'journey' 'student' 'ago'
  'long' 'fantastic']
 ['book' 'war' 'author' 'getting' 'later' 'third' 'particularly' 'though'
  'example' 'library']
 ['things' 'helped' 'days' 'practical' '&' 'picture' 'understand' 'liked'
  'plan' 'lessons']
 ['one' 'writing' 'could' 'written' 'women' 'recommended' 'recommend'
  'patterns' 'school' 'would']
 ['excellent' 'highly' 'different' 'anyone' 'tale' 'works' 'kind' 'angel'
  'classic' 'good']
 ['people' 'read' 'really' 'experience' 'rest' 'shows' 'society'
  'written' 'myth' 'engineers']
 ['gibbon' 'work' 'could' 'mother' 'going' 'reading' 'fact' 'time' 'new'
  'perhaps']
 ['children' 'life' 'human' 'freddie' 'experience' 'helga' 'soul' 'care'
  'spiritual' 

### Save results!

In [None]:
# Write each topic-word count matrix to its own text file.
#
# str(array) abbreviates large arrays with "...", which silently drops most of
# the counts. Raising the summarisation threshold to the array size keeps the
# same bracketed text layout while writing every entry. The `with` block
# guarantees the file is closed even if writing fails.
for path, counts in [
    ("nzw_01_10.txt", word_topic_prob_01_10),
    ("nzw_001_10.txt", word_topic_prob_001_10),
    ("nzw_01_50.txt", word_topic_prob_01_50),
    ("nzw_001_50.txt", word_topic_prob_001_50),
]:
    with open(path, "w") as file:
        file.write(np.array2string(counts, threshold=counts.size))

In [32]:
word_topic_prob_001_50[0, 0]

# One CSV per run, named after its hyperparameters and topic count
# (nzw_<alpha/beta>_<n_topics>.csv). Previously three of the four calls wrote
# to "nzw_001_50.csv", so only the last matrix survived.
np.savetxt("nzw_01_10.csv", word_topic_prob_01_10, delimiter=",")
np.savetxt("nzw_001_10.csv", word_topic_prob_001_10, delimiter=",")
np.savetxt("nzw_01_50.csv", word_topic_prob_01_50, delimiter=",")
np.savetxt("nzw_001_50.csv", word_topic_prob_001_50, delimiter=",")

# Object-oriented Python - restructuring the code above into classes would be less messy

In [None]:
class Person:
    """Minimal example class: a person with a name and an age."""

    def __init__(self, name, age):
        """Store `name` and `age` on the instance."""
        self.name, self.age = name, age

    def greeting(self):
        """Print a one-line greeting that contains the person's name."""
        message = "Hello, my name is " + self.name
        print(message)

In [None]:
# Create an example instance and read one of its attributes.
p1 = Person("Sara", 25)
p1.age

# Prints "Hello, my name is " followed by the instance's name.
p1.greeting()

# 2: Write your own code for doing Gibbs sampling on Bigram LDA

as in H. M. Wallach: Topic modeling: beyond bag-of-words. ICML(2006) 977-984. http://dirichlet.net/pdf/wallach06topic.pdf

For the corpus, use the Amazon book reviews corpus that you also used in Assignment 1. You may have to use only a subset of the documents. A corpus of 100 000 tokens is a sufficient size.

Run this for different hyperparameters. For LDA you can try α=β=0.1 and α=β=0.01. (A cross-validation search for optimal values will probably be too slow.) Run also for different numbers of topics, e.g. K=10 and K=50.

For the bigram model, make sure that you use a larger hyperparameter value on the diagonal of the transition matrix over the topics. Since each document in this model has a transition matrix over topics rather than just a probability distribution, the number of topics cannot be as large as for LDA. Try K=5 and K=10.