In [1]:
import numpy as np
import pandas as pd
import nltk
import time
nltk.download('stopwords')

from numpy import random as rd
from scipy.special import gammaln
from nltk.corpus import stopwords
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Function definitions

## File related

In [2]:
def find_n_lines(filename, encoding, n_tokens):
    """
    We should train on a subset of the corpus, with 100 0000 tokens. Find how many lines this corresponds to.
    """
    
    line_counter = 0
    token_counter = 0

    with open(filename, encoding = encoding) as f:
        for line in f:

            line_counter += 1
            tokens = line.lower().split()

            for token in tokens:
                token_counter += 1

            if token_counter > n_tokens:
                break

    return line_counter


def count_word_frequencies(filename, encoding, n_lines, ignore_list):
    
    freqs = Counter()
    with open(filename, encoding = encoding) as f:
        for i, line in enumerate(f):
            
            tokens = line.lower().split()
            for token in tokens:
                if token not in ignore_list:
                    freqs[token] += 1
                
            if i == n_lines:
                break
                
    return freqs

In [98]:
def list_of_stopwords():
    # Ignore all stopwords in the text!
    ignore_words = stopwords.words('english')
    also_ignore = [",", ".", '"', "(", ")", "-", "'", "!", "?", ":", ";", "/", "n't", "'s", "'m"]

    for item in also_ignore:
        ignore_words.append(item)
        
    return ignore_words

## Batch related

In [3]:
def create_integer_vocabulary(word_freqs, max_voc_size):
    
    """ 
    Create vocabulary where common words are matched to integers. 
    """
    
    word_list = []

    if len(word_freqs.most_common()) > max_voc_size:
        vocab = word_freqs.most_common()[0:max_voc_size]

    else:
        vocab = word_freqs.most_common()

    for i in range(len(vocab)):
        word_list.append(vocab[i][0])

    # Get pairs of elements    
    tmp = zip(word_list, range(1,max_voc_size+1))
    # Make pairs into a dictionary
    vocab = dict(tmp)

    # Create default dictionary - returns 0 if an undefined key is called
    vocab2 = defaultdict(int)
    vocab2.update(vocab)
    
    return vocab2

def find_batch_dimensions(batch_size, filename, ENCODING):
    """
    Find the length of the longest line in each batch.
    """
    
    counter = 0      # will end up being the number of lines in the document
    len_lines = []   # will contain maximum length of a line in each batch
    tmp_lines = []
    
    with open(filename, encoding=ENCODING) as f:
        for line in f:
            counter+=1
            tokens = line.lower().split()
            tmp_lines.append(len(tokens))

            if (counter % batch_size == 0):
                len_lines.append(max(tmp_lines))
                tmp_lines = []
                
        #This takes care of the last batch if number of lines is not an exact multiple of batch_size
        if (counter % batch_size != 0): 
            len_lines.append(max(tmp_lines)) # if at end of the file
            
    return counter, len_lines
    

def create_batches(batch_size, vocabulary, filename, ENCODING):
    """
    Splits the file into batches of a specified size, and transforms common words to integers.
    The batches are outputted in a numpy array padded with zeros. Words not in the vocabulary are set to -1.
    """
    
    counter, len_lines = find_batch_dimensions(batch_size, filename, ENCODING)
    
    with open(filename, encoding=ENCODING) as f:
        batches=[]
        batch_counter=0
        line_counter=0

        for line in f:
            #This creates a temporary array each time we start a new batch
            if line_counter % batch_size == 0:
                tmp_array=np.zeros(shape=(batch_size,len_lines[batch_counter])) #fill this temporary array

            tokens = line.lower().split()
            line_as_int = list(map(vocabulary.get, tokens))
            line_as_int = [-1 if x is None else x for x in line_as_int] # set None values to -1

            tmp_array[line_counter % batch_size,0:(len(line_as_int))]=line_as_int

            line_counter+=1 #when we done
            if line_counter % batch_size ==0:
                batches.append(tmp_array)
                batch_counter+=1

        # again this takes care of the final batch if number of lines is not multiple of batch_size
        if line_counter % batch_size != 0:
            tmp_array=tmp_array[0:(line_counter % batch_size),:]
            batches.append(tmp_array)
        
    return(counter, batches)

def get_matrix(filename, encoding, n_tokens, ignore_words):
    # Find how many lines we need to read to get the desired number of tokens
    n_docs = find_n_lines(filename, encoding, n_tokens)

    # Count word frequencies in this subset of the file
    word_frequencies = count_word_frequencies(filename, encoding, n_docs, ignore_words)

    # Create an integer vocabulary. Don't remove any words from the vocabulary.
    voc_size = len(word_frequencies)
    vocabulary = create_integer_vocabulary(word_frequencies, voc_size)

    # Turn the document into batches
    lines, batches = create_batches(batch_size=n_docs, vocabulary = vocabulary, filename = filename, ENCODING = encoding)

    # Save only the first batch - this is what we'll analyse
    matrix = batches[0].astype(int)
    return n_docs, matrix, vocabulary, voc_size

## LDA Related

In [4]:
def initialise_everything(n_docs, n_topics, voc_size, int_matrix):
    """
    Initialise some stuff!
    """

    # Number of times that we observe topic z in document d
    ndz = np.zeros((n_docs, n_topics))

    # Number of times that we observe word w in topic z
    nzw = np.zeros((n_topics, voc_size))

    # Counters for documents and topics
    nd = np.zeros(n_docs)
    nz = np.zeros(n_topics)

    # Create dictionary of topics
    topics = {}

    # iterate over documents 
    for d in range(n_docs):

        # i is the index of the word in the document
        # w is the numerical representation of the word
        for i, w in enumerate(int_matrix[d]):

            # Initialise with a random topic
            z = rd.randint(n_topics)
            topics[(d,i)] = z

            # Increase counters
            ndz[d, z] += 1
            nzw[z, w] += 1

            nd[d] += 1
            nz[z] += 1

    return topics, ndz, nzw, nd, nz

In [5]:
def cond_topic_prob(ndz, nzw, nz, nd, w, d, alpha, beta, n_topics):
    """
    Conditional probability of topics. Is this the same formula as in lecture notes?
    """

    left = (nzw[:,w] + beta) / (nz + beta * voc_size)
    right = (ndz[d,:] + alpha) / (nd[d] + alpha * n_topics)

    p_z = left * right
    p_z /= np.sum(p_z)
    
    return p_z

def log_multinomial_beta(alpha, K=None):

    if K is None:
        # alpha is assumed to be a vector
        return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))
    else:
        # alpha is assumed to be a scalar
        return K * gammaln(alpha) - gammaln(K*alpha)

# This should increase as training progresses, show it every few training iterations (?)
def loglikelihood(n_topics, voc_size, alpha, beta, nzw, ndz):
    likelihood = 0
    
    for z in range(n_topics):
        likelihood += log_multinomial_beta(nzw[z,:] + beta)
        likelihood -= log_multinomial_beta(beta, voc_size)
        
    for d in range(n_docs):
        likelihood += log_multinomial_beta(ndz[d,:] + alpha)
        likelihood -= log_multinomial_beta(alpha, n_topics)
        
    return likelihood

In [6]:
def LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, max_iterations, alpha, beta):

    start_time = time.time()
    topics, ndz, nzw, nd, nz = initialise_everything(n_docs, n_topics, voc_size, matrix)

    for i in range(max_iterations):
        for d in range(n_docs):
             for j, w in enumerate(matrix[d]):

                    z = topics[(d, j)]
                    ndz[d, z] -= 1
                    nzw[z, w] -= 1
                    nd[d] -= 1
                    nz[z] -= 1

                    p_z = cond_topic_prob(ndz, nzw, nz, nd, w, d, alpha, beta, n_topics)
                    z = rd.multinomial(1, p_z).argmax()

                    ndz[d,z] += 1
                    nzw[z,w] += 1
                    nd[d] += 1
                    nz[z] += 1
                    topics[(d, j)] = z

        print("Iteration", i)
        print("Likelihood", loglikelihood(n_topics, voc_size, alpha, beta, nzw, ndz))

    elapsed_time = time.time() - start_time
    print("Elapsed time: ", elapsed_time)
    
    return nzw

In [82]:
def show_words_by_topic(word_topic_prob, vocabulary, typical_len):
    
    n_topics = word_topic_prob.shape[0]
    typical_words = []

    for i in range(n_topics):
        arr = word_topic_prob[i,:]
        typical_ints = arr.argsort()[-typical_len-2:-2][::-1]   # there's some funny business with the last word in vocab
        #print(typical_ints)

        for search_int in typical_ints:
            for k, v in vocabulary.items(): 
                if v == search_int:
                    typical_words.append(k)
                    break

    # Print the most common words in each topic
    typical_words = np.reshape(typical_words, [n_topics, -1])
    print(typical_words)


# 1: Write your own code for doing Gibbs sampling for LDA

In [9]:
filename = "books.txt" 
encoding = "ISO-8859-1"
n_tokens = 10**5

ignore_words = list_of_stopwords()

n_docs, matrix, vocabulary, voc_size = get_matrix(filename, encoding, n_tokens, ignore_words)

In [47]:
typical_len = 10
max_iterations = 50

### 10 Topics, $\alpha = \beta = 0.1$

In [48]:
n_topics = 10

word_topic_prob_01_10 = LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, \
                                          max_iterations, alpha = 0.1, beta = 0.1)

Iteration 0
Likelihood -2959461.7915741513
Iteration 1
Likelihood -2940257.311157539
Iteration 2
Likelihood -2925523.047138514
Iteration 3
Likelihood -2911869.3406520733
Iteration 4
Likelihood -2897729.3657757165
Iteration 5
Likelihood -2882917.9674948207
Iteration 6
Likelihood -2865786.6832138174
Iteration 7
Likelihood -2843921.32222889
Iteration 8
Likelihood -2811810.7728373394
Iteration 9
Likelihood -2771480.3365318426
Iteration 10
Likelihood -2725559.3087392207
Iteration 11
Likelihood -2676803.521659355
Iteration 12
Likelihood -2634905.3646270726
Iteration 13
Likelihood -2599352.1737921517
Iteration 14
Likelihood -2573341.203122192
Iteration 15
Likelihood -2553125.8285477287
Iteration 16
Likelihood -2536621.514484279
Iteration 17
Likelihood -2521959.3685467397
Iteration 18
Likelihood -2510125.853649458
Iteration 19
Likelihood -2496498.1699143606
Iteration 20
Likelihood -2480397.647884874
Iteration 21
Likelihood -2464348.1137015023
Iteration 22
Likelihood -2448977.7504874323
Iterati

ValueError: cannot reshape array of size 101 into shape (10,newaxis)

In [83]:
show_words_by_topic(word_topic_prob_01_10, vocabulary, typical_len)

[['read' 'books' 'really' 'one' 'best' 'could' 'help' 'ever' 'got'
  'characters']
 ['great' 'read' 'story' 'would' 'like' 'information' 'find' 'little'
  'problems' 'dr.']
 ['book' 'one' 'like' 'life' 'us' 'also' 'best' 'two' 'new' 'shows']
 ['years' 'war' 'first' 'rather' 'two' 'author' 'books' 'also' 'really'
  'children']
 ['book' 'good' 'well' 'reading' 'writing' 'information' 'stories'
  'never' 'world' 'used']
 ['book' 'people' 'many' 'time' 'real' 'recommend' 'history' 'see' 'work'
  'interested']
 ['book' 'read' 'time' 'every' 'found' 'way' 'written' 'lot' 'understand'
  'interesting']
 ['book' 'life' 'one' 'way' 'story' 'novel' 'many' 'could' 'characters'
  'much']
 ['fghkfty' 'one' 'books' 'read' 'would' 'reading' 'work' 'first' 'also'
  'like']
 ['book' 'read' 'world' 'highly' 'would' 'easy' 'love' 'better' 'could'
  'amazing']]


fghkfty isn't supposed to be in this list, what happened there?? Maybe run it all again...

### 50 Topics, $\alpha = \beta = 0.1$

In [84]:
n_topics = 50

word_topic_prob_01_50 = LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, \
                                          max_iterations, alpha = 0.1, beta = 0.1)

Iteration 0
Likelihood -4599672.834983534
Iteration 1
Likelihood -4533899.245269852
Iteration 2
Likelihood -4471492.79144917
Iteration 3
Likelihood -4408938.794204953
Iteration 4
Likelihood -4341656.914989154
Iteration 5
Likelihood -4267699.248324575
Iteration 6
Likelihood -4178684.4947743546
Iteration 7
Likelihood -4083436.6055780347
Iteration 8
Likelihood -4002468.532392354
Iteration 9
Likelihood -3933130.722976411
Iteration 10
Likelihood -3875114.0012176353
Iteration 11
Likelihood -3826478.829392011
Iteration 12
Likelihood -3778822.780619539
Iteration 13
Likelihood -3737676.9400537093
Iteration 14
Likelihood -3698269.6420583865
Iteration 15
Likelihood -3658807.7096939767
Iteration 16
Likelihood -3620786.80502631
Iteration 17
Likelihood -3580932.6628963803
Iteration 18
Likelihood -3542770.410746256
Iteration 19
Likelihood -3501700.8282556348
Iteration 20
Likelihood -3459107.692953136
Iteration 21
Likelihood -3420942.350231622
Iteration 22
Likelihood -3378689.3712494234
Iteration 23
L

In [85]:
show_words_by_topic(word_topic_prob_01_50, vocabulary, typical_len)

[['book' 'time' 'life' 'could' 'read' 'story' 'way' 'family' 'characters'
  'really']
 ['book' 'many' 'good' 'first' 'make' 'much' 'story' 'enough' 'something'
  'live']
 ['book' 'history' 'life' 'us' 'reading' 'work' 'daughter' 'good' 'found'
  'instead']
 ['africa' 'applied' 'broad' 'engineer' 'scope' 'outline' 'solve' 'topic'
  'proofs' 'hunter-gault']
 ['would' 'reader' 'one' 'man' 'enough' 'page' 'players' 'big' 'high'
  'good']
 ['little' 'way' 'information' 'reader' 'well' 'another' 'know' 'worth'
  'interesting' 'need']
 ['read' 'years' 'never' 'see' 'john' 'author' 'really' 'change' 'great'
  'many']
 ['hunt' 'corps' 'journey' 'wind' 'across' 'lovable' 'river' 'still'
  'difficult' 'hardships']
 ['well' 'would' 'like' 'great' 'work' 'world' 'even' 'highly' 'star'
  'part']
 ['stories' 'really' 'time' 'could' 'reading' 'human' 'real' 'fiction'
  'give' 'especially']
 ['french' 'de' 'psychic' 'michelle' 'un' 'energy' 'vital' 'je'
  'birmingham' 'esp\x1ace']
 ['giraut' 'coelho' '

['horse' 'soul' 'path' 'eve' 'experience' 'ladder' 'monastic' 'plains'
  'carlin' 'mystical'] this topic seems to relate to fantasy or something?

[discworld' 'equal' 'rites' 'magic' 'bernard' 'opinions' 'witches'
  'aquinas' 'franciscan' 'become'] Terry Pratchett!

### 10 Topics, $\alpha = \beta = 0.01$

In [87]:
n_topics = 10

word_topic_prob_001_10 = LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, max_iterations, alpha = 0.01, beta = 0.01)

Iteration 0
Likelihood -2952537.0491999513
Iteration 1
Likelihood -2923704.5395483505
Iteration 2
Likelihood -2903509.1318901978
Iteration 3
Likelihood -2887078.607217297
Iteration 4
Likelihood -2871145.6656046216
Iteration 5
Likelihood -2854207.3350476045
Iteration 6
Likelihood -2833563.0552205043
Iteration 7
Likelihood -2807690.8204889027
Iteration 8
Likelihood -2770627.7674327097
Iteration 9
Likelihood -2725262.5213922216
Iteration 10
Likelihood -2681391.6937770755
Iteration 11
Likelihood -2646459.24549251
Iteration 12
Likelihood -2619824.6409050794
Iteration 13
Likelihood -2597353.2072432223
Iteration 14
Likelihood -2578861.594342028
Iteration 15
Likelihood -2561858.069673145
Iteration 16
Likelihood -2544587.916673765
Iteration 17
Likelihood -2528977.891855641
Iteration 18
Likelihood -2514858.696767924
Iteration 19
Likelihood -2501015.1988676437
Iteration 20
Likelihood -2485629.7734766654
Iteration 21
Likelihood -2470953.114924092
Iteration 22
Likelihood -2456042.203415541
Iteratio

In [88]:
show_words_by_topic(word_topic_prob_001_10, vocabulary, typical_len)

[['story' 'best' 'books' 'make' 'reading' 'used' 'good' 'problems'
  'written' 'get']
 ['fghkfty' 'many' 'book' 'like' 'also' 'story' 'people' 'time' 'work'
  'could']
 ['book' 'good' 'reading' 'would' 'really' 'one' 'never' 'anyone'
  'author' 'wonderful']
 ['love' 'world' 'well' 'first' 'like' 'recommend' 'life' 'really' 'find'
  'years']
 ['read' 'one' 'book' 'time' 'series' 'pages' 'women' 'every' 'life'
  'see']
 ['information' 'edition' 'real' 'lot' 'know' 'worth' 'care' 'loved'
  'review' 'follow']
 ['book' 'great' 'also' 'way' 'recommend' 'like' 'think' 'dr.' 'new'
  'start']
 ['book' 'read' 'writing' 'great' 'better' 'war' 'recipes' 'learn' 'high'
  'written']
 ['book' 'would' 'life' 'books' 'novel' 'much' 'even' 'way' 'read'
  'years']
 ['help' 'books' 'well' 'could' 'useful' 'interesting' 'much' 'familiar'
  'people' 'helpful']]


### 50 Topics, $\alpha = \beta = 0.01$

In [89]:
n_topics = 50

word_topic_prob_001_50 = LDA_Gibbs_Sampler(matrix, voc_size, n_docs, n_topics, max_iterations, alpha = 0.01, beta = 0.01)

Iteration 0
Likelihood -4509457.424745279
Iteration 1
Likelihood -4428690.131440524
Iteration 2
Likelihood -4358092.23837603
Iteration 3
Likelihood -4287884.735515198
Iteration 4
Likelihood -4209701.015282318
Iteration 5
Likelihood -4115628.21511205
Iteration 6
Likelihood -4016172.8163928203
Iteration 7
Likelihood -3928913.140743313
Iteration 8
Likelihood -3858484.7285458264
Iteration 9
Likelihood -3797577.140734031
Iteration 10
Likelihood -3745444.2266264167
Iteration 11
Likelihood -3696176.0961150792
Iteration 12
Likelihood -3648988.5283368076
Iteration 13
Likelihood -3607090.3600450032
Iteration 14
Likelihood -3566588.100620899


KeyboardInterrupt: 

In [90]:
show_words_by_topic(word_topic_prob_001_50, vocabulary, typical_len)

NameError: name 'word_topic_prob_001_50' is not defined

### Save results!

In [93]:
file = open("nzw_01_10.txt","w") 
file.write(str(word_topic_prob_01_10))
file.close()

file = open("nzw_001_10.txt","w") 
file.write(str(word_topic_prob_001_10))
file.close()

file = open("nzw_01_50.txt","w") 
file.write(str(word_topic_prob_01_50))
file.close()

file = open("nzw_001_50.txt","w") 
file.write(str(word_topic_prob_001_50))
file.close()

NameError: name 'word_topic_prob_001_50' is not defined

# Object oriented in Python - would be less messy to do this

In [None]:
class Person:
    
    def __init__(self, name, age):
        self.name = name
        self.age = age
    
    def greeting(self):
        print("Hello, my name is " + self.name)

In [None]:
p1 = Person("Sara", 25)
p1.age

p1.greeting()

# 2: Write your own code for doing Gibbs samling on Bigram LDA

as in H. M. Wallach: Topic modeling: beyond bag-of-words. ICML(2006) 977-984. http://dirichlet.net/pdf/wallach06topic.pdf

For corpus use the Amazon book reviews corpus that you also used in Assignment 1. You may have to use only a subset of the documents. A corpus of 100 000 tokens is sufficients size.

Run this for different hyperparameters. For LDA you can try α=β=0.1 and α=β=0.01. (A cross-validation search for optimal values will probably be too slow.) Run also for different numbers of topics, e.g. K=10 and K=50.

For the bigram model, see to that you use a larger hyperparameter value on the diagonal of the transition matrix over the topics. Since each document in this model has a transition matrix over topics rather than just a probability distribution, the number of topics cannot be as large as for LDA. Try K=5 and K=10.