In [23]:
## Assignment 2
## NLP
## Adnan Fazlinovic & Sara Nordin Hällgren

In [24]:
import numpy as np
import pandas as pd
import spacy
import string
nlp = spacy.load('en_core_web_sm')
import math

import time
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

# Import stopwords
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [25]:
# Define stopwords list: every punctuation character plus NLTK's English stopwords
stop = list(string.punctuation)
stop.extend(stopwords.words('english'))

# Add a few more stopwords which were observed manually.
# The contraction token is "'d" (no closing quote); the earlier entry "'d'"
# carried a stray trailing quote and therefore could never match a token.
stop.extend(['-PRON-', '\n', '...', '..', "'d", "n't"])
    

In [26]:
# Define corpus: `corpus` is the path of the text file that clean_corpus() opens,
# and `encoding` is the text encoding passed to open() when reading it.
corpus = 'books.txt'
encoding = 'ISO-8859-1'

In [27]:
def clean_corpus(corpus:str, encoding:str, nr_tokens:int, stop:list) -> list:
    
    '''
    Read the corpus one line (= one document) at a time and keep cleaned lemmas.

    Every line is lower-cased and passed through the module-level spaCy
    pipeline `nlp`. A lemma is kept when it has more than 2 characters and is
    not contained in `stop`. Reading stops as soon as at least `nr_tokens`
    lemmas have been kept (the line that crosses the limit is still processed
    completely) or when the end of the file is reached, whichever comes first.

    Parameters:
        corpus    -- path of the text file to read.
        encoding  -- text encoding used to open the file.
        nr_tokens -- number of kept lemmas after which no further line is read.
        stop      -- strings to discard (stopwords, punctuation, ...).

    Returns:
        List of documents; each document is the list of kept lemma strings of
        one line (possibly empty).

    Prints the number of kept tokens and the number of documents read.
    '''
    
    # Set membership is O(1); the stopword list is probed once per token
    stop_set = set(stop)
    
    token_count = 0
    docs = list()
    
    with open(corpus, encoding = encoding) as f:
        
        while token_count < nr_tokens:
            
            doc = f.readline()
            
            # readline() returns '' only at end of file (a blank line is '\n').
            # Without this check a corpus holding fewer than nr_tokens usable
            # tokens would keep the loop spinning forever.
            if not doc:
                break
            
            # Lemmatization of tokens
            result = nlp(doc.lower())
            
            temp_doc = [token.lemma_ for token in result
                        if len(token.lemma_) > 2 and token.lemma_ not in stop_set]
            
            token_count += len(temp_doc)
            docs.append(temp_doc)

    print('Nr of tokens:' + str(token_count))
    print('Nr of docs:' + str(len(docs)))
    
    return docs

def map_to_int(docs:list) -> tuple:
    
    '''
    Replace every token of the cleaned corpus by an integer id.

    A vocabulary is built from all tokens in `docs`. Ids are handed out in
    order of decreasing frequency: 0 for the most common token, 1 for the
    next one, and so on. Tokens with equal counts keep the order in which
    they were first seen.

    Parameters:
        docs -- list of documents, each a list of token strings.

    Returns:
        (docs_int, vocab_size, vocab) where
        docs_int   -- list of documents, each a list of token ids,
        vocab_size -- number of distinct tokens,
        vocab      -- dict mapping token string -> id.
    '''
    
    freqs = Counter(token for doc in docs for token in doc)
    
    # most_common() sorts by descending count, so enumerate() gives id 0 to
    # the most frequent token
    vocab = {token: idx for idx, (token, _) in enumerate(freqs.most_common())}
        
    # Match token to int
    docs_int = [[vocab[token] for token in doc] for doc in docs]
        
    return docs_int, len(vocab), vocab

In [28]:
# Clean the corpus (stop reading after roughly 100 000 kept tokens), then
# replace every kept token by its integer id
docs = clean_corpus(corpus=corpus, encoding=encoding, nr_tokens=100000, stop=stop)
docs_mapped, vocab_size, vocab = map_to_int(docs)
# Number of documents that were read from the corpus
n_docs = len(docs_mapped)

Nr of tokens:100010
Nr of docs:1246


# LDA

In [29]:
def init_lda(docs:list, n_docs:int, n_topics:int, vocab_size:int, alpha:float=0.1, beta:float=0.1) -> tuple:
    
    '''
    Draw the initial state of the LDA sampler.

    Parameters:
        docs       -- list of documents, each a list of integer word ids.
        n_docs     -- number of documents to initialise (the first n_docs of docs).
        n_topics   -- number of topics K.
        vocab_size -- number of distinct word ids.
        alpha      -- symmetric Dirichlet parameter for the document-topic
                      distributions (must be > 0).
        beta       -- symmetric Dirichlet parameter for the topic-word
                      distributions (must be > 0).

    alpha and beta are explicit arguments with defaults so that the function
    does not depend on global variables of the same name being defined.

    Returns:
        (theta, phi, Z_dj) where
        theta -- array of shape (n_docs, n_topics), one Dirichlet draw per row,
        phi   -- array of shape (n_topics, vocab_size), one Dirichlet draw per row,
        Z_dj  -- list of lists; Z_dj[d][j] is a uniformly random topic in
                 [0, n_topics) for word j of document d.
    '''
    
    # \Theta: one draw from Dir(alpha, ..., alpha) per document
    theta = np.random.dirichlet(np.full(n_topics, alpha), size=n_docs)

    # \phi: one draw from Dir(beta, ..., beta) per topic
    phi = np.random.dirichlet(np.full(vocab_size, beta), size=n_topics)
    
    # Word topic assignment: a random topic for every word position of the
    # documents passed in (not of a global corpus variable)
    Z_dj = [[np.random.randint(n_topics) for _ in docs[d]] for d in range(n_docs)]
        
    return theta, phi, Z_dj

def update(docs, Z_dj, n_docs, n_topics, vocab_size, theta, phi):
    
    '''
    Count the current topic assignments.

    Returns (nd, mk): nd[d][k] is the number of words in document d assigned
    to topic k, and mk[k][w] is the number of times word id w is assigned to
    topic k. theta and phi are accepted but not read.
    '''
    
    doc_topic_counts = np.zeros((n_docs, n_topics))
    topic_word_counts = np.zeros((n_topics, vocab_size))
    
    for doc_idx in range(n_docs):
        words = docs[doc_idx]
        topics = Z_dj[doc_idx]
        for pos, word in enumerate(words):
            topic = topics[pos]
            doc_topic_counts[doc_idx, topic] += 1
            topic_word_counts[topic, word] += 1
            
    return doc_topic_counts, topic_word_counts

In [37]:
def gibbs(docs:list, n_docs:int, n_topics:int, vocab:dict, vocab_size:int, max_iterations:int, alpha:float, beta:float, n_top_words:int=10):

    '''
    Run a Gibbs sampler for LDA and print the most frequent words per topic.

    Each iteration
      1. counts the current topic assignments with update(),
      2. redraws every theta[d] from Dirichlet(alpha + nd[d]),
      3. redraws every phi[k] from Dirichlet(beta + mk[k]),
      4. redraws the topic of every word from the distribution proportional
         to theta[d][k] * phi[k][word].

    Parameters:
        docs           -- list of documents, each a list of integer word ids.
        n_docs         -- number of documents.
        n_topics       -- number of topics K.
        vocab          -- dict mapping token string -> word id.
        vocab_size     -- number of distinct word ids.
        max_iterations -- number of sampling sweeps.
        alpha          -- Dirichlet parameter added to the document-topic counts.
        beta           -- Dirichlet parameter added to the topic-word counts.
        n_top_words    -- how many words to print per topic (default 10).

    Returns nothing; progress, elapsed time and the topics are printed.
    '''

    start_time = time.time()
    theta, phi, Z_dj = init_lda(docs, n_docs=n_docs, n_topics=n_topics, vocab_size=vocab_size)
    print('Matrices initialized.')

    for it in range(max_iterations):
        
        nd, mk = update(docs, Z_dj, n_docs, n_topics, vocab_size, theta, phi)
        
        # Update \Theta
        for d in range(n_docs):
            theta[d] = np.random.dirichlet(alpha + nd[d])
        
        # Update \phi
        for k in range(n_topics):
            phi[k] = np.random.dirichlet(beta + mk[k])
        
        # theta and phi are fixed while the assignments are resampled, so the
        # logs are taken once per sweep instead of once per token. A Dirichlet
        # draw may contain exact zeros; log(0) = -inf is intended here.
        with np.errstate(divide='ignore'):
            log_theta = np.log(theta)
            log_phi = np.log(phi)
        
        for d in range(n_docs):
            for j, word in enumerate(docs[d]):
                
                log_p = log_theta[d] + log_phi[:, word]
                # Subtract the maximum before exponentiating so that very
                # small probabilities do not all underflow to zero
                p_dw = np.exp(log_p - log_p.max())
                p_dw /= p_dw.sum()
                
                Z_dj[d][j] = np.random.multinomial(1, p_dw).argmax()
                
        if (it+1) % 10 == 0:
            print('Iteration ' + str(it+1))
                

    elapsed_time = time.time() - start_time
    print("Elapsed time: ", elapsed_time)
    
    # Recount after the last sweep so the reported topics reflect the final
    # assignments; this also defines mk when max_iterations is 0.
    nd, mk = update(docs, Z_dj, n_docs, n_topics, vocab_size, theta, phi)
    
    # Reverse lookup id -> token, built once instead of scanning vocab per word
    id_to_token = {idx: token for token, idx in vocab.items()}
    
    tops = list()
    for k in range(n_topics):
        order = np.array(mk[k]).argsort()[::-1][:n_top_words]
        tops.append([id_to_token[x] for x in order if x in id_to_token])

    for i, topic in enumerate(tops):
        print('Topic ' + str(i), topic)

# RUN
## Try different settings

Since we merged our code into one document, not all simulation results were kept, and they would need to be re-run. We therefore re-run only the setting that we found to give the best results, which is the one below.

$\alpha = 0.1$ \
$\beta = 0.1$ \
$K = 30$ \
$iterations = 100$



In [38]:
# Run the sampler on the integer-mapped corpus with K = 30 topics, 100 iterations and alpha = beta = 0.1
gibbs(docs=docs_mapped, n_docs=n_docs, n_topics=30, vocab=vocab, vocab_size=vocab_size, max_iterations=100, alpha=0.1, beta=0.1)

Matrices initialized.
Iteration 10
Iteration 20
Iteration 30
Iteration 40
Iteration 50
Iteration 60
Iteration 70
Iteration 80
Iteration 90
Iteration 100
Elapsed time:  455.75435495376587
Topic 0 ['science', 'evidence', 'national', 'future', 'rule', 'scientific', 'theory', 'law', 'bach', 'history']
Topic 1 ['book', 'read', 'child', 'get', 'find', 'love', 'life', 'like', 'make', 'one']
Topic 2 ['norris', 'fish', 'run', 'government', 'american', 'movie', 'boat', 'ship', 'chuck', 'service']
Topic 3 ['book', 'like', 'read', 'one', 'would', 'say', 'get', 'time', 'find', 'make']
Topic 4 ['novel', 'character', 'story', 'reader', 'plot', 'jason', 'series', 'fiction', 'erica', 'scene']
Topic 5 ['soviet', 'british', 'army', 'mcbride', 'red', 'german', 'war', 'soldier', 'also', 'chinese']
Topic 6 ['recipe', 'horse', 'french', 'soul', 'cook', 'ingredient', 'kitchen', 'nut', 'path', 'author']
Topic 7 ['war', 'american', 'military', 'world', 'vietnam', 'president', 'conflict', 'america', 'force', 'po

We find some topics with quite good results. For example, topic 0 is rather science-oriented, with the possible exception of "bach". Topic 2 seems to deal with the actor Chuck Norris and some of his movies. Topic 5 is rather war-oriented, with different countries and armies. Topic 6 is food-related, and topic 7 is war/military-oriented. Topic 8 is again about food, and topic 11 is somewhat AI-oriented. Topic 16 is strongly related to men, women and relationships. Topic 21 seems to deal with news, and topic 22 is about plays (theatre). Topic 23 is (maybe) about philosophers, and topics 24 and 25 are about education. Topic 26 is history-oriented. The rest are quite unclear.