# Text 3: Latent Dirichlet Allocation
**Internet Analytics - Lab 4**

---

**Group:** *J.*

**Names:**

* *Maxime Lucas Lanvin*
* *Victor Salvia*
* *Erik Axel Wilhelm Sjöberg*

---

#### Instructions

*This is a template for part 3 of the lab. Clearly write your answers, comments and interpretations in Markdown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of codes in the notebook, you can also embed long python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [3]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
import pickle
import numpy as np
from scipy.sparse.linalg import svds
from numpy.linalg import norm
import pandas as pd
import json
from operator import itemgetter

# Import LibEx1
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.util import ngrams 
from utils import load_json, load_pkl
import copy
from collections import Counter
from scipy.sparse import csr_matrix

# Import Data
# `courses` is the course corpus and `stopwords` the collection of words to ignore;
# both are read through the project helpers imported from `utils`.
# Further down, `courses` is indexed by position and each record is read through
# its 'name' and 'description' fields.
courses = load_json('data/courses.txt')
stopwords = load_pkl('data/stopwords.pkl')

[nltk_data] Downloading package wordnet to /home/salvia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Some data pre-processing done in Part 1:

In [4]:
def get_term_document_matrix(corpus):
    """Build a sparse TF-IDF term-document matrix from a tokenised corpus.

    Each record of `corpus` is read through its 'description' field, which
    must already be a list of tokens.

    Returns a tuple `(matrix, index_to_word, word_to_index)` where `matrix`
    is a CSR matrix with one row per distinct term and one column per
    document, and the two dictionaries translate between row indices and
    terms.
    """
    _, postings = get_dictionary(corpus)

    # Document frequency: number of distinct documents each term occurs in.
    df_index = {term: len(set(doc_ids)) for term, doc_ids in postings.items()}

    # Row index of every term, in order of first appearance, and its inverse.
    word_to_index = {term: position for position, term in enumerate(df_index)}
    index_to_word = {position: term for term, position in word_to_index.items()}

    n_terms = len(word_to_index)
    n_docs = len(corpus)

    rows, columns, values = [], [], []
    for doc_id in range(n_docs):
        tokens = corpus[doc_id]['description']
        doc_length = len(tokens)

        for token, count in Counter(tokens).items():
            tf = count / doc_length
            # Smoothed inverse document frequency (the +1 is in the denominator).
            idf = np.log(n_docs / (df_index[token] + 1))

            rows.append(word_to_index[token])
            columns.append(doc_id)
            values.append(tf * idf)

    matrix = csr_matrix((values, (rows, columns)), shape=(n_terms, n_docs))
    return matrix, index_to_word, word_to_index

def get_column_scores(mat, i,index_to_word, n=-1):
    """Return the (term, score) pairs of column `i`, highest score first.

    Only the rows reported as non-zero for that column are included.
    `index_to_word` translates row indices into terms. With the default
    `n == -1` every pair is returned, otherwise only the `n` best ones.
    """
    column = mat.getcol(i)
    row_ids = csr_matrix.nonzero(column)[0]

    scores = {index_to_word[row]: column[row, 0] for row in row_ids}
    ranked = dict(sorted(scores.items(), key=lambda pair: pair[1], reverse=True))

    # most_common(None) returns every entry, exactly like most_common().
    limit = None if n == -1 else n
    return Counter(ranked).most_common(limit)
    
def get_row_scores(mat, word,word_to_index, n=-1,corpus=courses):
    """Return the (document name, score) pairs for `word`, highest score first.

    The row of `word` is located through `word_to_index`; every column
    reported as non-zero in that row is labelled with the 'name' field of
    the matching record in `corpus`. With the default `n == -1` every pair
    is returned, otherwise only the `n` best ones.
    """
    row = mat.getrow(word_to_index[word])
    doc_ids = csr_matrix.nonzero(row)[1]

    scores = {corpus[doc]['name']: row[0, doc] for doc in doc_ids}
    ranked = dict(sorted(scores.items(), key=lambda pair: pair[1], reverse=True))

    # most_common(None) returns every entry, exactly like most_common().
    limit = None if n == -1 else n
    return Counter(ranked).most_common(limit)
    
def corpus_merge(c1,c2):
    """Return a deep copy of `c1` with the descriptions of `c2` appended.

    The two corpora are paired by position. Token order inside a document
    does not matter for the bag-of-words models built later, so the two
    'description' lists are simply concatenated. `c1` is left untouched.
    """
    merged = copy.deepcopy(c1)
    for position, record in enumerate(merged):
        record['description'] = record['description'] + c2[position]['description']
    return merged

def NoNumbers(s):
    """Return True when no character of `s` is a digit, False otherwise."""
    for character in s:
        if character.isdigit():
            return False
    return True

def stemmer(s):
    """Tokenise `s` and return the stems of its tokens.

    Tokens found in `ignored_words` are dropped before stemming; stems that
    contain a digit are dropped afterwards.
    """
    stems = []
    for token in tknzr.tokenize(s):
        if token in ignored_words:
            continue
        stem = ps.stem(token)
        if NoNumbers(stem):
            stems.append(stem)
    return stems

def lemmazation(s):
    """Tokenise `s` and return the lemmas of its tokens.

    Tokens found in `ignored_words` are dropped before lemmatising; lemmas
    that contain a digit are dropped afterwards.
    """
    lemmas = []
    for token in tknzr.tokenize(s):
        if token in ignored_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        if NoNumbers(lemma):
            lemmas.append(lemma)
    return lemmas

def lem_n_stem(s):
    """Tokenise `s`, lemmatise each token and then stem the lemma.

    Tokens found in `ignored_words` are dropped first; results that contain
    a digit are dropped afterwards.
    """
    reduced = []
    for token in tknzr.tokenize(s):
        if token in ignored_words:
            continue
        term = ps.stem(lemmatizer.lemmatize(token))
        if NoNumbers(term):
            reduced.append(term)
    return reduced

def tokenize_1gram(l,lem,stemlem):
    """Return a deep copy of corpus `l` with every description tokenised.

    Helper for `tokenize_ngram` (the 1-gram case). The 'description' of each
    record is replaced by the list of tokens produced by one of:

    * `stemmer`      when `lem` is false,
    * `lemmazation`  when `lem` is true and `stemlem` is false,
    * `lem_n_stem`   when both `lem` and `stemlem` are true.

    The processor is chosen once and applied exactly once per document
    (previously every description was lemmatised and the result was
    immediately overwritten). `l` itself is not modified.
    """
    if not lem:
        process = stemmer
    elif stemlem:
        process = lem_n_stem
    else:
        process = lemmazation

    courses_loc = copy.deepcopy(l)
    for course in courses_loc:
        course['description'] = process(course['description'])
    return courses_loc

def tokenize_ngram(l,n=1,lem=True,stemlem=False):
    """Return a deep copy of corpus `l` whose descriptions are lists of n-grams.

    Each description is cleaned (tokenised, ignored words removed, then
    stemmed and/or lemmatised) and n-grams are built over the cleaned
    tokens. For `n > 1` the description is first split on '.', so that an
    n-gram never spans two sentences.

    Parameters:
        l:       the corpus; each record is read through its 'description'
                 field, which holds the raw text.
        n:       size of the n-grams to build. Default is 1, in which case
                 the work is delegated to `tokenize_1gram`.
        lem:     when true use lemmatisation, otherwise stemming.
                 Default is lemmatisation.
        stemlem: when true (and `lem` is true) apply both lemmatisation and
                 stemming.

    `l` itself is not modified.
    """
    if n ==1:
        return tokenize_1gram(l,lem,stemlem)

    # Pick the token processor once instead of re-deciding for every sentence.
    if not lem:
        process = stemmer
    elif stemlem:
        process = lem_n_stem
    else:
        process = lemmazation

    courses_loc = copy.deepcopy(l)
    for course in courses_loc:
        grams = []
        for sentence in course['description'].split('.'):
            # extend() grows the list in place; the former `grams = grams + ...`
            # copied the whole list for every sentence.
            grams.extend(ngrams(process(sentence), n))
        course['description'] = grams
    return courses_loc

def get_dictionary(d, key='description'):
    """Collect every token of corpus `d` and index the documents they occur in.

    Parameters:
        d:   the corpus, a sequence of records.
        key: name of the record field holding the token list
             (defaults to 'description').

    Returns a tuple `(global_dictionary, dictionary_mapping)`:
        global_dictionary:  all tokens of all documents, in corpus order
                            (duplicates kept).
        dictionary_mapping: token -> list of document positions, with one
                            entry per occurrence, so a position is repeated
                            when the token occurs several times in the same
                            document.
    """
    global_dictionary = []
    dictionary_mapping = {}
    for position, record in enumerate(d):
        tokens = record[key]
        # In-place extend keeps this linear in the corpus size; concatenating
        # with `+` re-copied the whole list for every document.
        global_dictionary.extend(tokens)
        for token in tokens:
            dictionary_mapping.setdefault(token, []).append(position)
    return global_dictionary, dictionary_mapping



# Punctuation marks and empty/blank tokens. Together with the stopwords they
# form the set of ignored words, which are removed from the corpus.
specialchar = ['.', ',', '(', ')', '&', ':', '/','-','"',';','', ' ', '..', '...',"'",'%']
ignored_words = set(stopwords).union(specialchar)


# The lemmatizer, the stemmer and the tokenizer used by the helpers above.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# Tokenize, lemmatize and stem the corpus, once as 1-grams and once as 2-grams.
lem_and_stem_1gram = tokenize_ngram(courses, n=1, stemlem=True)
lem_and_stem_2gram = tokenize_ngram(courses, n=2, stemlem=True)

# Merge both views of every course and build the TF-IDF term-document matrix.
lem_stem_corpus = corpus_merge(lem_and_stem_1gram, lem_and_stem_2gram)
X, index_to_word, word_to_index = get_term_document_matrix(lem_stem_corpus)

In [12]:
# Sanity check: number of entries (1-grams plus 2-grams) in the merged description of document 10.
len(lem_stem_corpus[10]['description'])

214

## Exercise 4.8: Topics extraction

Using your pre-processed courses dataset, extract topics using LDA.

1. Print k = 10 topics extracted using LDA and give them labels.
2. How does it compare with LSI?

You can use the default values for all parameters.

----
----

### Train LDA with k = 10 and print the topics

The documents are represented by their TF-IDF vectors. This is not strictly necessary, but it can be helpful. For the sake of efficiency, we keep only the 10,000 terms with the highest TF-IDF score.

In [3]:
# Number of terms kept for LDA (those with the highest TF-IDF score).
n_top_terms = 10000

X_f = X.toarray()

# Maximum TF-IDF of each term over all documents -> it tells us the importance of that term.
# The sizes come from the matrix itself instead of being hard-coded.
mX = X_f.max(axis=1)

# Row indices of the selected terms, most important first. explain_topics uses
# indx_map to translate the rows of the topics matrix back to vocabulary indices.
indx_map = mX.argsort()[-n_top_terms:][::-1]
X_f = X_f[indx_map,:]

# One [document id, dense term vector] pair per column; ids start at 1.
# NOTE(review): the Spark documentation describes these vectors as term counts,
# whereas TF-IDF weights are passed here - confirm this is intended.
data = [[doc + 1, Vectors.dense(X_f[:,doc])] for doc in range(X_f.shape[1])]
rdd = sc.parallelize(data)

model = LDA.train(rdd, k = 10, maxIterations = 20, optimizer = 'online', seed = 1)

topics = model.topicsMatrix()

Given the topics matrix, the following function prints, for each topic, its ten highest-weighted terms together with their normalised weights (scaled by 1000).

In [28]:
def explain_topics(topics):
    print("----------------- TOPIC 1 ---------------------")
    indx = topics[:,0].argsort()[-10:][::-1]
    tot = np.sum(topics[:,0])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],0]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],0]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],0]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],0]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],0]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],0]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],0]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],0]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],0]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],0]/tot)
    print("----------------- TOPIC 2 ---------------------")
    indx = topics[:,1].argsort()[-10:][::-1] 
    tot = np.sum(topics[:,1])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],1]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],1]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],1]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],1]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],1]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],1]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],1]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],1]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],1]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],1]/tot)
    print("----------------- TOPIC 3 ---------------------")
    indx = topics[:,2].argsort()[-10:][::-1] 
    tot = np.sum(topics[:,2])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],2]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],2]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],2]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],2]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],2]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],2]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],2]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],2]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],2]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],2]/tot)
    print("----------------- TOPIC 4 ---------------------")
    indx = topics[:,3].argsort()[-10:][::-1] 
    tot = np.sum(topics[:,3])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],3]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],3]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],3]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],3]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],3]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],3]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],3]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],3]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],3]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],3]/tot)
    print("----------------- TOPIC 5 ---------------------")
    indx = topics[:,4].argsort()[-10:][::-1] 
    tot = np.sum(topics[:,4])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],4]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],4]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],4]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],4]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],4]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],4]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],4]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],4]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],4]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],4]/tot)
    print("----------------- TOPIC 6 ---------------------")
    indx = topics[:,5].argsort()[-10:][::-1] 
    tot = np.sum(topics[:,5])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],5]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],5]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],5]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],5]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],5]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],5]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],5]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],5]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],5]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],5]/tot)
    print("----------------- TOPIC 7 ---------------------")
    indx = topics[:,6].argsort()[-10:][::-1] 
    tot = np.sum(topics[:,6])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],6]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],6]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],6]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],6]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],6]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],6]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],6]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],6]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],6]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],6]/tot)
    print("----------------- TOPIC 8 ---------------------")
    indx = topics[:,7].argsort()[-10:][::-1] 
    tot = np.sum(topics[:,7])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],7]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],7]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],7]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],7]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],7]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],7]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],7]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],7]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],7]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],7]/tot)
    print("----------------- TOPIC 9 ---------------------")
    indx = topics[:,8].argsort()[-10:][::-1] 
    tot = np.sum(topics[:,8])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],8]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],8]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],8]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],8]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],8]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],8]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],8]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],8]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],8]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],8]/tot)
    print("----------------- TOPIC 10 ---------------------")
    indx = topics[:,9].argsort()[-10:][::-1] 
    tot = np.sum(topics[:,9])
    print(index_to_word[indx_map[indx[0]]], 1000*topics[indx[0],9]/tot)
    print(index_to_word[indx_map[indx[1]]], 1000*topics[indx[1],9]/tot)
    print(index_to_word[indx_map[indx[2]]], 1000*topics[indx[2],9]/tot)
    print(index_to_word[indx_map[indx[3]]], 1000*topics[indx[3],9]/tot)
    print(index_to_word[indx_map[indx[4]]], 1000*topics[indx[4],9]/tot)
    print(index_to_word[indx_map[indx[5]]], 1000*topics[indx[5],9]/tot)
    print(index_to_word[indx_map[indx[6]]], 1000*topics[indx[6],9]/tot)
    print(index_to_word[indx_map[indx[7]]], 1000*topics[indx[7],9]/tot)
    print(index_to_word[indx_map[indx[8]]], 1000*topics[indx[8],9]/tot)
    print(index_to_word[indx_map[indx[9]]], 1000*topics[indx[9],9]/tot)
    print("--------------------------------------")

In [5]:
# Fetch the term-by-topic weight matrix of the trained model and print the top terms of each topic.
topics = model.topicsMatrix()
explain_topics(topics)

----------------- TOPIC 1 ---------------------
('train', 'rotat') 0.35799698054722356
rotat 0.2920716406155892
train 0.25596290801729404
instabl 0.15461347433814732
('kinet', 'theori') 0.1432176009619948
grade 0.1425379750910628
statist 0.14251262568976553
group 0.14224781064092867
('knowledg', 'acquir') 0.14214217768274984
mhd 0.1416211635746016
----------------- TOPIC 2 ---------------------
('design', 'xtal') 0.14209834601375732
seminar 0.14064924771654994
slope 0.14039854893608283
('programm', 'student') 0.1390648347425649
chemic 0.13896130630952525
model 0.13890523291931878
('seismic', 'design') 0.13857355001585206
('identifi', 'variant') 0.1372152157585887
('project', 'hum') 0.13660893887533485
main 0.1356469386963386
----------------- TOPIC 3 ---------------------
energi 0.16223567868571018
pile 0.15164160390621703
geostructur 0.15025245121169167
('european', 'intern') 0.14393046174030732
issu 0.14279224268749263
('rule', 'principl') 0.14260015661584985
swiss 0.1425510977742394

### Labels for each topic?

1. Training rotation program.
2. -.
3. Energy and policy.
4. EDMT management program.
5. Business, risk, finances.
6. -.
7. Projects in IC department: supply chain, sensors, probability, ...
8. Lie groups, etc.
9. Drugs, compounds, energy, chemistry.
10. -.

### How does it compare with LSI?

The key is that all the topics are much more interpretable than in the LSI model.

-------
------

## Exercise 4.9: Dirichlet hyperparameters

Analyse the effects of α (distribution of topics in documents) and β (distribution of words in topics). You should start by reading the documentation of pyspark.mllib.clustering.LDA.

1. Fix k = 10 and β = 1.01, and vary α. How does it impact the topics?
2. Fix k = 10 and α = 6, and vary β. How does it impact the topics?

Hint: You can set the seed to produce more comparable outputs. 

-------
-------

* docConcentration (alpha) = -1 and topicConcentration (beta) = -1 are the default settings of the LDA training function.

### a) Fix k = 10 and Beta = 1.01.

Varying $\alpha$ has the following effect: higher values of $\alpha$ mean that each document is associated with more topics, and lower values mean that each document contains fewer topics. A priori we would like $\alpha$ to be small, because each course (document) is fairly specific to one area; it should belong to 2, 3 or 4 main topics at most. If we allowed documents to belong to many topics, the topics themselves would probably be very broad rather than specific.

* In conclusion, the effect we should see is that if $\alpha$ is high, topics will be very broad. And if it is low, topics should be much more specific.

1. $\beta$ = 1.01, $\alpha$ = 0.01.
2. $\beta$ = 1.01, $\alpha$ = 10.

We should be able to see that in 1 the topics are more specific than in 2.

In [6]:
# Number of terms kept for LDA (those with the highest TF-IDF score).
n_top_terms = 10000

X_f = X.toarray()

# Maximum TF-IDF of each term over all documents -> it tells us the importance of that term.
# The sizes come from the matrix itself instead of being hard-coded.
mX = X_f.max(axis=1)

# Row indices of the selected terms, most important first (read by explain_topics).
indx_map = mX.argsort()[-n_top_terms:][::-1]
X_f = X_f[indx_map,:]

# One [document id, dense term vector] pair per column; ids start at 1.
data = [[doc + 1, Vectors.dense(X_f[:,doc])] for doc in range(X_f.shape[1])]
rdd = sc.parallelize(data)

# Low alpha (docConcentration = 0.01) with beta (topicConcentration) fixed at 1.01.
model = LDA.train(rdd, k = 10, maxIterations = 20, optimizer = 'online', topicConcentration = 1.01, docConcentration = 0.01, seed = 1)

topics = model.topicsMatrix()
explain_topics(topics)

----------------- TOPIC 1 ---------------------
('train', 'rotat') 0.2547868432436578
rotat 0.21525036703247685
train 0.19518654784311673
('contact', 'edmt') 0.16709343422409062
('administr', 'enrol') 0.16670002716896526
edmt 0.16520108807045475
('edmt', 'administr') 0.1607289256930936
administr 0.1549359516667174
enrol 0.15081110249294236
contact 0.13768764107667156
----------------- TOPIC 2 ---------------------
model 0.13196082602363426
financi 0.1267805663382578
optic 0.12591094369009206
('design', 'xtal') 0.12507996327059198
robot 0.12268654834662665
radiat 0.12238021390233196
('identifi', 'variant') 0.12215901199745008
implant 0.12186112939836594
('project', 'hum') 0.12179633940156404
main 0.12179484314907998
----------------- TOPIC 3 ---------------------
energi 0.13693368191883776
test 0.12742616572960677
pile 0.12532208251053942
geostructur 0.12515305814754338
rate 0.12505052467343541
('dimens', 'stress') 0.12418184620007201
issu 0.12413568698174024
('fundament', 'concept') 0.

---------
------

In [7]:
# Number of terms kept for LDA (those with the highest TF-IDF score).
n_top_terms = 10000

X_f = X.toarray()

# Maximum TF-IDF of each term over all documents -> it tells us the importance of that term.
# The sizes come from the matrix itself instead of being hard-coded.
mX = X_f.max(axis=1)

# Row indices of the selected terms, most important first (read by explain_topics).
indx_map = mX.argsort()[-n_top_terms:][::-1]
X_f = X_f[indx_map,:]

# One [document id, dense term vector] pair per column; ids start at 1.
data = [[doc + 1, Vectors.dense(X_f[:,doc])] for doc in range(X_f.shape[1])]
rdd = sc.parallelize(data)

# High alpha (docConcentration = 10) with beta (topicConcentration) fixed at 1.01.
model = LDA.train(rdd, k = 10, maxIterations = 20, optimizer = 'online', docConcentration = 10.00, topicConcentration = 1.01, seed = 1)

topics = model.topicsMatrix()
explain_topics(topics)

----------------- TOPIC 1 ---------------------
('train', 'rotat') 0.1294358244768289
statist 0.1251948419011645
('radtk', 'laboratori') 0.1225773738454969
fractur 0.12234101225392062
group 0.12231448001679913
design 0.12231230404945463
('featur', 'limit') 0.1222742960447491
grade 0.12213863784827582
inerti 0.12145349872634861
('target-bas', 'screen') 0.12101555953643758
----------------- TOPIC 2 ---------------------
model 0.1270624082727119
('design', 'xtal') 0.12588623196521995
main 0.1231411471428971
('identifi', 'variant') 0.12250949807452988
('project', 'hum') 0.122366382380494
seminar 0.12215264651624859
energi 0.12183286777278854
('problem', 'student') 0.12170364704670543
extern 0.12126807190833111
formul 0.12124258802235634
----------------- TOPIC 3 ---------------------
('fundament', 'concept') 0.1240936643406208
programm 0.12327393614522353
('dimens', 'stress') 0.12268407597528767
('train', 'rotat') 0.12187326492695047
topic 0.12040287396644356
('hidden', 'markov') 0.1203695

It is clear that the topics are more specific in the first case than in the second. This can be seen from the distribution of terms per topic: it is almost uniform for $\alpha = 10$, whereas for $\alpha = 0.01$ there is more variability and each topic tends to have a few predominant terms.

In our case we want to have topics that can be interpreted, hence we will choose $\alpha = 0.01$.

### b) Fix k = 10 and Alpha = 6.

1. $\beta$ = 0.01, $\alpha$ = 6.
2. $\beta$ = 0.99, $\alpha$ = 6 (the value actually used in the second cell below).

As in the previous case, we expect higher values of $\beta$ to produce topics made of more words, whereas lower values produce more specific topics (fewer words). We should therefore see fewer words per topic in 1 than in 2.

In [8]:
# Number of terms kept for LDA (those with the highest TF-IDF score).
n_top_terms = 10000

X_f = X.toarray()

# Maximum TF-IDF of each term over all documents -> it tells us the importance of that term.
# The sizes come from the matrix itself instead of being hard-coded.
mX = X_f.max(axis=1)

# Row indices of the selected terms, most important first (read by explain_topics).
indx_map = mX.argsort()[-n_top_terms:][::-1]
X_f = X_f[indx_map,:]

# One [document id, dense term vector] pair per column; ids start at 1.
data = [[doc + 1, Vectors.dense(X_f[:,doc])] for doc in range(X_f.shape[1])]
rdd = sc.parallelize(data)

# Low beta (topicConcentration = 0.01) with alpha (docConcentration) fixed at 6.
model = LDA.train(rdd, k = 10, maxIterations = 20, optimizer = 'online', docConcentration = 6.0, topicConcentration = 0.01, seed = 1)

topics = model.topicsMatrix()
explain_topics(topics)

----------------- TOPIC 1 ---------------------
('train', 'rotat') 0.15490682362656905
statist 0.1482830827955895
group 0.14282911790157543
design 0.14217753722158835
fractur 0.14148524474541388
grade 0.14104393124453196
('radtk', 'laboratori') 0.14070906335680494
('featur', 'limit') 0.1398243415030838
inerti 0.13954291378429842
analysi 0.13843573295263809
----------------- TOPIC 2 ---------------------
model 0.1514697734769687
('design', 'xtal') 0.14658206651706837
main 0.14412083297238557
seminar 0.14172837515907555
energi 0.14137199151781765
('identifi', 'variant') 0.14045516501789218
('project', 'hum') 0.14022002167104655
chemic 0.13909528034681787
extern 0.13903151370955244
('problem', 'student') 0.13889951663300718
----------------- TOPIC 3 ---------------------
('fundament', 'concept') 0.1439661554754502
programm 0.14218903073977193
('dimens', 'stress') 0.1405724659592668
('train', 'rotat') 0.14003365837630677
topic 0.13844099743819716
comput 0.13758540930344307
overview 0.13663

In [11]:
# Number of terms kept for LDA (those with the highest TF-IDF score).
n_top_terms = 10000

X_f = X.toarray()

# Maximum TF-IDF of each term over all documents -> it tells us the importance of that term.
# The sizes come from the matrix itself instead of being hard-coded.
mX = X_f.max(axis=1)

# Row indices of the selected terms, most important first (read by explain_topics).
indx_map = mX.argsort()[-n_top_terms:][::-1]
X_f = X_f[indx_map,:]

# One [document id, dense term vector] pair per column; ids start at 1.
data = [[doc + 1, Vectors.dense(X_f[:,doc])] for doc in range(X_f.shape[1])]
rdd = sc.parallelize(data)

# Higher beta (topicConcentration = 0.99) with alpha (docConcentration) fixed at 6.
model = LDA.train(rdd, k = 10, maxIterations = 20, optimizer = 'online', docConcentration = 6.0, topicConcentration = 0.99, seed = 1)

topics = model.topicsMatrix()
explain_topics(topics)

----------------- TOPIC 1 ---------------------
('train', 'rotat') 0.1298313919676481
statist 0.12544076427291817
('radtk', 'laboratori') 0.12278080406256044
fractur 0.12255043584661819
group 0.1225331334271251
design 0.12252685133570017
('featur', 'limit') 0.12247230619477349
grade 0.12234538061021526
inerti 0.12165121706475628
('target-bas', 'screen') 0.12120572329390299
----------------- TOPIC 2 ---------------------
model 0.12732396979881322
('design', 'xtal') 0.12611855475441858
main 0.12336527616605018
('identifi', 'variant') 0.1227108160628663
('project', 'hum') 0.12256650494322932
seminar 0.12236333607634961
energi 0.12204482739894122
('problem', 'student') 0.12189781015840838
extern 0.12146374749729877
formul 0.12143693735114733
----------------- TOPIC 3 ---------------------
('fundament', 'concept') 0.12431399606948602
programm 0.12348507347288629
('dimens', 'stress') 0.12288601348111584
('train', 'rotat') 0.12207498678169265
topic 0.1205970178599374
('hidden', 'markov') 0.12

Clearly in the former case in which $\beta = 0.01$ the distribution per topic is more specific as we were expecting. For the sake of topic interpretability we will then choose $\beta = 0.01$.

## Exercise 4.10: EPFL's taught subjects

List the subjects of EPFL’s classes.

1. Find the combination of k, α and β that gives the most interpretable topics.
2. Explain why you chose these values.
3. Report the values of the hyperparameters that you used and your labels for the topics.

### Find the best combination for k, $\alpha$, and $\beta$ for topic interpretability.

We have already decided to use low values for both $\alpha$ and $\beta$. Now it is time to explore the hyperparameter $k$. Intuitively, from what we have seen before, we would reduce it to 5 in order to get a much clearer view of what each topic entails.

In [4]:
# Number of terms kept for LDA (those with the highest TF-IDF score).
n_top_terms = 10000

X_f = X.toarray()

# Maximum TF-IDF of each term over all documents -> it tells us the importance of that term.
# The sizes come from the matrix itself instead of being hard-coded.
mX = X_f.max(axis=1)

# Row indices of the selected terms, most important first (read by explain_topics).
indx_map = mX.argsort()[-n_top_terms:][::-1]
X_f = X_f[indx_map,:]

# One [document id, dense term vector] pair per column; ids start at 1.
data = [[doc + 1, Vectors.dense(X_f[:,doc])] for doc in range(X_f.shape[1])]
rdd = sc.parallelize(data)

# Final choice: k = 10 with low alpha and low beta (both 0.01).
model = LDA.train(rdd, k = 10, maxIterations = 20, optimizer = 'online', docConcentration = 0.01, topicConcentration = 0.01, seed = 1)

topics = model.topicsMatrix()
explain_topics(topics)

----------------- TOPIC 1 ---------------------
('train', 'rotat') 0.29473832810867207
rotat 0.24000170846046717
train 0.21729319482789855
hous 0.18139957266269213
('simpl', 'complex') 0.1481690953202155
('radtk', 'laboratori') 0.1438116903030069
studio 0.14231303400418446
('arriv', 'simpl') 0.14152303162110397
instabl 0.14099049604012331
('featur', 'limit') 0.13995137191816512
----------------- TOPIC 2 ---------------------
('problem', 'student') 0.1516067149238453
organ 0.1470065477501906
('interferometri', 'tv') 0.1467938246850633
('design', 'xtal') 0.14486191316831973
main 0.1425549556749034
('programm', 'student') 0.1412210266353774
slope 0.1398016569005256
formul 0.13977501840523437
('identifi', 'variant') 0.1396404964729827
('project', 'hum') 0.13899219235215984
----------------- TOPIC 3 ---------------------
lifecycl 0.14566338309862087
magnet 0.14218542749047636
('dimens', 'stress') 0.1401290984664936
programm 0.13989555776925003
('fundament', 'concept') 0.13902390012543622
ge

After trying out several values for $k$, 10 is the one that made more sense based on the topics yielded. The values of the other 2 hyperparameters were decided in the previous exercise based on how they affected the results.

### Labels for the 10 topics:

1. Training rotation, doctorate. (strong)
2. -. (weak)
3. Lifecycle, magnets, geology. (weak)
4. Programming, networks, etc.
5. EDMT management of technology. (strong)
6. Biomass, optics, cells, molecules, etc. (normal)
7. IC projects. (strong)
8. EE, transistors, etc. (weak)
9. Optics, control, and signal. (normal)
10. Chemistry, drugs, and compounds. (strong)

"Strong", "normal" and "weak" refer to the weight of the words related to the topic, i.e. how much evidence we have for saying that this is the topic.

## Exercise 4.11: Wikipedia structure

Extract the structure in terms of topics from the wikipedia-for-school dataset. Use your
intuition about how many topics might be covered by the articles and how they are distributed.

1. Report the values for k, α and β that you chose a priori and why you picked them.
2. Are you convinced by the results? Give labels to the topics if possible.

In [13]:
# Read the Wikipedia-for-schools dataset as an RDD and parse every line as a JSON record.
wiki = sc.textFile("/ix/wikipedia-for-schools.txt").map(json.loads)

In [14]:
# Number of records (articles) in the RDD.
print("There are", wiki.count() , "documents in total.")

There are 5554 documents in total.


In [17]:
# Display one pre-processed course record to recall the structure of the course corpus.
lem_stem_corpus[10]

{'courseId': 'MGT-430',
 'name': 'Quantitative systems modeling techniques',
 'description': ['dedic',
  'model',
  'tool',
  'optim',
  'method',
  'decis',
  'analysi',
  'techniqu',
  'specif',
  'focu',
  'logist',
  'content',
  'introduct',
  'oper',
  'research',
  'graph',
  'color',
  'linear',
  'program',
  'flow',
  'theori',
  'graph',
  'cover',
  'model',
  'applic',
  'network',
  'design',
  'distribut',
  'transport',
  'distribut',
  'heurist',
  'method',
  'vehicl',
  'rout',
  'problem',
  'facil',
  'locat',
  'problem',
  'job',
  'shop',
  'facil',
  'layout',
  'balanc',
  'assembl',
  'line',
  'open',
  'shop',
  'keyword',
  'model',
  'techniqu',
  'oper',
  'research',
  'learn',
  'outcom',
  'end',
  'student',
  'repres',
  'import',
  'logist',
  'problem',
  'oper',
  'research',
  'models.solv',
  'problem',
  'exact',
  'method',
  'heuristics.classifi',
  'optim',
  'problem',
  'transvers',
  'skill',
  'summar',
  'articl',
  'technic',
  'repor

In [15]:
# Bring all the records to the driver and wrap them in a NumPy array so they can be indexed by position.
wikin = np.asarray(wiki.collect())

`wikin` is an array in which every element (5554 in total) is a dictionary with:
* pageID
* title
* tokens

In [22]:
def get_dictionary(d, key='tokens'):
    """Collect every token of corpus `d` and index the documents they occur in.

    Parameters:
        d:   the corpus, a sequence of records.
        key: name of the record field holding the token list
             (defaults to 'tokens', the field used by the Wikipedia records).

    Returns a tuple `(global_dictionary, dictionary_mapping)`:
        global_dictionary:  all tokens of all documents, in corpus order
                            (duplicates kept).
        dictionary_mapping: token -> list of document positions, with one
                            entry per occurrence, so a position is repeated
                            when the token occurs several times in the same
                            document.
    """
    global_dictionary = []
    dictionary_mapping = {}
    for position, record in enumerate(d):
        tokens = record[key]
        # In-place extend keeps this linear in the corpus size; concatenating
        # with `+` re-copied the whole list for every document.
        global_dictionary.extend(tokens)
        for token in tokens:
            dictionary_mapping.setdefault(token, []).append(position)
    return global_dictionary, dictionary_mapping

def get_term_document_matrix(corpus):
    """Build a sparse TF-IDF term-document matrix from a tokenised corpus.

    Same construction as for the courses, except that each record of
    `corpus` is read through its 'tokens' field.

    Returns a tuple `(matrix, index_to_word, word_to_index)` where `matrix`
    is a CSR matrix with one row per distinct term and one column per
    document, and the two dictionaries translate between row indices and
    terms.
    """
    _, occurrences = get_dictionary(corpus)

    # Document frequency: number of distinct documents each term occurs in.
    df_index = {}
    for term, doc_ids in occurrences.items():
        df_index[term] = len(set(doc_ids))

    # Row index of every term, in order of first appearance, and its inverse.
    vocabulary = list(df_index)
    word_to_index = dict(zip(vocabulary, range(len(vocabulary))))
    index_to_word = {position: term for term, position in word_to_index.items()}

    n_terms = len(vocabulary)
    n_docs = len(corpus)

    rows = []
    columns = []
    values = []
    for doc_id in range(n_docs):
        doc_tokens = corpus[doc_id]['tokens']
        n_tokens = len(doc_tokens)
        counts = Counter(doc_tokens)

        for term in counts:
            tf = counts[term] / n_tokens
            # Smoothed inverse document frequency (the +1 is in the denominator).
            idf = np.log(n_docs / (df_index[term] + 1))

            rows.append(word_to_index[term])
            columns.append(doc_id)
            values.append(tf * idf)

    matrix = csr_matrix((values, (rows, columns)), shape=(n_terms, n_docs))
    return matrix, index_to_word, word_to_index

In [23]:
# TF-IDF matrix of the Wikipedia corpus with its index->word (iww) and word->index (wiw) mappings.
Xw, iww, wiw = get_term_document_matrix(wikin)

In [26]:
# explain_topics looks words up through the global index_to_word, so point it at the Wikipedia vocabulary.
# This replaces the course vocabulary: course topics can no longer be printed correctly after this cell.
index_to_word = iww

In [None]:
# Number of terms kept for LDA (those with the highest TF-IDF score).
n_top_terms = 10000

# Maximum TF-IDF of each term over all documents -> it tells us the importance of that term.
# It is computed on the sparse matrix, and its length follows the Wikipedia
# vocabulary instead of the vocabulary size of the course corpus.
mX = Xw.max(axis=1).toarray().ravel()

# Row indices of the selected terms, most important first (read by explain_topics).
indx_map = mX.argsort()[-n_top_terms:][::-1]

# Densify only the selected rows; the full dense matrix is never materialised.
Xw_f = Xw[indx_map,:].toarray()

# One [document id, dense term vector] pair per article; ids start at 1.
# Every article is used, not only the first 854 (the size of the course corpus).
data = [[doc + 1, Vectors.dense(Xw_f[:,doc])] for doc in range(Xw_f.shape[1])]
rdd = sc.parallelize(data)

model = LDA.train(rdd, k = 20, maxIterations = 20, optimizer = 'online', docConcentration = 0.01, topicConcentration = 0.01, seed = 1)

topics = model.topicsMatrix()

In [None]:
# Print the top terms of the topics extracted from the Wikipedia corpus.
explain_topics(topics)

Taking $k = 20$ the topics make more sense than with other $k$'s I have chosen before.

2. Animals and plants
6. Amazon

However, I am still not convinced by the results: most of the topics remain difficult to interpret.