In [None]:
%%javascript
// Replace the output area's scroll check with a function that always
// answers "no", regardless of the number of lines passed in.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Plotting tools
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import collections as matcoll
from mpl_toolkits import mplot3d
import pyLDAvis

# Import numpy and pandas
import pandas as pd
import numpy as np

# Metrics
from scipy.stats import pearsonr
from tools import computeScore

# Data Loader
import loader

# Import GSDMM algorithm model
from gsdmm import MovieGroupProcess

# Utilz
import tools
import math
from tools import vocabCreater, getMoreSent, getMoreTokens, rawTokenize, cleanSent, cleanDoc, stemmedReverse, Id2Word

%matplotlib inline

In [None]:
# Load the raw corpus and derive the token variants used by the rest of the notebook.
raw = loader.LoadRaw()
data = getMoreSent(raw)
tokens = tools.tokenize(data)
# Stem the tokens, then post-process the stemmed documents with getMoreTokens.
stemmed = tools.stemDocument(tokens)
stemmed = getMoreTokens(stemmed)
rawTokens = rawTokenize(data)
# Seed handed to np.random.seed() right before the single model fit further down.
seed = 7

In [None]:
# Vocabularies of the unstemmed and of the stemmed tokens.
# NOTE(review): presumably word -> count mappings (elsewhere the result of
# vocabCreater is sorted by the values of its .items()) - confirm in tools.vocabCreater.
vocab = vocabCreater(tokens)
stemmed_vocab = vocabCreater(stemmed)

In [None]:
def display(vocab, n = 5, start = 0, fileName = 'NONE'):
    """Show a bar chart of the highest-valued entries of a word -> value mapping.

    Parameters
    ----------
    vocab : mapping
        Word -> numeric value (e.g. a count). Entries are ranked by value,
        highest first.
    n : int
        Number of bars to draw.
    start : int
        Rank of the first entry to draw (0 = highest value).
    fileName : str
        When different from the sentinel "NONE", the figure is also written
        to this path.

    Notes
    -----
    Inside a notebook this name shadows IPython's own ``display`` helper.
    """
    # Rank once and slice the window we want to show; an empty window simply
    # produces an empty chart.
    ranked = sorted(vocab.items(), key=lambda x: x[1], reverse=True)[start:start+n]
    labels = [word for word, _ in ranked]
    counts = [int(value) for _, value in ranked]

    indexes = np.arange(len(labels))

    fig = plt.figure(figsize=(20,10))
    plt.bar(indexes, counts)

    # Bars are centred on their x position, so the tick labels go on the very
    # same positions (no extra offset).
    plt.xticks(indexes, labels)

    # Save before showing: the figure is guaranteed to still be fully drawn
    # and open at this point.
    if fileName != "NONE":
        fig.savefig(fileName)
    plt.show()

In [None]:
# Lookup table built from the unstemmed tokens; it is indexed with a word further down.
# NOTE(review): presumably maps a stem back to its original word(s) - confirm in tools.stemmedReverse.
lookup = stemmedReverse(tokens)

In [None]:
# Filter the stemmed documents with cleanDoc.
# NOTE(review): the meaning of the arguments 3 and 50 is not visible here - confirm in tools.cleanDoc.
cleaned = cleanDoc(stemmed, 3, 50)
# Items of the cleaned vocabulary, ordered by their value from highest to lowest.
cleanedVocab = sorted(vocabCreater(cleaned).items(), key=lambda x: x[1], reverse=True)

# Cell output: number of entries in the cleaned corpus.
len(cleaned)

In [None]:
# Dictionary built from the cleaned documents; score() passes it to computeScore.
dictionary = Id2Word(cleaned)

def score(dist, num = 5):
    """Compute the coherence score of a cluster word distribution.

    Parameters
    ----------
    dist : list of dict
        One word -> count mapping per cluster.
    num : int
        Number of top words that represent each cluster. Clusters holding
        fewer than ``num`` distinct words are left out of the score.

    Returns
    -------
    Whatever ``computeScore`` returns for the selected topics, evaluated
    against the notebook-level ``cleaned`` corpus and ``dictionary``.
    """
    topics = []
    for word_counts in dist:
        if len(word_counts) < num:
            continue
        # Rank by count first: taking the first `num` items of the dict would
        # only yield the words that happened to be inserted first, not the
        # most frequent ones.
        ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
        top = np.array([word for word, _ in ranked[:num]])
        if len(top) > 0:
            topics.append(top)
    return computeScore(topics, cleaned, dictionary)

def top_words(dis, count, index, num = 5, start = 0, minCount = 1):
    """Print the most frequent words of every cluster that is large enough.

    dis      -- per-cluster word -> count mappings
    count    -- number of documents per cluster
    index    -- cluster ids to report, in the order they should be printed
    num      -- how many words to print per cluster
    start    -- rank of the first word to print (0 = most frequent)
    minCount -- clusters must satisfy count > minCount - 1 to be printed
    """
    for cluster_id in index:
        # Guard clause: skip clusters that do not pass the size threshold.
        if not (count[cluster_id] > minCount - 1):
            continue
        ranked = list(dis[cluster_id].items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        window = ranked[start:start+num]
        print("Cluster {} with {} documents : {}".format(cluster_id, count[cluster_id], window))
    
    
def formatTFIDFString(clust):
    """Render (word, value) pairs as "word: value" strings with one decimal."""
    return ["{}: {:.1f}".format(term, weight) for term, weight in clust]


def tf_idf_creator(dis, index):
    """Re-weight per-cluster word counts with a TF-IDF style score.

    Every cluster is treated as one "document": a word's count inside a
    cluster is multiplied by ``log(N / df)``, where ``N`` is the number of
    clusters in ``dis`` and ``df`` the number of clusters containing the word.

    Parameters
    ----------
    dis : list of dict
        One word -> count mapping per cluster.
    index : iterable of int
        Ids (positions in ``dis``) of the clusters to re-score. It may be any
        subset of the clusters; when it lists every cluster, ``N`` equals
        ``len(index)``.

    Returns
    -------
    list of dict
        One dict per cluster in ``dis``, addressable by cluster id. Clusters
        that are not listed in ``index`` keep an empty dict.
    """
    n_clusters = len(dis)

    # Document frequency: in how many clusters does each word occur?
    cluster_freq = {}
    for clus in dis:
        for word in clus:
            cluster_freq[word] = cluster_freq.get(word, 0) + 1

    # Sized by the number of clusters (not by len(index)) so that every
    # cluster id is a valid position, even when index is only a subset.
    rescored = [{} for _ in range(n_clusters)]
    for i in index:
        rescored[i] = {
            word: count * math.log(n_clusters / cluster_freq[word])
            for word, count in dis[i].items()
        }

    return rescored
    
    
# Simple tf-idf sorter
def top_tf_idf(dis, doc_count, index, num = 5, start = 0, minCount = 1):
    """Print the highest TF-IDF words of every cluster that is large enough.

    dis       -- per-cluster word -> count mappings
    doc_count -- number of documents per cluster
    index     -- cluster ids to report, in the order they should be printed
    num       -- how many words to print per cluster
    start     -- rank of the first word to print (0 = highest score)
    minCount  -- clusters must satisfy doc_count > minCount - 1 to be printed
    """
    weighted = tf_idf_creator(dis, index)
    for cluster_id in index:
        # Guard clause: skip clusters that do not pass the size threshold.
        if not (doc_count[cluster_id] > minCount - 1):
            continue
        ranked = list(weighted[cluster_id].items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        best = formatTFIDFString(ranked[start:start+num])
        print("Cluster {} with {} documents : {}".format(cluster_id, doc_count[cluster_id], best))


##  Experiment

The following code cell contains a large amount of code dedicated to running a GSDMM experiment over a range of hyperparameters and plotting the results.

In [None]:
def _plotAverageScore(keys, scores, xLabel):
    """Average the scores per distinct key, print their correlation and plot them.

    keys   -- one grouping value per run (e.g. its beta value)
    scores -- the coherence score of each run, aligned with ``keys``
    xLabel -- label of the x axis

    Prints the Pearson correlation between the distinct keys and their average
    score, then draws the averages as a line ordered by key.
    """
    grouped = {}
    for key, value in zip(keys, scores):
        grouped.setdefault(key, []).append(value)

    averaged = sorted(
        ((key, sum(values)/len(values)) for key, values in grouped.items()),
        key=lambda pair: pair[0],
    )
    averaged = np.array(averaged)

    print("Pearsons Correlation Score: {:.4f}".format(pearsonr(averaged[:,0], averaged[:,1])[0]))
    plt.plot(averaged[:,0], averaged[:,1])
    plt.xlabel(xLabel)
    plt.ylabel('Average coherence score')
    plt.show()


def plotClusterScore(runs):
    """Plot the average coherence score against the number of populated clusters.

    The cluster count of a run is the last entry of the third series in its
    'runValues', i.e. the number of populated clusters at the end of training.
    """
    endClusters = [run['runValues'][2][-1] for run in runs]
    scores = [run['score'] for run in runs]
    _plotAverageScore(endClusters, scores, 'Number of populated topics')


def plotBetaScore(runs):
    """Plot the average coherence score for every beta value (rounded to 2 decimals)."""
    beta = [round(run['beta'], 2) for run in runs]
    scores = [run['score'] for run in runs]
    _plotAverageScore(beta, scores, 'Beta value')


def plotAlphaScore(runs):
    """Plot the average coherence score for every alpha value (rounded to 2 decimals)."""
    alpha = [round(run['alpha'], 2) for run in runs]
    scores = [run['score'] for run in runs]
    _plotAverageScore(alpha, scores, 'Alpha value')
      
def plotAlphaBetaScore3D(runs):
    """Scatter the coherence score over (alpha, beta) in 3D.

    Not recommended for reading off results, as it is hard to tell what is
    happening; plotAlphaBetaScore2D gives a clearer picture.

    runs -- list of dicts with the keys 'alpha', 'beta' and 'score'
    """
    beta = [round(run['beta'], 2) for run in runs]
    alpha = [round(run['alpha'], 2) for run in runs]
    scores = [run['score'] for run in runs]

    # Only the scatter is drawn, so no alpha/beta grid is built here; the
    # former unused grid could fail on values outside 0.1 .. 0.9.
    plt.figure()
    ax = plt.axes(projection='3d')
    ax.scatter(alpha, beta, scores, c=scores, cmap='viridis', linewidth=0.5)
    ax.set_xlabel('alpha')
    ax.set_ylabel('beta')
    ax.set_zlabel('score')
    plt.show()
    
    
def plotAlphaBetaScore2D(runs):
    """Show the coherence score over (alpha, beta) as a 2D heat map.

    Recommended over the 3D variant, as it is easier to see what is happening.

    runs -- list of dicts with the keys 'alpha', 'beta' and 'score'.
            The grid is fixed at 9 x 9, i.e. alpha and beta are expected to
            take the values 0.1, 0.2, ..., 0.9.
    """
    gridSize = 9
    zscore = np.zeros((gridSize, gridSize))

    for run in runs:
        # round() instead of plain int(): value*10 may carry floating point
        # error and truncation could then select the neighbouring cell.
        b = int(round(run['beta']*10)) - 1
        a = int(round(run['alpha']*10)) - 1
        zscore[b][a] = run['score']

    fig, ax = plt.subplots()
    image = ax.imshow(zscore, cmap='viridis', origin='lower')
    fig.colorbar(image, ax=ax)

    # Pin tick positions on BOTH axes before labelling them, so that label k
    # is guaranteed to sit on grid cell k.
    tickLabels = [round(0.1 * (k + 1), 2) for k in range(gridSize)]
    ax.set_xticks(range(gridSize))
    ax.set_xticklabels(tickLabels)
    ax.set_yticks(range(gridSize))
    ax.set_yticklabels(tickLabels)

    ax.set_xlabel('alpha')
    ax.set_ylabel('beta')
    plt.show()


def plotTrainingSession(run):
    """Plot the progress of one training run.

    run -- 3-item sequence; the second item is the series of transferred
           elements per stage, the third the series of populated clusters per
           stage (the first item is ignored).
    """
    _, transferred, clusters = run

    panels = (
        (transferred, 'Transferred Elements'),
        (clusters, 'Populated Clusters'),
    )
    # One separate figure per series, both indexed by training stage.
    for series, yLabel in panels:
        plt.figure(figsize = (10,5))
        plt.plot(series)
        plt.ylabel(yLabel)
        plt.xlabel('Stage')
        plt.show()

def lineScatter(x, y, size = (20,5), base = 5, title = '', xLabel = '', yLabel = '', hLine = -1000, hlabel=''):
    """Scatter plot in which every point is connected to the x axis by a vertical line.

    x, y           -- coordinates of the points (indexable, same length)
    size           -- figure size handed to plt.subplots
    base           -- extra head room added to the upper y limit
    title          -- plot title
    xLabel, yLabel -- axis labels
    hLine          -- y position of a horizontal reference line
    hlabel         -- legend label of the reference line
    """
    # One vertical segment per point, from y = 0 up to the point itself.
    stems = [[(x[k], 0), (x[k], y[k])] for k in range(len(x))]
    stemCollection = matcoll.LineCollection(stems)

    fig, ax = plt.subplots(figsize = size)
    ax.add_collection(stemCollection)

    plt.scatter(x, y)
    plt.axhline(y=hLine, color='b', linestyle='-', label=hlabel)
    plt.legend()

    plt.title(title)
    plt.ylabel(yLabel)
    plt.xlabel(xLabel)
    plt.xticks(x)

    upper = max(y) + min(y) + base
    plt.ylim(0, upper)

    plt.show()

def experiment(data, vocab, seed = 0, alpha = .1, beta = .1, step = .1, n_iters = 10, cluster_count = 10, topicWordScore = 10):
    """Fit one GSDMM model per (alpha, beta) combination and collect the results.

    data           -- documents handed to MovieGroupProcess.fit
    vocab          -- sequence of (word, value) pairs; only the number of
                      distinct words is used
    seed           -- NumPy seed that is re-applied before every single fit
    alpha, beta    -- first value of the respective sweep; each sweep runs up
                      to (excluding) 1.0 in increments of ``step``
    n_iters        -- iterations per fit
    cluster_count  -- upper bound K of clusters per fit
    topicWordScore -- number of words per topic handed to score()

    Returns a list with one dict per combination holding the keys 'alpha',
    'beta', 'score' and 'runValues' (the return value of fit).
    """
    alphaGrid = np.arange(alpha, 1.0, step)
    betaGrid = np.arange(beta, 1.0, step)
    n_terms = len(set(np.array(vocab)[:,0]))

    results = []

    print("Running experiments")
    for a, b in ((a, b) for a in alphaGrid for b in betaGrid):
        # Same seed for every combination, so runs differ only in their hyperparameters.
        np.random.seed(seed)
        print("alpha: {}, beta: {}".format(a, b))

        model = MovieGroupProcess(K=cluster_count, alpha=a, beta=b, n_iters=n_iters)
        history = model.fit(data, n_terms, verbose=False)

        coherence = score(model.cluster_word_distribution, topicWordScore)
        print("score: {}".format(coherence))
        results.append({ 'alpha': a, 'beta': b, 'score': coherence, 'runValues': history})

    return results

In [None]:
# Set to True to run the hyperparameter sweep. WARNING: can take a long time depending on setup.
runExperiment = False
if runExperiment:
    runs = experiment(cleaned, cleanedVocab, n_iters=50, cluster_count=50)
    # Each plotting helper takes the collected runs as its only argument.
    plotClusterScore(runs)
    plotBetaScore(runs)
    plotAlphaScore(runs)
    plotAlphaBetaScore2D(runs)


In [None]:
# Fit a single GSDMM model with fixed hyperparameters.
cluster_count = 50
# Reseed NumPy's global random generator right before fitting.
np.random.seed(seed)
mgp = MovieGroupProcess(K=cluster_count, alpha=.2, beta=.2, n_iters=50)
# NOTE(review): this rebinds the notebook-level name `vocab` (set earlier from
# vocabCreater(tokens)) to the set of first elements of the cleanedVocab pairs.
vocab = set(np.array(cleanedVocab)[:,0])
n_terms = len(vocab)
# y keeps the return value of fit; it is passed to plotTrainingSession below.
y = mgp.fit(cleaned, n_terms)

In [None]:
# Summary of the fitted model: cluster sizes, top words and coherence score.
doc_count = np.array(mgp.cluster_doc_count)
# A cluster counts as a valid topic when it holds at least `mincount` documents.
mincount = 15
# Number of words per topic used for the coherence score.
topicWordScore = 10

print("Number of valid topics: {}".format(sum([count > mincount-1 for count in doc_count])))

print('Number of documents per topic :', doc_count)
print('*'*20)
# Cluster ids ordered by the number of documents allocated to them, largest first
# (argsort is ascending, so take the last `cluster_count` ids and reverse them).
top_index = doc_count.argsort()[-cluster_count:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)
# Show the 5 most frequent words of every cluster with at least `mincount` documents
top_words(mgp.cluster_word_distribution, doc_count, top_index, 5, minCount = mincount)
print('*'*20)
print('TF-IDF sorted')
print('*'*20)
# Same listing, ranked by the scores of tf_idf_creator instead of raw counts
top_tf_idf(mgp.cluster_word_distribution, doc_count, top_index, 5, minCount = mincount)
print("Coherence Score: ", score(mgp.cluster_word_distribution, topicWordScore))

In [None]:
# Documents per cluster, with a horizontal line at the `mincount` threshold.
lineScatter(range(len(doc_count)), doc_count, xLabel = 'Topics', yLabel = 'Number of Documents in Topic', size = (15,5), hLine = mincount, hlabel="Minimum amount to Pass as a topic")
# Per-stage curves of the fit above (transferred elements and populated clusters).
plotTrainingSession(y)

In [None]:
# Cell output: the entry stored in the lookup table under the key 'rund'.
lookup['rund'] # Lookup words

In [None]:
def doc_tabel(data, model):
    """Create a table with the cleaned text and the predicted cluster of every sentence.

    Parameters
    ----------
    data : sequence of str
        The sentences to classify. They are tokenized, stemmed and cleaned
        here before being handed to the model.
    model
        Fitted model offering ``choose_best_label(tokens) -> (label, prob)``.

    Returns
    -------
    pandas.DataFrame
        Columns "Sentence", "Cleaned", "Cluster" and "Probability", one row
        per stemmed document. Empty input yields an empty table with the same
        columns.
    """
    tokens = tools.tokenize(data)
    stemmed = getMoreTokens(tools.stemDocument(tokens))
    # Plain-dict copy of the stemmed vocabulary, as expected by cleanSent.
    vocab_count = dict(vocabCreater(stemmed).items())

    rows = []
    for i, sent in enumerate(stemmed):
        nSent = cleanSent(sent, vocab_count)
        label, prob = model.choose_best_label(nSent)
        rows.append([data[i], nSent, label, prob])

    # Naming the columns in the constructor also works when there are no rows.
    return pd.DataFrame(rows, columns=["Sentence", "Cleaned", "Cluster", "Probability"])

def lookupCluster(mgp, clus = 0, num = 100):
    """Return the sentences assigned to one cluster, most probable first.

    mgp  -- fitted model handed to doc_tabel
    clus -- id of the cluster to look up
    num  -- maximum number of rows to return

    Reads the notebook-level ``data`` and changes pandas' global display
    options (unlimited column width, at most 100 rows).
    """
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_rows', 100)

    table = doc_tabel(data, mgp)
    ranked = table.sort_values(by=['Probability'], ascending=False)
    in_cluster = ranked['Cluster'] == clus
    return ranked[in_cluster].head(num)

In [None]:
# Cell output: up to 100 sentences assigned to cluster 0, highest probability first.
lookupCluster(mgp, clus = 0)

In [None]:
def ClusterBarCharts(mgp, top_index, clust = 0):
    """Show two bar charts for one valid cluster: top TF-IDF words and most frequent words.

    Parameters
    ----------
    mgp
        Fitted model offering ``cluster_word_distribution``.
    top_index : iterable of int
        Cluster ids handed to tf_idf_creator.
    clust : int
        Rank of the cluster to show among the valid clusters, ordered by
        document count (0 = largest). Raises IndexError when there are fewer
        valid clusters.

    Reads the notebook-level ``doc_count`` and ``mincount``. A cluster is
    valid when it has TF-IDF scores and holds at least ``mincount`` documents,
    the same threshold the textual report uses.
    """
    word_dist = mgp.cluster_word_distribution
    rescored = [{word: round(value) for word, value in scores.items()}
                for scores in tf_idf_creator(word_dist, top_index)]

    # Select the valid cluster ids ONCE, so that both charts are guaranteed to
    # describe the same cluster.
    valid = [i for i, (scores, count) in enumerate(zip(rescored, doc_count))
             if len(scores) > 0 and count >= mincount]
    valid.sort(key=lambda i: doc_count[i], reverse=True)

    chosen = valid[clust]
    display(rescored[chosen], 20)
    display(word_dist[chosen], 20)
    
# Bar charts for the valid cluster at rank 8 (0-based, ordered by document count).
ClusterBarCharts(mgp, top_index, clust = 8)

In [None]:
def convertForViz(model, documents, mincount = 15):
    """Convert a fitted GSDMM model into the inputs of pyLDAvis and display it.

    Parameters
    ----------
    model
        Fitted model offering ``cluster_word_distribution``,
        ``cluster_doc_count`` and ``score(doc)``.
    documents : list of list of str
        The tokenized documents the model was fitted on.
    mincount : int
        Clusters holding fewer documents than this are left out, the same
        threshold the textual report of this notebook uses.

    Returns
    -------
    The object returned by ``pyLDAvis.display``.
    """
    vocab = vocabCreater(documents)
    dictionary = tools.CreateVocab(documents)
    n_vocab = len(dictionary.items())

    # Which clusters are large enough to be shown as a topic?
    keep = np.array(model.cluster_doc_count) >= mincount

    # topic_term_dists
    # For each kept topic: the relative frequency of every vocabulary word
    topic_term_dists = []
    for topic, kept in zip(model.cluster_word_distribution, keep):
        if not kept:
            continue

        baseVec = np.zeros(n_vocab)
        total = sum(topic.values())
        for word, count in topic.items():
            baseVec[dictionary[word]] = count/total
        topic_term_dists.append(baseVec)

    # doc_topic_dists
    # For each document: the probability of belonging to each kept topic.
    # All topics are scored, the too-small ones are dropped and the remaining
    # probabilities are re-weighted so that they sum to one again.
    doc_topic_dists = []
    for doc in documents:
        probs = np.array(model.score(doc))[keep]
        doc_topic_dists.append(probs/sum(probs))

    doc_lengths = [len(doc) for doc in documents]

    prepared = {'topic_term_dists': topic_term_dists,
                'doc_topic_dists': doc_topic_dists,
                'doc_lengths': doc_lengths,
                'vocab': [l for l, v in dictionary.items()],
                'term_frequency': [vocab[l] for l, v in dictionary.items()]}
    return pyLDAvis.display(pyLDAvis.prepare(**prepared))


# Cell output: interactive pyLDAvis view of the fitted model on the cleaned corpus.
convertForViz(mgp, cleaned, mincount = 15)