In [None]:
%%time

import nltk
import numpy
import time
import random
import unidecode
import string
import heapq

import pandas as pd

from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize, WordNetLemmatizer
from numba import njit, jit, cuda
import numpy as np

nltk.download('stopwords')
nltk.download('wordnet')

%load_ext line_profiler

for a in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    # Number of topics fitted in this pass of the sweep.
    AMOUNT_OF_TOPICS = a

    # Fraction of the distinct words (most frequent first) that is removed
    # as "too common"; 0 disables the frequency-based removal.
    TOP_K_WORD_REMOVAL = 0.001

    # https://stats.stackexchange.com/questions/59684/what-are-typical-values-to-use-for-alpha-and-beta-in-latent-dirichlet-allocation/130876
    # High alpha: many topics per document [low : 1 : high]
    ALPHA            = 50 / AMOUNT_OF_TOPICS
    # High beta: topic has many different words [0 - 1]
    BETA             = 0.1

    # NOTE(review): the CSV is re-read and re-cleaned for every topic count.
    # Nothing in the cleaning below depends on AMOUNT_OF_TOPICS, so it could
    # be hoisted out of the loop if the whole cell is restructured.
    df = pd.read_csv("news_dataset.csv")
    #df = df[:10000]  # uncomment to experiment on a subset
    df = df[df['content'].notna()]

    # Keep the untouched article text next to the cleaned version.
    df["documents"] = df["content"]

    # remove accents, special characters and lowercase
    # https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
    # regex=True is passed explicitly: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would leave the text
    # unfiltered.
    df.content = (
        df.content
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
        .str.lower()
        .str.replace('[^a-z ]', '', regex=True)
    )
    #print(df.content.head())


    #print(len(df['content'][0]))
    lemmatizer = WordNetLemmatizer()

    def remove_singles(content):
        """Lemmatize `content` and drop every word that occurs only once.

        The text is split on single spaces and each token is lemmatized.
        Lemmas seen more than once in the document are kept and written out
        grouped together (in order of first appearance), once per occurrence,
        each followed by a space. Empty tokens are discarded.
        """
        occurrences = {}
        for token in content.split(' '):
            lemma = lemmatizer.lemmatize(token)
            occurrences[lemma] = occurrences.get(lemma, 0) + 1

        return "".join(
            (lemma + " ") * times
            for lemma, times in occurrences.items()
            if times > 1 and lemma != ''
        )

    df['content'] = df['content'].apply(remove_singles)
    #print(len(df['content'][0]))



    # remove stop words and single letters
    # https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
    stopwords_list = stopwords.words('english')
    stopwords_list.extend(list(string.ascii_lowercase))

    # The cleaning step earlier in the loop keeps only a-z and spaces, so the
    # stop words are reduced to the same alphabet. Otherwise any entry that
    # contains another character (e.g. the apostrophe in a contraction) could
    # never match the cleaned text. Duplicates and empty results are dropped.
    allowed_chars = set(string.ascii_lowercase)
    stopwords_list = [
        ''.join(ch for ch in stop_word if ch in allowed_chars)
        for stop_word in stopwords_list
    ]
    stopwords_list = [w for w in dict.fromkeys(stopwords_list) if w]

    # Corpus-wide word frequencies; stays empty when the frequency-based
    # removal is disabled.
    fdict = dict()

    if TOP_K_WORD_REMOVAL > 0:
        for tokens in df.content.str.split(' '):
            for word in tokens:
                if word:
                    fdict[word] = fdict.get(word, 0) + 1

    # The most frequent TOP_K_WORD_REMOVAL fraction of the distinct words.
    length = int(TOP_K_WORD_REMOVAL * len(fdict))
    top_k_words = heapq.nlargest(length, fdict, key=fdict.get)

    top_k_words.extend(stopwords_list)

    # One alternation that matches any unwanted word as a whole word.
    # regex=True is required on pandas >= 2.0, where str.replace defaults to
    # literal matching.
    pat = r'\b(?:{})\b'.format('|'.join(top_k_words))
    df['content'] = df['content'].str.replace(pat, '', regex=True)

    # collapse the runs of whitespace left behind by the removals
    df['content'] = df['content'].str.replace(r'\s+', ' ', regex=True)
    #print(df.content.head())


    # Every distinct word that survived the cleaning.
    vocab_set = set()

    def compute_dict(input):
        """Add each non-empty, space-separated word of `input` to `vocab_set`."""
        vocab_set.update(word for word in input.split(' ') if word != '')

    df['content'].apply(compute_dict)

    # Map word -> column index. Ids are assigned in sorted order so they do
    # not depend on set iteration order, which can differ between runs.
    vocab = {word: index for index, word in enumerate(sorted(vocab_set))}



    # Add an extra column that holds every document as vocabulary indices
    def remove_dupes(input):
        """Return the words of `input` as a NumPy array of `vocab` indices.

        The text is split on single spaces and empty pieces are skipped.
        Despite the name, repeated words are kept: one index per occurrence.
        """
        return np.array([vocab[token] for token in input.split(' ') if token != ""])
    df['content_list'] = df['content'].apply(remove_dupes)



    numberOfTopics = AMOUNT_OF_TOPICS
    topics = list(range(numberOfTopics))

    n_documents = len(df.content)

    # Count tables, all filled by the random initial assignment below:
    #   wordToTopic[w, k]     - occurrences of word w assigned to topic k
    #   documentToTopic[d, k] - words of document d assigned to topic k
    #   topicAssignment[d]    - per-word topic array of document d
    #   words_in_topic[k]     - total number of words assigned to topic k
    #   words_in_doc[d]       - number of words in document d
    wordToTopic = numpy.zeros((len(vocab), numberOfTopics), dtype=numpy.int64)
    documentToTopic = numpy.zeros((n_documents, numberOfTopics), dtype=numpy.int64)
    topicAssignment = numpy.zeros(n_documents, dtype=numpy.ndarray)
    words_in_topic = numpy.zeros(numberOfTopics, dtype=numpy.int64)
    words_in_doc = [0] * n_documents

    for doc_idx, doc_word_ids in enumerate(df.content_list):
        doc_assignment = np.zeros(len(doc_word_ids), dtype=numpy.int64)
        for position, word_id in enumerate(doc_word_ids):
            # Uniformly random starting topic for this word occurrence.
            start_topic = topics[int(np.random.rand()*numberOfTopics)]
            doc_assignment[position] = start_topic
            wordToTopic[word_id, start_topic] += 1
            documentToTopic[doc_idx, start_topic] += 1
            words_in_topic[start_topic] += 1
        words_in_doc[doc_idx] = len(doc_word_ids)
        topicAssignment[doc_idx] = doc_assignment

    # Lower-case aliases; these are the names the sampling functions read.
    alpha = ALPHA
    beta = BETA


    @njit
    def calc_fast(rand_val, word_index, word, TA_d, doc_index, wordToTopic, words_in_topic, total_unique_word_b, document_words, WID):
        """Resample the topic of one word occurrence and update the counts in place.

        Parameters
        ----------
        rand_val : array of draws; ``rand_val[word_index]`` selects the topic.
        word_index : position of the word inside its document.
        word : row index of the word in ``wordToTopic``.
        TA_d : current topic of every word in the document (updated in place).
        doc_index : unused; kept so existing callers keep working.
        wordToTopic, words_in_topic, document_words : count tables, updated
            in place.
        total_unique_word_b : added to ``words_in_topic`` in the denominator.
        WID : denominator of the document term.

        ``alpha``, ``beta`` and ``numberOfTopics`` are read from the enclosing
        scope. NOTE(review): Numba treats such globals as compile-time
        constants, so they are fixed when this function is first compiled.
        """
        # Take the word out of the counts before computing its distribution.
        old_topic = TA_d[word_index]
        document_words[old_topic] -= 1
        wordToTopic[word, old_topic] -= 1
        words_in_topic[old_topic] -= 1

        # Unnormalised weight per topic: word-in-topic term * topic-in-document term.
        weights = np.multiply(np.divide((wordToTopic[word] + beta),
                                        (words_in_topic + total_unique_word_b)),
                              np.divide((document_words + alpha),
                                        (WID)))

        cumulative = np.cumsum(np.divide(weights, np.sum(weights)))

        # Pick the first topic whose cumulative probability exceeds the draw.
        # The fallback is the LAST topic: rounding can leave cumulative[-1]
        # slightly below 1, and a draw above it belongs to the final bucket,
        # not to topic 0.
        topic = numberOfTopics - 1
        for i in range(0, numberOfTopics):
            if rand_val[word_index] < cumulative[i]:
                topic = i
                break

        # Put the word back under its new topic.
        TA_d[word_index] = topic
        document_words[topic] += 1
        wordToTopic[word, topic] += 1
        words_in_topic[topic] += 1

    def gibs_it_sanic(i):
        """Run `i` sampling sweeps over every word of every document.

        Each sweep prints its 1-based number, then visits the documents in
        order. For each document it draws one random number per word and
        calls `calc_fast` on every word position, which updates the shared
        count tables and the document's topic assignment in place.
        """
        smoothed_vocab_size = len(vocab) * beta
        smoothed_topic_count = numberOfTopics * alpha

        for sweep_number in range(1, i + 1):
            print("Iteration:", sweep_number)

            for doc_index, word_list in enumerate(df.content_list):
                doc_topic_counts = documentToTopic[doc_index]
                doc_denominator = words_in_doc[doc_index] + smoothed_topic_count
                doc_assignment = topicAssignment[doc_index]

                # One draw per word, generated up front for the whole document.
                draws = np.random.random(len(word_list))
                for word_index, word in enumerate(word_list):
                    calc_fast(draws, word_index, word, doc_assignment, doc_index, wordToTopic, words_in_topic, smoothed_vocab_size, doc_topic_counts, doc_denominator)




    gibs_it_sanic(25)

    # Topic x word count table, with the vocabulary words as column labels.
    t2w = pd.DataFrame(wordToTopic).T
    t2w.columns = vocab

    # For every topic: up to 9 words with a non-zero count, highest count first.
    top_words_per_topic = []
    for k in topics:
        word_counts = t2w.iloc[k]
        ranked_words = word_counts[word_counts > 0.001].sort_values(ascending=False)
        top_words_per_topic.append(ranked_words.index.values.tolist()[:9])

    from tabulate import tabulate
    print(tabulate(top_words_per_topic))

    # Topic x document count table.
    t2d = pd.DataFrame(documentToTopic.T)

    # For every topic: up to 5 document positions with the most assigned words.
    top_docs_per_topic = []
    for k in topics:
        doc_counts = t2d.iloc[k]
        ranked_docs = doc_counts[doc_counts > 0.001].sort_values(ascending=False)
        top_docs_per_topic.append(ranked_docs.index.values.tolist()[:5])

    print(tabulate(top_docs_per_topic))



    # Score the topics by how often neighbouring top words share a document:
    # for each adjacent pair (w_i, w_j) in a topic's top-word list,
    # log( P(w_i, w_j) / (P(w_i) * P(w_j)) ) is subtracted from the score,
    # with probabilities estimated as document frequencies. The total is
    # averaged over the topics.
    docs = len(df.content_list)

    # Build, in a single pass over the corpus, the set of documents that
    # contain each top word. This replaces a full corpus scan per word pair.
    top_word_ids = {vocab[w] for tops in top_words_per_topic for w in tops}
    docs_containing = {word_id: set() for word_id in top_word_ids}
    for doc_index, item in enumerate(df.content_list):
        for word_id in top_word_ids.intersection(item.tolist()):
            docs_containing[word_id].add(doc_index)

    score = 0
    for tops in top_words_per_topic:
        # zip(tops, tops[1:]) yields every adjacent pair, including the last
        # one, which the previous index range stopped short of.
        for w_i, w_j in zip(tops, tops[1:]):
            docs_i = docs_containing[vocab[w_i]]
            docs_j = docs_containing[vocab[w_j]]

            c1 = len(docs_i)            # documents containing w_i
            c2 = len(docs_j)            # documents containing w_j
            c3 = len(docs_i & docs_j)   # documents containing both

            # Pairs that never co-occur are skipped: log(0) is -inf and
            # would turn the whole score into inf.
            if c1 != 0 and c2 != 0 and c3 != 0:
                # The product P(w_i) * P(w_j) is parenthesised so that it is
                # the divisor; without the parentheses the expression
                # multiplied by P(w_j) instead.
                score -= np.log((c3 / docs) / ((c1 / docs) * (c2 / docs)))
    score /= len(top_words_per_topic)
    print("score:", score, "k:", AMOUNT_OF_TOPICS, "removed:", TOP_K_WORD_REMOVAL, "alpha:", ALPHA, "beta:", BETA)
    print()
    print()
    print()

[nltk_data] Downloading package stopwords to /home/tmtoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tmtoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
------  --------  --------  -------  --------  ---------  -------  --------  ---------
world   show      film      around   water     story      book     see       food
party   election  obama     donald   vote      political  voter    democrat  candidate
court   million   case      federal  business  law        team     game      billion
school  thing     health    child    life      work       student  im        family
united  attack    official  city     officer   security   war      north     force
------  --------  --------  -------  --------  --------- 