In [1]:
%%time

import nltk
import numpy
import time
import random
import unidecode
import string

import pandas as pd

from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from numba import njit, jit, cuda
import numpy as np

nltk.download('stopwords')

%load_ext line_profiler


CPU times: user 1.86 s, sys: 846 ms, total: 2.71 s
Wall time: 1.54 s


[nltk_data] Downloading package stopwords to /home/tmtoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
AMOUNT_OF_TOPICS = 10

# top % of most common words are removed
TOP_K_WORD_REMOVAL = 0

# https://stats.stackexchange.com/questions/59684/what-are-typical-values-to-use-for-alpha-and-beta-in-latent-dirichlet-allocation/130876
# High alpha: many topics per document [low : 1 : high]
ALPHA            = 50 / AMOUNT_OF_TOPICS
# High beta: topic has many different words [0 - 1]
BETA             = 0.01



In [3]:
%%time

df = pd.read_csv("news_dataset.csv")
#df = df[:1000]
df = df[df['content'].notna()]

CPU times: user 7.03 s, sys: 666 ms, total: 7.69 s
Wall time: 7.71 s


In [4]:
%%time

# remove accents, special characters and lowercase
# https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
df.content = df.content.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.lower().str.replace('[^a-z ]', '')
print(df.content.head())

0    washington eeaaaaeeeace    congressional repub...
1    after the bullet shells get counted the blood ...
2    when walt disneys bambi opened in  critics pra...
3    death may be the great equalizer but it isnt n...
4    seoul south korea     north koreas leader kim ...
Name: content, dtype: object
CPU times: user 18.3 s, sys: 968 ms, total: 19.3 s
Wall time: 19.4 s


In [5]:
%%time

# remove stop words and single letters
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
stopwords_list = stopwords.words('english')
stopwords_list.extend(list(string.ascii_lowercase))
pat = r'\b(?:{})\b'.format('|'.join(stopwords_list))
df['content'] = df['content'].str.replace(pat, '')
print(df.content.head())

0    washington eeaaaaeeeace    congressional repub...
1      bullet shells get counted  blood dries   vot...
2     walt disneys bambi opened   critics praised  ...
3    death may   great equalizer   isnt necessarily...
4    seoul south korea     north koreas leader kim ...
Name: content, dtype: object
CPU times: user 2min 30s, sys: 19.4 ms, total: 2min 30s
Wall time: 2min 30s


In [6]:
%%time

# remove common words
#fdist = FreqDist(sum(df['content'].map(word_tokenize), []))
#top_k_words, _ = zip(*fdist.most_common(int(TOP_K_WORD_REMOVAL * len(fdist))))
#print(top_k_words)

#pat = r'\b(?:{})\b'.format('|'.join(top_k_words))

#df['content'] = df['content'].str.replace(pat, '')
#print(df.content.head())

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.78 µs


In [7]:
%time

# remove multiple spaces in a
df['content'] = df['content'].str.replace(r'\s+', ' ')
print(df.content.head())

CPU times: user 0 ns, sys: 5 µs, total: 5 µs
Wall time: 10.5 µs
0    washington eeaaaaeeeace congressional republic...
1     bullet shells get counted blood dries votive ...
2     walt disneys bambi opened critics praised spa...
3    death may great equalizer isnt necessarily eve...
4    seoul south korea north koreas leader kim said...
Name: content, dtype: object


In [8]:
%%time

vocab_set = set()

def compute_dict(input):
    result = dict()

    for word in input.split(' '):
        if word != '':
            vocab_set.add(word)
            if word in result:
                result[word] += 1
            else:
                result[word] = 1
    return result

df['content_dict'] = df['content'].apply(lambda content: compute_dict(content))
vocab = list(vocab_set)

tmp ={}
count = 0
for word in vocab:
    tmp[word] = count
    count += 1
vocab = tmp

CPU times: user 26.1 s, sys: 920 ms, total: 27 s
Wall time: 27 s


In [9]:
%%time

#Toevoegen van extra column met index ineens
def remove_dupes(input):
    output = list(set(input.split(' ')))
    if '' in output:
        output.remove('')
    for i, value in enumerate(output):
        output[i] = vocab[value]
    return np.array(output)
df['content_list'] = df['content'].apply(lambda content: remove_dupes(content))

CPU times: user 28.8 s, sys: 164 ms, total: 29 s
Wall time: 29 s


In [10]:
%%time

numberOfTopics = AMOUNT_OF_TOPICS
topics = [i for i in range(0, numberOfTopics)]

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 10.7 µs


In [None]:
%%time

wordToTopic = numpy.zeros((len(vocab), numberOfTopics), dtype=numpy.int64)
documentToTopic = numpy.zeros((len(df.content), numberOfTopics))
topicAssignment = []
words_in_topic = numpy.zeros(numberOfTopics, dtype=numpy.int64)
words_in_doc = [0 for i in range(len(df.content))]
for i, content in enumerate(df.content_list):
    tmp = []
    for word in content:
        topic = numpy.random.choice(topics)
        tmp.append(topic)
        wordToTopic[word, topic] += 1
        documentToTopic[i, topic] += 1
        words_in_topic[topic] += 1
        words_in_doc[i] += 1
    topicAssignment.append(np.array(tmp))
    
topicAssignment = np.array(topicAssignment)

In [None]:
%%time

alpha = ALPHA
beta = BETA

In [None]:
%%time

@njit
def calc_fast(TA_d, doc_index, word_index, wordToTopic, word, words_in_topic, total_unique_word_b, document_words, WID):
    document_words[TA_d[word_index]] -= 1
    wordToTopic[word, TA_d[word_index]] -= 1
    words_in_topic[TA_d[word_index]] -= 1
                
    new_topic = numpy.argmax(np.multiply(np.divide((wordToTopic[word] + beta), 
                                                   (words_in_topic + total_unique_word_b)), 
                                         np.divide((document_words + alpha), 
                                                   (WID))))

    TA_d[word_index] = new_topic
    document_words[new_topic] += 1
    wordToTopic[word, new_topic] += 1
    words_in_topic[new_topic] += 1
    
def gibs_it_sanic(i):
    total_unique_word_b = len(vocab) * beta
    topic_count_a = numberOfTopics * alpha
    for iteration in range(i):
        print("Iteration:", iteration + 1)
        
        for doc_index, word_list in enumerate(df.content_list):
            document_words = documentToTopic[doc_index]
            WID = words_in_doc[doc_index] + topic_count_a
            TA_d = topicAssignment[doc_index]
            
            for word_index, word in enumerate(word_list):
                calc_fast(TA_d, doc_index, word_index, wordToTopic, word, words_in_topic, total_unique_word_b, document_words, WID)

                


In [None]:
%%time
gibs_it_sanic(10)

In [None]:
# %lprun -f gibs_it_sanic gibs_it_sanic(10)

In [None]:
t2w = pd.DataFrame(wordToTopic).T
t2w.columns = vocab

top_words_per_topic = [
    (t2w.iloc[k][t2w.iloc[k] > 0.001].sort_values(ascending=False).index.values.tolist()[:10])
    for k in topics
]



from tabulate import tabulate
print(tabulate(top_words_per_topic))

In [None]:

to_sort = []
for i in range(len(documentToTopic)):
    to_sort.append((documentToTopic[i][0], i))

to_sort.sort(key=lambda t: t[0], reverse=True)
result1 = df.content[to_sort[0][1]]
result2 = df.content[to_sort[1][1]]