In [14]:
%%time

import nltk
import numpy
import time
import random
import unidecode
import string
import heapq

import pandas as pd

from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from numba import njit, jit, cuda
import numpy as np

nltk.download('stopwords')

%load_ext line_profiler


The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler
CPU times: user 2.59 ms, sys: 0 ns, total: 2.59 ms
Wall time: 2.06 ms


[nltk_data] Downloading package stopwords to /home/tmtoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
AMOUNT_OF_TOPICS = 10

# top % of most common words are removed
TOP_K_WORD_REMOVAL = 0.001

# https://stats.stackexchange.com/questions/59684/what-are-typical-values-to-use-for-alpha-and-beta-in-latent-dirichlet-allocation/130876
# High alpha: many topics per document [low : 1 : high]
ALPHA            = 50 / AMOUNT_OF_TOPICS
# High beta: topic has many different words [0 - 1]
BETA             = 0.01



In [17]:
%%time

df = pd.read_csv("news_dataset.csv")
df = df[:10000]
df = df[df['content'].notna()]

CPU times: user 7.34 s, sys: 208 ms, total: 7.54 s
Wall time: 7.56 s


In [18]:
%%time

# remove accents, special characters and lowercase
# https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
df.content = df.content.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.lower().str.replace('[^a-z ]', '')
print(df.content.head())

0    washington eeaaaaeeeace    congressional repub...
1    after the bullet shells get counted the blood ...
2    when walt disneys bambi opened in  critics pra...
3    death may be the great equalizer but it isnt n...
4    seoul south korea     north koreas leader kim ...
Name: content, dtype: object
CPU times: user 2.05 s, sys: 3.94 ms, total: 2.05 s
Wall time: 2.06 s


In [19]:
%%time

# remove stop words and single letters
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
stopwords_list = stopwords.words('english')
stopwords_list.extend(list(string.ascii_lowercase))
#pat = r'\b(?:{})\b'.format('|'.join(stopwords_list))
#df['content'] = df['content'].str.replace(pat, '')
#print(df.content.head())

CPU times: user 532 µs, sys: 0 ns, total: 532 µs
Wall time: 572 µs


In [20]:
%%time

# remove common words
fdict = dict()
for i in df.content.str.split(' '):
    i = list(filter(lambda x:x!="" , i))
    for word in i:
        if word != '':
            if word in fdict:
                fdict[word] += 1
            else:
                fdict[word] = 1

CPU times: user 6.92 s, sys: 180 ms, total: 7.1 s
Wall time: 7.1 s


In [21]:
%%time

length = int(TOP_K_WORD_REMOVAL * len(fdict))
top_k_words = heapq.nlargest(length, fdict, key=fdict.get)

top_k_words.extend(stopwords_list)

pat = r'\b(?:{})\b'.format('|'.join(top_k_words))
print(pat)
df['content'] = df['content'].str.replace(pat, '')
print(df.content.head())

\b(?:the|to|a|of|and|in|that|for|on|he|mr|said|is|was|with|it|as|his|at|by|but|from|have|i|an|not|has|who|be|had|are|they|its|this|she|were|about|trump|her|their|or|would|more|one|you|been|we|new|which|when|people|will|after|like|what|out|there|if|some|also|all|up|than|ms|him|president|so|other|into|no|over|them|years|could|can|time|states|two|many|just|last|first|do|now|most|united|because|state|even|my|how|year|our|did|where|those|before|american|only|s|trumps|news|government|house|i|me|my|myself|we|our|ours|ourselves|you|you're|you've|you'll|you'd|your|yours|yourself|yourselves|he|him|his|himself|she|she's|her|hers|herself|it|it's|its|itself|they|them|their|theirs|themselves|what|which|who|whom|this|that|that'll|these|those|am|is|are|was|were|be|been|being|have|has|had|having|do|does|did|doing|a|an|the|and|but|if|or|because|as|until|while|of|at|by|for|with|about|against|between|into|through|during|before|after|above|below|to|from|up|down|in|out|on|off|over|under|again|further|then|o

In [22]:
%time

# remove multiple spaces in a
df['content'] = df['content'].str.replace(r'\s+', ' ')
print(df.content.head())

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.54 µs
0    washington eeaaaaeeeace congressional republic...
1     bullet shells get counted blood dries votive ...
2     walt disneys bambi opened critics praised spa...
3    death may great equalizer isnt necessarily eve...
4    seoul south korea north koreas leader kim sund...
Name: content, dtype: object


In [23]:
%%time

vocab_set = set()

def compute_dict(input):
    result = dict()

    for word in input.split(' '):
        if word != '':
            vocab_set.add(word)
            if word in result:
                result[word] += 1
            else:
                result[word] = 1
    return result

df['content_dict'] = df['content'].apply(lambda content: compute_dict(content))
vocab = list(vocab_set)

tmp ={}
count = 0
for word in vocab:
    tmp[word] = count
    count += 1
vocab = tmp

CPU times: user 1.78 s, sys: 63 µs, total: 1.78 s
Wall time: 1.78 s


In [24]:
%%time

#Toevoegen van extra column met index ineens
def remove_dupes(input):
    output = list(set(input.split(' ')))
    if '' in output:
        output.remove('')
    for i, value in enumerate(output):
        output[i] = vocab[value]
    return np.array(output)
df['content_list'] = df['content'].apply(lambda content: remove_dupes(content))

CPU times: user 2.67 s, sys: 17 µs, total: 2.67 s
Wall time: 2.68 s


In [25]:
%%time

numberOfTopics = AMOUNT_OF_TOPICS
topics = [i for i in range(0, numberOfTopics)]

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 11.4 µs


In [26]:
%%time

wordToTopic = numpy.zeros((len(vocab), numberOfTopics), dtype=numpy.int64)
documentToTopic = numpy.zeros((len(df.content), numberOfTopics))
topicAssignment = []
words_in_topic = numpy.zeros(numberOfTopics, dtype=numpy.int64)
words_in_doc = [0 for i in range(len(df.content))]
for i, content in enumerate(df.content_list):
    tmp = []
    for word in content:
        topic = numpy.random.choice(topics)
        tmp.append(topic)
        wordToTopic[word, topic] += 1
        documentToTopic[i, topic] += 1
        words_in_topic[topic] += 1
        words_in_doc[i] += 1
    topicAssignment.append(np.array(tmp))
    
topicAssignment = np.array(topicAssignment)

CPU times: user 45 s, sys: 28 ms, total: 45 s
Wall time: 45.2 s


In [27]:
%%time

alpha = ALPHA
beta = BETA

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs


In [28]:
%%time

@njit
def calc_fast(TA_d, doc_index, word_index, wordToTopic, word, words_in_topic, total_unique_word_b, document_words, WID):
    document_words[TA_d[word_index]] -= 1
    wordToTopic[word, TA_d[word_index]] -= 1
    words_in_topic[TA_d[word_index]] -= 1
                
    new_topic = numpy.argmax(np.multiply(np.divide((wordToTopic[word] + beta), 
                                                   (words_in_topic + total_unique_word_b)), 
                                         np.divide((document_words + alpha), 
                                                   (WID))))

    TA_d[word_index] = new_topic
    document_words[new_topic] += 1
    wordToTopic[word, new_topic] += 1
    words_in_topic[new_topic] += 1
    
def gibs_it_sanic(i):
    total_unique_word_b = len(vocab) * beta
    topic_count_a = numberOfTopics * alpha
    for iteration in range(i):
        print("Iteration:", iteration + 1)
        
        for doc_index, word_list in enumerate(df.content_list):
            document_words = documentToTopic[doc_index]
            WID = words_in_doc[doc_index] + topic_count_a
            TA_d = topicAssignment[doc_index]
            
            for word_index, word in enumerate(word_list):
                calc_fast(TA_d, doc_index, word_index, wordToTopic, word, words_in_topic, total_unique_word_b, document_words, WID)

                


CPU times: user 623 µs, sys: 5 µs, total: 628 µs
Wall time: 639 µs


In [35]:
%%time
gibs_it_sanic(10)

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
CPU times: user 56 s, sys: 32 ms, total: 56.1 s
Wall time: 56.2 s


In [30]:
# %lprun -f gibs_it_sanic gibs_it_sanic(10)

In [36]:
t2w = pd.DataFrame(wordToTopic).T
t2w.columns = vocab

top_words_per_topic = [
    (t2w.iloc[k][t2w.iloc[k] > 0.001].sort_values(ascending=False).index.values.tolist()[:9])
    for k in topics
]



from tabulate import tabulate
print(tabulate(top_words_per_topic))

---------  -------  -----------  --------------  -------  --------  ----------  --------  -------------
began      cant     immediately  ground          speak    visit     criticized  de        pass
know       good     might        white           help     members   look        whose     security
well       day      told         thats           making   history   instead     done      together
back       never    took         ways            billion  fall      career      served    trade
think      right    left         others          yet      lot       change      trying    seen
including  since    around       city            man      big       taken       thursday  international
made       three    far          need            second   kind      social      reported  line
say        times    officials    twitter         show     law       university  wanted    coming
make       way      may          still           much     get       dont        another   going
see        country  

In [None]:

to_sort = []
for i in range(len(documentToTopic)):
    to_sort.append((documentToTopic[i][0], i))

to_sort.sort(key=lambda t: t[0], reverse=True)
result1 = df.content[to_sort[0][1]]
result2 = df.content[to_sort[1][1]]