In [1]:
%%time

import nltk
import numpy
import time
import random
import unidecode
import string
import heapq

import pandas as pd

from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize, WordNetLemmatizer
from numba import njit, jit, cuda
import numpy as np

nltk.download('stopwords')
nltk.download('wordnet')

%load_ext line_profiler


CPU times: user 1.65 s, sys: 1.11 s, total: 2.76 s
Wall time: 1.15 s


[nltk_data] Downloading package stopwords to /home/tmtoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tmtoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
AMOUNT_OF_TOPICS = 10

# top % of most common words are removed
TOP_K_WORD_REMOVAL = 0.001

# https://stats.stackexchange.com/questions/59684/what-are-typical-values-to-use-for-alpha-and-beta-in-latent-dirichlet-allocation/130876
# High alpha: many topics per document [low : 1 : high]
ALPHA            = 1
# High beta: topic has many different words [0 - 1]
BETA             = 0.1



In [3]:
%%time

df = pd.read_csv("news_dataset.csv")
#df = df[:10000]
df = df[df['content'].notna()]

CPU times: user 4.22 s, sys: 507 ms, total: 4.73 s
Wall time: 4.73 s


In [4]:
%%time

# remove accents, special characters and lowercase
# https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
df.content = df.content.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.lower().str.replace('[^a-z ]', '')
print(df.content.head())

0    washington eeaaaaeeeace    congressional repub...
1    after the bullet shells get counted the blood ...
2    when walt disneys bambi opened in  critics pra...
3    death may be the great equalizer but it isnt n...
4    seoul south korea     north koreas leader kim ...
Name: content, dtype: object
CPU times: user 10.3 s, sys: 707 ms, total: 11 s
Wall time: 11 s


In [5]:
%%time
print(len(df['content'][0]))
lemmatizer = WordNetLemmatizer()
def remove_singles(content):
    d = dict()
    for word in content.split(' '):
        word = lemmatizer.lemmatize(word)
        if word in d:
            d[word] += 1
        else:
            d[word] = 1
    
    new = ""
    for word, val in d.items():
        if val > 1 and word != '':
            new += (word + " ") * val
            
    return new

df['content'] = df['content'].apply(lambda content: remove_singles(content))
print(len(df['content'][0]))

5498
3316
CPU times: user 5min 32s, sys: 444 ms, total: 5min 33s
Wall time: 5min 33s


In [6]:
%%time

# remove stop words and single letters
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
stopwords_list = stopwords.words('english')
stopwords_list.extend(list(string.ascii_lowercase))
#pat = r'\b(?:{})\b'.format('|'.join(stopwords_list))
#df['content'] = df['content'].str.replace(pat, '')
#print(df.content.head())

CPU times: user 2.29 ms, sys: 8 µs, total: 2.3 ms
Wall time: 1.47 ms


In [7]:
%%time


fdict = dict()

if TOP_K_WORD_REMOVAL > 0:

    # remove common words
    for i in df.content.str.split(' '):
        i = list(filter(lambda x:x!="" , i))
        for word in i:
            if word != '':
                if word in fdict:
                    fdict[word] += 1
                else:
                    fdict[word] = 1

CPU times: user 27.9 s, sys: 1.28 s, total: 29.2 s
Wall time: 29.2 s


In [8]:
%%time

length = int(TOP_K_WORD_REMOVAL * len(fdict))
top_k_words = heapq.nlargest(length, fdict, key=fdict.get)

top_k_words.extend(stopwords_list)

pat = r'\b(?:{})\b'.format('|'.join(top_k_words))
print(pat)
df['content'] = df['content'].str.replace(pat, '')
print(df.content.head())

\b(?:the|a|to|of|and|in|that|it|for|is|on|he|wa|with|said|his|at|i|have|by|but|be|trump|from|are|not|ha|this|an|they|who|about|we|you|or|their|were|more|had|she|her|one|will|would|people|been|there|year|u|state|when|if|which|what|say|all|new|out|so|time|up|president|after|like|than|also|can|s|clinton|no|some|just|him|other|do|our|them|my|into|could|over|republican|mr|how|american|because|first|even|now|two|woman|day|country|get|many|most|make|percent|campaign|last|those|government|way|house|only|told|me|think|company|these|right|police|dont|against|white|where|news|any|know|going|did|before|group|while|want|may|i|me|my|myself|we|our|ours|ourselves|you|you're|you've|you'll|you'd|your|yours|yourself|yourselves|he|him|his|himself|she|she's|her|hers|herself|it|it's|its|itself|they|them|their|theirs|themselves|what|which|who|whom|this|that|that'll|these|those|am|is|are|was|were|be|been|being|have|has|had|having|do|does|did|doing|a|an|the|and|but|if|or|because|as|until|while|of|at|by|for|wit

In [9]:
%%time

# remove multiple spaces in a
df['content'] = df['content'].str.replace(r'\s+', ' ')
print(df.content.head())

0    congressional congressional congressional come...
1     blood blood blood window window see see see c...
2     walt walt walt disney disney disney disney di...
3    death death death death death death death deat...
4    south south south south south south south kore...
Name: content, dtype: object
CPU times: user 7.57 s, sys: 0 ns, total: 7.57 s
Wall time: 7.57 s


In [10]:
%%time

vocab_set = set()

def compute_dict(input):
    result = dict()

    for word in input.split(' '):
        if word != '':
            vocab_set.add(word)

df['content'].apply(lambda content: compute_dict(content))
vocab = list(vocab_set)

tmp = {}
count = 0
for word in vocab:
    tmp[word] = count
    count += 1
vocab = tmp

CPU times: user 4.02 s, sys: 0 ns, total: 4.02 s
Wall time: 4.03 s


In [19]:
%%time

#Toevoegen van extra column met index ineens
def remove_dupes(input):
    output = input.split(' ')
    output = list(filter(lambda x:x!="" , output))
    for i, value in enumerate(output):
        output[i] = vocab[value]
    return np.array(output)
df['content_list'] = df['content'].apply(lambda content: remove_dupes(content))

CPU times: user 5.75 s, sys: 3.99 ms, total: 5.75 s
Wall time: 5.75 s


In [20]:
%%time

numberOfTopics = AMOUNT_OF_TOPICS
topics = [i for i in range(0, numberOfTopics)]

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 10.7 µs


In [21]:
%%time

wordToTopic = numpy.zeros((len(vocab), numberOfTopics), dtype=numpy.int64)
documentToTopic = numpy.zeros((len(df.content), numberOfTopics), dtype=numpy.int64)
topicAssignment = numpy.zeros(len(df.content_list), dtype=numpy.ndarray)
words_in_topic = numpy.zeros(numberOfTopics, dtype=numpy.int64)
words_in_doc = [0 for i in range(len(df.content))]
for i, content in enumerate(df.content_list):
    tmp = np.zeros(len(content), dtype=numpy.int64)
    for word_index, word in enumerate(content):
        topic = topics[int(np.random.rand()*numberOfTopics)]
        tmp[word_index] = topic
        wordToTopic[word, topic] += 1
        documentToTopic[i, topic] += 1
        words_in_topic[topic] += 1
        words_in_doc[i] += 1
    topicAssignment[i] = tmp

CPU times: user 57.1 s, sys: 88.2 ms, total: 57.2 s
Wall time: 56.6 s


In [22]:
%%time

alpha = ALPHA
beta = BETA

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 5.72 µs


In [23]:
%%time

@njit
def calc_fast(rand_val, word_index, word, TA_d, doc_index, wordToTopic, words_in_topic, total_unique_word_b, document_words, WID):
    
        document_words[TA_d[word_index]] -= 1               
        wordToTopic[word, TA_d[word_index]] -= 1
        words_in_topic[TA_d[word_index]] -= 1

        new_topic = (np.multiply(np.divide((wordToTopic[word] + beta), 
                                                       (words_in_topic + total_unique_word_b)), 
                                             np.divide((document_words + alpha), 
                                                       (WID))))

        new_topic = np.divide(new_topic, np.sum(new_topic))
        new_topic = np.cumsum(new_topic)
        topic = 0
        for i in range(0, numberOfTopics):
            if rand_val[word_index] < new_topic[i]:
                topic = i
                break

        new_topic = topic
        TA_d[word_index] = new_topic
        document_words[new_topic] += 1
        wordToTopic[word, new_topic] += 1
        words_in_topic[new_topic] += 1
    
def gibs_it_sanic(i):
    total_unique_word_b = len(vocab) * beta
    topic_count_a = numberOfTopics * alpha
    for iteration in range(i):
        print("Iteration:", iteration + 1)
        
        for doc_index, word_list in enumerate(df.content_list):
            document_words = documentToTopic[doc_index]
            WID = words_in_doc[doc_index] + topic_count_a
            TA_d = topicAssignment[doc_index]
            
            rand_val = np.random.random(len(word_list))
            for word_index, word in enumerate(word_list):
                calc_fast(rand_val, word_index, word, TA_d, doc_index, wordToTopic, words_in_topic, total_unique_word_b, document_words, WID)

                


CPU times: user 246 µs, sys: 2 µs, total: 248 µs
Wall time: 252 µs


In [24]:
%%time
gibs_it_sanic(10)

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
CPU times: user 5min 24s, sys: 73.2 ms, total: 5min 25s
Wall time: 5min 24s


In [25]:
# %lprun -f gibs_it_sanic gibs_it_sanic(10)

In [26]:
t2w = pd.DataFrame(wordToTopic).T
t2w.columns = vocab

top_words_per_topic = [
    (t2w.iloc[k][t2w.iloc[k] > 0.001].sort_values(ascending=False).index.values.tolist()[:9])
    for k in topics
]



from tabulate import tabulate
print(tabulate(top_words_per_topic))

--------  --------  -------  -------  ---------  --------  ------------  --------------  -------------
court     law       case     justice  bill       federal   department    senate          investigation
health    million   billion  car      bank       market    apple         plan            cost
officer   city      attack   force    islamic    isi       iran          shooting        official
party     cruz      rubio    vote     york       election  voter         democrat        donald
game      team      player   season   de         back      world         play            sport
school    student   sander   work     fox        study     child         voter           university
official  russia    russian  attack   security   united    intelligence  administration  syria
party     election  donald   thing    candidate  voter     political     vote            thats
tax       china     north    policy   korea      business  trade         million         economic
life      show      thi

In [None]:

to_sort = []
for i in range(len(documentToTopic)):
    to_sort.append((documentToTopic[i][0], i))

to_sort.sort(key=lambda t: t[0], reverse=True)
result1 = df.content[to_sort[0][1]]
result2 = df.content[to_sort[1][1]]