In [1]:
%%time

import nltk
import numpy
import time
import random
import unidecode
import string
import heapq

import pandas as pd

from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize, WordNetLemmatizer
from numba import njit, jit, cuda
import numpy as np

nltk.download('stopwords')
nltk.download('wordnet')

%load_ext line_profiler


CPU times: user 1.69 s, sys: 994 ms, total: 2.69 s
Wall time: 1.08 s


[nltk_data] Downloading package stopwords to /home/tmtoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tmtoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
AMOUNT_OF_TOPICS = 20

# top % of most common words are removed
TOP_K_WORD_REMOVAL = 0.001

# https://stats.stackexchange.com/questions/59684/what-are-typical-values-to-use-for-alpha-and-beta-in-latent-dirichlet-allocation/130876
# High alpha: many topics per document [low : 1 : high]
ALPHA            = 50 / AMOUNT_OF_TOPICS
# High beta: topic has many different words [0 - 1]
BETA             = 0.1



In [3]:
%%time

df = pd.read_csv("news_dataset.csv")
#df = df[:10000]
df = df[df['content'].notna()]

df["documents"] = df["content"]

CPU times: user 4.42 s, sys: 445 ms, total: 4.87 s
Wall time: 4.86 s


In [4]:
%%time

# remove accents, special characters and lowercase
# https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
df.content = df.content.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.lower().str.replace('[^a-z ]', '')
print(df.content.head())

0    washington eeaaaaeeeace    congressional repub...
1    after the bullet shells get counted the blood ...
2    when walt disneys bambi opened in  critics pra...
3    death may be the great equalizer but it isnt n...
4    seoul south korea     north koreas leader kim ...
Name: content, dtype: object
CPU times: user 11.2 s, sys: 647 ms, total: 11.9 s
Wall time: 11.9 s


In [5]:
%%time
print(len(df['content'][0]))
lemmatizer = WordNetLemmatizer()
def remove_singles(content):
    d = dict()
    for word in content.split(' '):
        word = lemmatizer.lemmatize(word)
        if word in d:
            d[word] += 1
        else:
            d[word] = 1
    
    new = ""
    for word, val in d.items():
        if val > 1 and word != '':
            new += (word + " ") * val
            
    return new

df['content'] = df['content'].apply(lambda content: remove_singles(content))
print(len(df['content'][0]))

5498
3316
CPU times: user 5min 52s, sys: 521 ms, total: 5min 53s
Wall time: 5min 53s


In [6]:
%%time

# remove stop words and single letters
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
stopwords_list = stopwords.words('english')
stopwords_list.extend(list(string.ascii_lowercase))
#pat = r'\b(?:{})\b'.format('|'.join(stopwords_list))
#df['content'] = df['content'].str.replace(pat, '')
#print(df.content.head())

CPU times: user 2.1 ms, sys: 0 ns, total: 2.1 ms
Wall time: 1.44 ms


In [7]:
%%time


fdict = dict()

if TOP_K_WORD_REMOVAL > 0:

    # remove common words
    for i in df.content.str.split(' '):
        i = list(filter(lambda x:x!="" , i))
        for word in i:
            if word != '':
                if word in fdict:
                    fdict[word] += 1
                else:
                    fdict[word] = 1

CPU times: user 28.9 s, sys: 1.2 s, total: 30.1 s
Wall time: 30.1 s


In [8]:
%%time

length = int(TOP_K_WORD_REMOVAL * len(fdict))
top_k_words = heapq.nlargest(length, fdict, key=fdict.get)

top_k_words.extend(stopwords_list)

pat = r'\b(?:{})\b'.format('|'.join(top_k_words))
print(pat)
df['content'] = df['content'].str.replace(pat, '')
print(df.content.head())

\b(?:the|a|to|of|and|in|that|it|for|is|on|he|wa|with|said|his|at|i|have|by|but|be|trump|from|are|not|ha|this|an|they|who|about|we|you|or|their|were|more|had|she|her|one|will|would|people|been|there|year|u|state|when|if|which|what|say|all|new|out|so|time|up|president|after|like|than|also|can|s|clinton|no|some|just|him|other|do|our|them|my|into|could|over|republican|mr|how|american|because|first|even|now|two|woman|day|country|get|many|most|make|percent|campaign|last|those|government|way|house|only|told|me|think|company|these|right|police|dont|against|white|where|news|any|know|going|did|before|group|while|want|may|i|me|my|myself|we|our|ours|ourselves|you|you're|you've|you'll|you'd|your|yours|yourself|yourselves|he|him|his|himself|she|she's|her|hers|herself|it|it's|its|itself|they|them|their|theirs|themselves|what|which|who|whom|this|that|that'll|these|those|am|is|are|was|were|be|been|being|have|has|had|having|do|does|did|doing|a|an|the|and|but|if|or|because|as|until|while|of|at|by|for|wit

In [9]:
%%time

# remove multiple spaces in a
df['content'] = df['content'].str.replace(r'\s+', ' ')
print(df.content.head())

0    congressional congressional congressional come...
1     blood blood blood window window see see see c...
2     walt walt walt disney disney disney disney di...
3    death death death death death death death deat...
4    south south south south south south south kore...
Name: content, dtype: object
CPU times: user 7.75 s, sys: 7.96 ms, total: 7.76 s
Wall time: 7.76 s


In [10]:
%%time

vocab_set = set()

def compute_dict(input):
    result = dict()

    for word in input.split(' '):
        if word != '':
            vocab_set.add(word)

df['content'].apply(lambda content: compute_dict(content))
vocab = list(vocab_set)

tmp = {}
count = 0
for word in vocab:
    tmp[word] = count
    count += 1
vocab = tmp

CPU times: user 4.08 s, sys: 11 µs, total: 4.08 s
Wall time: 4.07 s


In [11]:
%%time

#Toevoegen van extra column met index ineens
def remove_dupes(input):
    output = input.split(' ')
    output = list(filter(lambda x:x!="" , output))
    for i, value in enumerate(output):
        output[i] = vocab[value]
    return np.array(output)
df['content_list'] = df['content'].apply(lambda content: remove_dupes(content))

CPU times: user 7.7 s, sys: 17 µs, total: 7.7 s
Wall time: 7.7 s


In [12]:
%%time

numberOfTopics = AMOUNT_OF_TOPICS
topics = [i for i in range(0, numberOfTopics)]

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 13.1 µs


In [13]:
%%time

wordToTopic = numpy.zeros((len(vocab), numberOfTopics), dtype=numpy.int64)
documentToTopic = numpy.zeros((len(df.content), numberOfTopics), dtype=numpy.int64)
topicAssignment = numpy.zeros(len(df.content_list), dtype=numpy.ndarray)
words_in_topic = numpy.zeros(numberOfTopics, dtype=numpy.int64)
words_in_doc = [0 for i in range(len(df.content))]
for i, content in enumerate(df.content_list):
    tmp = np.zeros(len(content), dtype=numpy.int64)
    for word_index, word in enumerate(content):
        topic = topics[int(np.random.rand()*numberOfTopics)]
        tmp[word_index] = topic
        wordToTopic[word, topic] += 1
        documentToTopic[i, topic] += 1
        words_in_topic[topic] += 1
        words_in_doc[i] += 1
    topicAssignment[i] = tmp

CPU times: user 1min 5s, sys: 107 ms, total: 1min 5s
Wall time: 1min 5s


In [14]:
%%time

alpha = ALPHA
beta = BETA

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


In [15]:
%%time

@njit
def calc_fast(rand_val, word_index, word, TA_d, doc_index, wordToTopic, words_in_topic, total_unique_word_b, document_words, WID):
    
        document_words[TA_d[word_index]] -= 1               
        wordToTopic[word, TA_d[word_index]] -= 1
        words_in_topic[TA_d[word_index]] -= 1

        new_topic = (np.multiply(np.divide((wordToTopic[word] + beta), 
                                                       (words_in_topic + total_unique_word_b)), 
                                             np.divide((document_words + alpha), 
                                                       (WID))))

        new_topic = np.divide(new_topic, np.sum(new_topic))
        new_topic = np.cumsum(new_topic)
        topic = 0
        for i in range(0, numberOfTopics):
            if rand_val[word_index] < new_topic[i]:
                topic = i
                break

        new_topic = topic
        TA_d[word_index] = new_topic
        document_words[new_topic] += 1
        wordToTopic[word, new_topic] += 1
        words_in_topic[new_topic] += 1
    
def gibs_it_sanic(i):
    total_unique_word_b = len(vocab) * beta
    topic_count_a = numberOfTopics * alpha
    for iteration in range(i):
        print("Iteration:", iteration + 1)
        
        for doc_index, word_list in enumerate(df.content_list):
            document_words = documentToTopic[doc_index]
            WID = words_in_doc[doc_index] + topic_count_a
            TA_d = topicAssignment[doc_index]
            
            rand_val = np.random.random(len(word_list))
            for word_index, word in enumerate(word_list):
                calc_fast(rand_val, word_index, word, TA_d, doc_index, wordToTopic, words_in_topic, total_unique_word_b, document_words, WID)

                


CPU times: user 251 µs, sys: 2 µs, total: 253 µs
Wall time: 258 µs


In [36]:
%%time
gibs_it_sanic(10)

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
CPU times: user 5min 43s, sys: 76.7 ms, total: 5min 43s
Wall time: 5min 42s


In [37]:
# %lprun -f gibs_it_sanic gibs_it_sanic(10)

In [61]:
t2w = pd.DataFrame(wordToTopic).T
t2w.columns = vocab

top_words_per_topic = [
    (t2w.iloc[k][t2w.iloc[k] > 0.001].sort_values(ascending=False).index.values.tolist()[:9])
    for k in topics
]


from tabulate import tabulate
print(tabulate(top_words_per_topic))

---------  -------  ----------  -------------  ---------  ------------  ---------  -----------  ----------
official   russia   russian     investigation  email      intelligence  security   former       fbi
city       york     million     border         wall       mexico        building   space        foundation
medium     donald   twitter     hillary        march      fox           breitbart  post         speech
tax        billion  market      bank           million    rate          price      economy      financial
court      law      case        justice        order      federal       judge      immigration  department
child      drug     study       health         patient    found         human      doctor       medical
im         thing    life        story          friend     didnt         really     something    never
bill       health   plan        care           law        program       act        insurance    abortion
attack     muslim   terrorist   christian      family     c

In [140]:
t2d = pd.DataFrame(documentToTopic.T)

top_docs_per_topic = [
    (t2d.iloc[k][t2d.iloc[k] > 0.001].sort_values(ascending=False).index.values.tolist()[:5])
    for k in topics
]


from tabulate import tabulate
print(tabulate(top_docs_per_topic))

------  ------  ------  ------  ------
133097  132091      89  100946   38569
 51426   51737   50498   51466   51344
 13282   30214   18159   22223   17436
 70605  126345    3093   97255  112542
 71781   96194  128345   36980   73238
 50716  106921    3423   35610   50027
 53790   53787  108026   53551    1162
126885  130005  126218    1115  129303
138437  139018   28318   36060  126572
 56268   53760   53750   53757   53801
 96506  103633  136425   98369   97459
 73706   15664   28739  101692   36978
 53784   43807   53446   26645   51665
106931    6605   68442   69574   68526
 67991   53534   24865  140995   53462
 66870   67980   70780   66660   23284
129968  130648   68824   67169  129113
 56190   55656   54738   55118   52518
 36060    2754   69997   61499   63068
136425   30052  132885   44059   53466
------  ------  ------  ------  ------


In [135]:
print(df["documents"][89])



In [141]:
%%time

docs = len(df.content_list)
score = 0
for tops in top_words_per_topic:
    for i in range(0, len(tops)-2):
        w_i = tops[i]
        w_j = tops[i+1]
        
        c1 = 0
        c2 = 0
        c3 = 0
        for item in df.content_list:
            tmp = 0
            if vocab[w_i] in item:
                c1 += 1
                tmp += 1
            if vocab[w_j] in item:
                c2 += 1
                tmp += 1
            if tmp == 2:
                c3 += 1
        
        score += np.log((c3 / docs) / (c1 / docs) * (c2 / docs))
print(score)

-689.0471412209167
CPU times: user 2min 1s, sys: 27.6 ms, total: 2min 1s
Wall time: 2min 1s
