In [1]:
%%time

import nltk
import numpy
import time
import random
import unidecode
import string
import heapq

import pandas as pd

from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize, WordNetLemmatizer
from numba import njit, jit, cuda
import numpy as np

nltk.download('stopwords')
nltk.download('wordnet')

%load_ext line_profiler


CPU times: user 1.99 s, sys: 1.13 s, total: 3.12 s
Wall time: 2.29 s


[nltk_data] Downloading package stopwords to /home/tmtoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tmtoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
AMOUNT_OF_TOPICS = 20

# top % of most common words are removed
TOP_K_WORD_REMOVAL = 0.001

# https://stats.stackexchange.com/questions/59684/what-are-typical-values-to-use-for-alpha-and-beta-in-latent-dirichlet-allocation/130876
# High alpha: many topics per document [low : 1 : high]
ALPHA            = 50 / AMOUNT_OF_TOPICS
# High beta: topic has many different words [0 - 1]
BETA             = 0.1



In [3]:
%%time

df = pd.read_csv("news_dataset.csv")
#df = df[:25000]
df = df[df['content'].notna()]

df["documents"] = df["content"]

CPU times: user 8.32 s, sys: 967 ms, total: 9.29 s
Wall time: 9.87 s


In [4]:
%%time

# remove accents, special characters and lowercase
# https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
df.content = df.content.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.lower().str.replace('[^a-z ]', '')
print(df.content.head())

0    washington eeaaaaeeeace    congressional repub...
1    after the bullet shells get counted the blood ...
2    when walt disneys bambi opened in  critics pra...
3    death may be the great equalizer but it isnt n...
4    seoul south korea     north koreas leader kim ...
Name: content, dtype: object
CPU times: user 16.9 s, sys: 977 ms, total: 17.9 s
Wall time: 17.9 s


In [5]:
%%time
print(len(df['content'][0]))
lemmatizer = WordNetLemmatizer()
def remove_singles(content):
    d = dict()
    for word in content.split(' '):
        word = lemmatizer.lemmatize(word)
        if word in d:
            d[word] += 1
        else:
            d[word] = 1
    
    new = ""
    for word, val in d.items():
        if val > 1 and word != '':
            new += (word + " ") * val
            
    return new

df['content'] = df['content'].apply(lambda content: remove_singles(content))
print(len(df['content'][0]))

5498
3316
CPU times: user 7min, sys: 630 ms, total: 7min
Wall time: 7min


In [6]:
%%time

# remove stop words and single letters
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
stopwords_list = stopwords.words('english')
stopwords_list.extend(list(string.ascii_lowercase))
#pat = r'\b(?:{})\b'.format('|'.join(stopwords_list))
#df['content'] = df['content'].str.replace(pat, '')
#print(df.content.head())

CPU times: user 1.99 ms, sys: 3 µs, total: 2 ms
Wall time: 1.26 ms


In [7]:
%%time


fdict = dict()

if TOP_K_WORD_REMOVAL > 0:

    # remove common words
    for i in df.content.str.split(' '):
        i = list(filter(lambda x:x!="" , i))
        for word in i:
            if word != '':
                if word in fdict:
                    fdict[word] += 1
                else:
                    fdict[word] = 1

CPU times: user 30.3 s, sys: 4.13 s, total: 34.4 s
Wall time: 45.8 s


In [8]:
%%time

length = int(TOP_K_WORD_REMOVAL * len(fdict))
top_k_words = heapq.nlargest(length, fdict, key=fdict.get)

top_k_words.extend(stopwords_list)

pat = r'\b(?:{})\b'.format('|'.join(top_k_words))
print(pat)
df['content'] = df['content'].str.replace(pat, '')
print(df.content.head())

\b(?:the|a|to|of|and|in|that|it|for|is|on|he|wa|with|said|his|at|i|have|by|but|be|trump|from|are|not|ha|this|an|they|who|about|we|you|or|their|were|more|had|she|her|one|will|would|people|been|there|year|u|state|when|if|which|what|say|all|new|out|so|time|up|president|after|like|than|also|can|s|clinton|no|some|just|him|other|do|our|them|my|into|could|over|republican|mr|how|american|because|first|even|now|two|woman|day|country|get|many|most|make|percent|campaign|last|those|government|way|house|only|told|me|think|company|these|right|police|dont|against|white|where|news|any|know|going|did|before|group|while|want|may|i|me|my|myself|we|our|ours|ourselves|you|you're|you've|you'll|you'd|your|yours|yourself|yourselves|he|him|his|himself|she|she's|her|hers|herself|it|it's|its|itself|they|them|their|theirs|themselves|what|which|who|whom|this|that|that'll|these|those|am|is|are|was|were|be|been|being|have|has|had|having|do|does|did|doing|a|an|the|and|but|if|or|because|as|until|while|of|at|by|for|wit

In [9]:
%%time

# remove multiple spaces in a
df['content'] = df['content'].str.replace(r'\s+', ' ')
print(df.content.head())

0    congressional congressional congressional come...
1     blood blood blood window window see see see c...
2     walt walt walt disney disney disney disney di...
3    death death death death death death death deat...
4    south south south south south south south kore...
Name: content, dtype: object
CPU times: user 9.59 s, sys: 0 ns, total: 9.59 s
Wall time: 9.59 s


In [10]:
%%time

vocab_set = set()

def compute_dict(input):
    result = dict()

    for word in input.split(' '):
        if word != '':
            vocab_set.add(word)

df['content'].apply(lambda content: compute_dict(content))
vocab = list(vocab_set)

tmp = {}
count = 0
for word in vocab:
    tmp[word] = count
    count += 1
vocab = tmp

CPU times: user 5.03 s, sys: 10 µs, total: 5.03 s
Wall time: 5.03 s


In [11]:
%%time

#Toevoegen van extra column met index ineens
def remove_dupes(input):
    output = input.split(' ')
    output = list(filter(lambda x:x!="" , output))
    for i, value in enumerate(output):
        output[i] = vocab[value]
    return np.array(output)
df['content_list'] = df['content'].apply(lambda content: remove_dupes(content))

CPU times: user 9.41 s, sys: 4.02 ms, total: 9.41 s
Wall time: 9.41 s


In [12]:
%%time

numberOfTopics = AMOUNT_OF_TOPICS
topics = [i for i in range(0, numberOfTopics)]

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 15 µs


In [13]:
%%time

wordToTopic = numpy.zeros((len(vocab), numberOfTopics), dtype=numpy.int64)
documentToTopic = numpy.zeros((len(df.content), numberOfTopics), dtype=numpy.int64)
topicAssignment = numpy.zeros(len(df.content_list), dtype=numpy.ndarray)
words_in_topic = numpy.zeros(numberOfTopics, dtype=numpy.int64)
words_in_doc = [0 for i in range(len(df.content))]
for i, content in enumerate(df.content_list):
    tmp = np.zeros(len(content), dtype=numpy.int64)
    for word_index, word in enumerate(content):
        topic = topics[int(np.random.rand()*numberOfTopics)]
        tmp[word_index] = topic
        wordToTopic[word, topic] += 1
        documentToTopic[i, topic] += 1
        words_in_topic[topic] += 1
        words_in_doc[i] += 1
    topicAssignment[i] = tmp

CPU times: user 1min 17s, sys: 107 ms, total: 1min 17s
Wall time: 1min 16s


In [14]:
%%time

alpha = ALPHA
beta = BETA

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.11 µs


In [15]:
%%time

@njit
def calc_fast(rand_val, word_index, word, TA_d, doc_index, wordToTopic, words_in_topic, total_unique_word_b, document_words, WID):
    
        document_words[TA_d[word_index]] -= 1               
        wordToTopic[word, TA_d[word_index]] -= 1
        words_in_topic[TA_d[word_index]] -= 1

        new_topic = (np.multiply(np.divide((wordToTopic[word] + beta), 
                                                       (words_in_topic + total_unique_word_b)), 
                                             np.divide((document_words + alpha), 
                                                       (WID))))

        new_topic = np.divide(new_topic, np.sum(new_topic))
        new_topic = np.cumsum(new_topic)
        topic = 0
        for i in range(0, numberOfTopics):
            if rand_val[word_index] < new_topic[i]:
                topic = i
                break

        new_topic = topic
        TA_d[word_index] = new_topic
        document_words[new_topic] += 1
        wordToTopic[word, new_topic] += 1
        words_in_topic[new_topic] += 1
    
def gibs_it_sanic(i):
    total_unique_word_b = len(vocab) * beta
    topic_count_a = numberOfTopics * alpha
    for iteration in range(i):
        print("Iteration:", iteration + 1)
        
        for doc_index, word_list in enumerate(df.content_list):
            document_words = documentToTopic[doc_index]
            WID = words_in_doc[doc_index] + topic_count_a
            TA_d = topicAssignment[doc_index]
            
            rand_val = np.random.random(len(word_list))
            for word_index, word in enumerate(word_list):
                calc_fast(rand_val, word_index, word, TA_d, doc_index, wordToTopic, words_in_topic, total_unique_word_b, document_words, WID)

                


CPU times: user 323 µs, sys: 0 ns, total: 323 µs
Wall time: 329 µs


In [16]:
%%time
gibs_it_sanic(25)

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
CPU times: user 17min 23s, sys: 998 ms, total: 17min 24s
Wall time: 17min 25s


In [17]:
# %lprun -f gibs_it_sanic gibs_it_sanic(10)

In [18]:
t2w = pd.DataFrame(wordToTopic).T
t2w.columns = vocab

top_words_per_topic = [
    (t2w.iloc[k][t2w.iloc[k] > 0.001].sort_values(ascending=False).index.values.tolist()[:9])
    for k in topics
]


from tabulate import tabulate
print(tabulate(top_words_per_topic))

--------  -------------  ----------  ----------  -----------  ------------  ---------  ---------  ------------
officer   twitter        medium      gun         video        shooting      breitbart  fox        march
official  investigation  email       department  report       intelligence  fbi        security   russian
law       court          case        federal     immigration  order         judge      justice    crime
black     america        food        history     political    world         movement   much       today
attack    killed         french      authority   france       man           migrant    refugee    germany
show      film           star        music       song         movie         world      series     character
force     military       war         syria       russia       isi           united     iran       syrian
school    student        university  men         college      child         girl       education  young
muslim    speech         world       israel     

In [19]:
t2d = pd.DataFrame(documentToTopic.T)

top_docs_per_topic = [
    (t2d.iloc[k][t2d.iloc[k] > 0.001].sort_values(ascending=False).index.values.tolist()[:5])
    for k in topics
]


from tabulate import tabulate
print(tabulate(top_docs_per_topic))

------  ------  ------  ------  ------
138437   28739  139018   13282   73706
132091  133097  100946   38569      89
138437   36980   67991   71781  140995
   270  130648   73143   77505   53226
 36060   25008   24865   21849    8796
 17316  104846  129968  108026   56867
 63068   61499   36060   69997  127361
 53693   53566   53580   53427   53790
126572   51165   53551    4659   71452
 83788   69253   87362   67991   53559
 66870   67980   70780   66660  136425
 70605  126885  126345  112542  130005
136014    1162   32721    6262  131434
 50498   50967   50581   51104   51344
 18159   52518   55656   56190   17436
 50716  106921    3423   50027    3435
 23284   54137   26027   56268   53992
101692   51665   15664   43807   56974
103298   68442   68526    6407  100502
 53924   72247   50319    1748   71781
------  ------  ------  ------  ------


In [20]:
%%time

docs = len(df.content_list)
score = 0
for tops in top_words_per_topic:
    for i in range(0, len(tops)-2):
        w_i = tops[i]
        w_j = tops[i+1]
        
        c1 = 0
        c2 = 0
        c3 = 0
        for item in df.content_list:
            tmp = 0
            if vocab[w_i] in item:
                c1 += 1
                tmp += 1
            if vocab[w_j] in item:
                c2 += 1
                tmp += 1
            if tmp == 2:
                c3 += 1
        
        a = c3 / docs
        b = c1 / docs
        c = c2 / docs
        if b != 0 and c != 0:
            score -= np.log(a / b * c)
score /= len(top_words_per_topic)
print(score)

36.25233180059962
CPU times: user 2min 28s, sys: 15.1 ms, total: 2min 28s
Wall time: 2min 28s


In [24]:
t2d = pd.DataFrame(documentToTopic.T)

top_docs_per_topic = [
    (t2d.iloc[k][t2d.iloc[k] > 0.001].sort_values(ascending=False).index.values.tolist()[:100])
    for k in topics
]

a = np.asarray(top_docs_per_topic)
a = a.transpose()
np.savetxt("topic_ids_", a, delimiter=",")

[[138437  28739 139018 ... 105805  86492  67004]
 [132091 133097 100946 ...  63950  44649 129440]
 [138437  36980  67991 ...  71620  96141  66938]
 ...
 [101692  51665  15664 ... 134701  95468 129016]
 [103298  68442  68526 ...  35138 104626 138394]
 [ 53924  72247  50319 ... 126506   2669  52058]]
[[138437 132091 138437 ... 101692 103298  53924]
 [ 28739 133097  36980 ...  51665  68442  72247]
 [139018 100946  67991 ...  15664  68526  50319]
 ...
 [105805  63950  71620 ... 134701  35138 126506]
 [ 86492  44649  96141 ...  95468 104626   2669]
 [ 67004 129440  66938 ... 129016 138394  52058]]
