In [70]:
import nltk
import numpy
import time
import random
import unidecode

import pandas as pd

from nltk.corpus import stopwords

nltk.download('stopwords')

%load_ext line_profiler


The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


[nltk_data] Downloading package stopwords to /home/tmtoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [71]:
# Only the first N_DOCS articles are used, so let the parser stop there
# instead of reading the whole CSV and slicing it afterwards.
N_DOCS = 100
df = pd.read_csv("news_dataset.csv", nrows=N_DOCS)

In [72]:
# Remove accents, lowercase, and drop every character that is not a-z or a space.
# https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
# regex=True is passed explicitly: the patterns below are regular expressions,
# and pandas >= 2.0 treats the pattern as a literal string by default.
df.content = (
    df.content
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.lower()
    .str.replace('[^a-z ]', '', regex=True)
)
print(df.content.head())

# Remove stop words.
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
# The list is bound to its own name so the imported `stopwords` corpus module
# is not overwritten and this cell can be re-run.
# NOTE(review): apostrophes were stripped from the text above, so stop words
# that contain one (e.g. "isn't") can no longer match and survive as "isnt".
stop_words = stopwords.words('english')
pat = r'\b(?:{})\b'.format('|'.join(stop_words))
df['content'] = df['content'].str.replace(pat, '', regex=True)


# Collapse runs of whitespace into a single space.
df['content'] = df['content'].str.replace(r'\s+', ' ', regex=True)
# Drop rows without content; the index is not reset, so labels may have gaps.
df = df[df['content'].notna()]
print(df.content.head())


0    washington eeaaaaeeeace    congressional repub...
1    after the bullet shells get counted the blood ...
2    when walt disneys bambi opened in  critics pra...
3    death may be the great equalizer but it isnt n...
4    seoul south korea     north koreas leader kim ...
Name: content, dtype: object
0    washington eeaaaaeeeace congressional republic...
1     bullet shells get counted blood dries votive ...
2     walt disneys bambi opened critics praised spa...
3    death may great equalizer isnt necessarily eve...
4    seoul south korea north koreas leader kim said...
Name: content, dtype: object


In [73]:
# Every distinct token that compute_dict has counted so far, across all calls.
vocab_set = set()


def compute_dict(input):
    """Count how often each token occurs in a space-separated string.

    Empty tokens (produced by consecutive spaces) are skipped. As a side
    effect, every counted token is also added to the module-level
    ``vocab_set``.

    Returns a dict mapping token -> number of occurrences.
    """
    counts = {}
    for token in input.split(' '):
        if token == '':
            continue
        vocab_set.add(token)
        counts[token] = counts.get(token, 0) + 1
    return counts

# Per-document word counts; calling compute_dict also fills vocab_set.
df['content_dict'] = df['content'].apply(compute_dict)

# Map every word to an integer id (0 .. len(vocab) - 1).
# The ids are taken from the sorted vocabulary: iteration order of a set of
# strings can change between interpreter runs (string hash randomisation),
# which would otherwise give the same word a different id after a restart.
vocab = {word: index for index, word in enumerate(sorted(vocab_set))}

In [74]:
# Add an extra column that holds each document's words directly as vocabulary indices
def remove_dupes(input):
    """Return the ``vocab`` ids of the distinct, non-empty tokens in ``input``.

    ``input`` is split on single spaces; duplicates collapse because the
    tokens pass through a set, so the order of the result is the set's
    iteration order.
    """
    return [vocab[token] for token in set(input.split(' ')) if token != '']
df['content_list'] = df['content'].apply(remove_dupes)

In [75]:
# Number of topics, and the list of topic ids 0 .. numberOfTopics - 1.
numberOfTopics = 20
topics = list(range(numberOfTopics))

In [76]:
# Seeded local generator so the random initial assignment (and therefore the
# whole run) can be reproduced after a kernel restart.
RANDOM_SEED = 0
rng = numpy.random.RandomState(RANDOM_SEED)

num_docs = len(df.content)

# Count tables, all filled from the same random initial assignment:
#   wordToTopic[w, t]     - how often word id w is assigned to topic t
#   documentToTopic[d, t] - how many words of document d are assigned to topic t
#   words_in_topic[t]     - total number of words assigned to topic t
#   words_in_doc[d]       - number of word slots in document d
#   topicAssignment[d][k] - topic of the k-th entry of df.content_list for document d
wordToTopic = numpy.zeros((len(vocab), numberOfTopics))
documentToTopic = numpy.zeros((num_docs, numberOfTopics))
topicAssignment = []
words_in_topic = [0] * numberOfTopics
words_in_doc = [0] * num_docs
for doc_index, content in enumerate(df.content_list):
    # One uniformly random topic per word slot of this document.
    doc_topics = rng.choice(topics, size=len(content)).tolist()
    for word, topic in zip(content, doc_topics):
        wordToTopic[word, topic] += 1
        documentToTopic[doc_index, topic] += 1
        words_in_topic[topic] += 1
    words_in_doc[doc_index] = len(content)
    topicAssignment.append(doc_topics)

In [77]:
# Smoothing constants read by gibs_it: alpha is added to the per-document
# topic counts, beta to the per-topic word counts.
alpha, beta = 1, 0.001

In [78]:
def gibs_it(i, seed=None):
    """Run ``i`` sweeps of collapsed Gibbs sampling over all word slots.

    For every word slot of every document the current topic is removed from
    the global count tables (``documentToTopic``, ``wordToTopic``,
    ``words_in_topic``), a new topic is drawn with probability proportional to

        (wordToTopic[word, t] + beta) / (words_in_topic[t] + len(vocab) * beta)
            * (documentToTopic[doc, t] + alpha)

    and the tables and ``topicAssignment`` are updated in place.

    Parameters
    ----------
    i : int
        Number of full passes over the corpus.
    seed : int or None, optional
        Seed for the sampler. ``None`` (the default) seeds from the operating
        system, so repeated calls give different draws.

    Returns
    -------
    None
        All results are written to the module-level count tables.
    """
    rng = numpy.random.RandomState(seed)
    total_unique_word_b = len(vocab) * beta

    for iteration in range(i):
        print("Iteration:", iteration + 1)

        for doc_index, word_list in enumerate(df.content_list):
            # Row view: in-place updates to documentToTopic show up here too.
            doc_topic_counts = documentToTopic[doc_index]
            doc_assignments = topicAssignment[doc_index]

            for word_index, word in enumerate(word_list):
                # Take the current assignment of this word slot out of the counts.
                old_topic = doc_assignments[word_index]
                documentToTopic[doc_index, old_topic] -= 1
                wordToTopic[word, old_topic] -= 1
                words_in_topic[old_topic] -= 1

                # Unnormalised conditional for all topics at once. The
                # per-document denominator is the same for every topic and
                # cancels in the normalisation below, so it is left out.
                weights = (
                    (wordToTopic[word] + beta)
                    / (numpy.asarray(words_in_topic) + total_unique_word_b)
                    * (doc_topic_counts + alpha)
                )

                # Draw the new topic instead of taking the most probable one:
                # always picking the maximum is a greedy mode search, not
                # Gibbs sampling.
                new_topic = int(rng.choice(topics, p=weights / weights.sum()))

                doc_assignments[word_index] = new_topic
                documentToTopic[doc_index, new_topic] += 1
                wordToTopic[word, new_topic] += 1
                words_in_topic[new_topic] += 1


In [None]:
# Line-by-line profile of five sweeps. gibs_it updates the global count tables
# in place, so this call changes the state that later cells start from.
%lprun -f gibs_it gibs_it(5)

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [79]:
%%time

# Times five sweeps. gibs_it works on the global count tables in place, so the
# sweeps continue from whatever state earlier gibs_it calls left behind.
gibs_it(5)

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
CPU times: user 5.49 s, sys: 3.84 ms, total: 5.49 s
Wall time: 5.5 s


In [11]:

# Rank documents by their count for topic 0, highest first.
# Each entry is (topic-0 count, row position in documentToTopic).
to_sort = [(documentToTopic[i][0], i) for i in range(len(documentToTopic))]
to_sort.sort(key=lambda t: t[0], reverse=True)

# The second tuple element is a row position, not an index label. df was
# filtered with notna() without resetting its index, so labels may have gaps;
# .iloc selects by position and always matches the rows of documentToTopic.
result1 = df.content.iloc[to_sort[0][1]]
result2 = df.content.iloc[to_sort[1][1]]