In [1]:
import pandas as pd
import nltk
import numpy
import time
import random
nltk.download('stopwords')
from nltk.corpus import stopwords
import unidecode



[nltk_data] Downloading package stopwords to /home/senne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load only the first 100 articles. Passing nrows avoids parsing the whole
# file, and yields a standalone frame rather than a slice of a larger one,
# so later column assignments do not hit chained-assignment warnings.
df = pd.read_csv("news_dataset.csv", nrows=100)

In [3]:
# Remove accents and special characters, and lowercase the text.
# https://stackoverflow.com/questions/37926248/how-to-remove-accents-from-values-in-columns
# regex=True is passed explicitly: the patterns below are regular expressions,
# and pandas >= 2.0 treats the pattern as a literal string by default.
# assign() builds a new frame, so this also works when df is a slice.
df = df.assign(
    content=(
        df['content']
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
        .str.lower()
        .str.replace('[^a-z ]', '', regex=True)
    )
)
print(df.content.head())

# Remove stop words.
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
# The word list gets its own name so the imported `stopwords` corpus reader is
# not overwritten; rebinding it made this cell fail when run a second time.
# NOTE(review): apostrophes were stripped above, so contractions such as
# "isnt" no longer match their stop-word form ("isn't") and stay in the text.
english_stopwords = stopwords.words('english')
pat = r'\b(?:{})\b'.format('|'.join(english_stopwords))
df['content'] = df['content'].str.replace(pat, '', regex=True)

# Collapse runs of whitespace into a single space.
df['content'] = df['content'].str.replace(r'\s+', ' ', regex=True)
# Drop rows without content; copy() so later cells can add columns safely.
df = df[df['content'].notna()].copy()
print(df.content.head())


0    washington     congressional republicans have ...
1    after the bullet shells get counted the blood ...
2    when walt disneys bambi opened in  critics pra...
3    death may be the great equalizer but it isnt n...
4    seoul south korea     north koreas leader kim ...
Name: content, dtype: object
0    washington congressional republicans new fear ...
1     bullet shells get counted blood dries votive ...
2     walt disneys bambi opened critics praised spa...
3    death may great equalizer isnt necessarily eve...
4    seoul south korea north koreas leader kim said...
Name: content, dtype: object


In [4]:
# Every distinct token seen by compute_dict so far, across all calls.
vocab_set = set()

def compute_dict(input):
    """Count how often each token occurs in a space-separated string.

    ``input`` is split on single spaces; empty tokens (produced by leading,
    trailing or repeated spaces) are skipped. As a side effect, every
    non-empty token is also added to the module-level ``vocab_set``.

    Returns a dict mapping token -> number of occurrences in ``input``.
    """
    counts = {}
    for token in input.split(' '):
        if token == '':
            continue
        vocab_set.add(token)
        counts[token] = counts.get(token, 0) + 1
    return counts

# Per-document word counts. compute_dict also records every word it sees in
# vocab_set, so the vocabulary is complete once this apply() has finished.
df['content_dict'] = df['content'].apply(compute_dict)

# Map each vocabulary word to an integer index (0 .. len(vocab) - 1).
# The words are sorted first so a given word gets the same index on every
# run; iterating the set directly yields an order that changes between
# interpreter sessions because string hashes are randomized.
vocab = {word: index for index, word in enumerate(sorted(vocab_set))}

In [5]:
# Add an extra column that holds each document's words directly as
# vocabulary indices.
def remove_dupes(input):
    """Return the vocabulary indices of the distinct words in ``input``.

    ``input`` is split on single spaces; empty tokens are ignored and each
    distinct word is kept once. The words are mapped through the
    module-level ``vocab`` dict in alphabetical word order, so the result
    does not depend on set iteration order (which varies between
    interpreter sessions).

    Raises KeyError if a word is not present in ``vocab``.
    """
    unique_words = set(input.split(' '))
    unique_words.discard('')
    return [vocab[word] for word in sorted(unique_words)]

df['content_list'] = df['content'].apply(remove_dupes)

In [6]:
# Number of topics, and the list of topic ids 0 .. numberOfTopics - 1.
numberOfTopics = 20
topics = list(range(numberOfTopics))

In [7]:
# Seed NumPy's global RNG so the random initialisation below (and the
# sampling that continues from the same RNG state) is repeatable.
numpy.random.seed(0)

# Count tables, all filled by the loop below:
#   wordToTopic[w, t]      times vocabulary index w is assigned to topic t
#   documentToTopic[d, t]  words of document d (by position) assigned to t
#   words_in_topic[t]      total words assigned to topic t
#   words_in_doc[d]        number of words in document d
#   topicAssignment[d][k]  topic of the k-th entry of document d's content_list
wordToTopic = numpy.zeros((len(vocab), numberOfTopics))
documentToTopic = numpy.zeros((len(df.content), numberOfTopics))
topicAssignment = []
words_in_topic = [0] * numberOfTopics
words_in_doc = [0] * len(df.content)

for i, content in enumerate(df.content_list):
    doc_topics = []
    for word in content:
        # Start every word in a uniformly random topic.
        topic = numpy.random.choice(topics)
        doc_topics.append(topic)
        wordToTopic[word, topic] += 1
        documentToTopic[i, topic] += 1
        words_in_topic[topic] += 1
    words_in_doc[i] = len(content)
    topicAssignment.append(doc_topics)

In [8]:
# Smoothing constants used by gibs(): alpha is added to the document-topic
# counts, beta to the word-topic counts.
alpha = 1
beta = 0.001

def gibs():
    """Resample the topic of every word of every document, once.

    This looks like one sweep of collapsed Gibbs sampling for an LDA-style
    topic model. For each word the current assignment is removed from the
    count tables, a probability for each topic is computed from the
    remaining counts, a new topic is drawn from that distribution, and the
    counts are updated again.

    Works in place on the module-level ``documentToTopic``, ``wordToTopic``,
    ``words_in_topic`` and ``topicAssignment``; returns nothing.

    Assumes ``topics`` is ``0 .. numberOfTopics - 1`` in order, so that
    position k of a count row corresponds to topic k.
    """
    # Loop-invariant smoothing terms.
    vocab_smoothing = len(vocab) * beta
    topic_smoothing = numberOfTopics * alpha

    for doc_index, word_list in enumerate(df.content_list):
        doc_topics = topicAssignment[doc_index]
        # Document length without the word being resampled, plus smoothing.
        doc_denominator = (words_in_doc[doc_index] - 1) + topic_smoothing

        for word_index, word in enumerate(word_list):
            # Take the word's current assignment out of all counts.
            old_topic = doc_topics[word_index]
            documentToTopic[doc_index, old_topic] -= 1
            wordToTopic[word, old_topic] -= 1
            words_in_topic[old_topic] -= 1

            # Same formula as a per-topic loop, evaluated for all topics at
            # once: (word-in-topic share) * (topic-in-document share).
            word_share = (wordToTopic[word] + beta) / (
                numpy.asarray(words_in_topic) + vocab_smoothing
            )
            doc_share = (documentToTopic[doc_index] + alpha) / doc_denominator
            P = word_share * doc_share
            P = P / numpy.sum(P)

            # Sample from the probability vector (the alternative would be
            # to always take the most probable topic).
            new_topic = numpy.random.choice(topics, p=P)
            doc_topics[word_index] = new_topic
            documentToTopic[doc_index, new_topic] += 1
            wordToTopic[word, new_topic] += 1
            words_in_topic[new_topic] += 1




# Run five passes of gibs(), printing the pass number before each one, and
# report the total wall-clock time afterwards.
tick1 = time.time()

for iteration in range(5):
    print(iteration)
    gibs()

elapsed = time.time() - tick1
print("model time: " + str(elapsed))

0
1
2
3
4
model time: 16.753443717956543


In [9]:

# Rank the documents by how many of their words are assigned to topic 0
# (column 0 of documentToTopic), highest count first. Each entry is
# (count, row position); ties keep their original document order.
to_sort = [(documentToTopic[i][0], i) for i in range(len(documentToTopic))]
to_sort.sort(key=lambda t: t[0], reverse=True)

# The rows of documentToTopic are positions in df (they were filled via
# enumerate), not index labels. Rows may have been dropped from df earlier,
# leaving gaps in the labels, so look the documents up by position.
result1 = df.content.iloc[to_sort[0][1]]
result2 = df.content.iloc[to_sort[1][1]]