In [120]:
from collections import Counter
import random

In [202]:
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

K=4

In [203]:
def sample_from(weights):
    sum_weights=sum(weights)
    rnd=sum_weights*random.random()
    for i, w in enumerate(weights):
        rnd-=w
        if rnd<=0:
            return i

In [204]:
# a list of Counters, one for each document
document_topic_counts= [Counter() for _ in documents]

In [205]:
#How many times each word is assigned to each topic:
topic_word_counts= [Counter() for _ in range(K)]

In [206]:
#number of words assigned to each topic
topic_counts = [0 for _ in range(K)]

In [207]:
document_lengths=list(map(len, documents))

In [208]:
document_lengths

[7, 5, 6, 5, 4, 6, 4, 4, 4, 4, 3, 4, 3, 5, 3]

In [209]:
distinct_words = [word for document in documents for word in document]

In [210]:
W = len(distinct_words)
W

67

In [211]:
D = len(documents)

In [212]:
def p_topic_given_document(topic, d, alpha=0.1):
        """the fraction of words in document _d_
    that are assigned to _topic_ (plus some smoothing)"""
        return (document_topic_counts[d][topic]+alpha)/(document_lengths[d]+K*alpha)
        

In [213]:
def p_word_given_topic(word, topic, beta=0.1):
    """the fraction of words assigned to _topic_
    that equal _word_ (plus some smoothing)"""
    return (topic_word_counts[topic][word]+beta)/(topic_counts[topic]+W*beta)

In [214]:
def topic_weight(d, word, k):
    """given a document and a word in that document,
    return the weight for the kth topic"""
    return p_topic_given_document(k, d)*p_word_given_topic(word, k)

In [215]:
def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k) for k in range(K)])

In [216]:
#initiate
random.seed(0)
document_topics = [[random.randrange(K) for word in document] for document in documents]

In [217]:
for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic]+=1
        topic_counts[topic] += 1
        topic_word_counts[topic][word] += 1

In [218]:
for _ in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d], document_topics[d])):
            document_lengths[d]=-1
            document_topic_counts[d][topic]-=1
            topic_counts[topic]-=1
            topic_word_counts[topic][word]-=1
            
            new_topic=choose_new_topic(d, word)
            document_topics[d][i]=new_topic
            print(i)
            print("docu")
            print("topic "+str(topic))
            print("word "+str(word))
            print("new topic "+str(new_topic))
            
            document_lengths[d]+=1
            document_topic_counts[d][new_topic]+=1
            topic_counts[new_topic]+=1
            topic_word_counts[new_topic][word]+=1
            

0
topic 3
word Hadoop
new topic None


TypeError: list indices must be integers or slices, not NoneType

In [231]:
choose_new_topic(0, 'Hadoop')

0

In [145]:
choose_new_topic(0, 'libsvm')

3

In [117]:
 p_word_given_topic('libsvm', 3)

0.04845814977973569

In [118]:
topic_weight(0, 'libsvm', 3)

0.02684843433742112

In [91]:
topic_word_counts

[Counter({'Big Data': 1,
          'C++': 1,
          'HBase': 1,
          'Hadoop': 1,
          'Haskell': 1,
          'Java': 1,
          'R': 1,
          'artificial intelligence': 1,
          'libsvm': 1,
          'pandas': 2,
          'regression': 1,
          'scikit-learn': 2,
          'statistics': 1,
          'statsmodels': 1}),
 Counter({'Cassandra': 1,
          'HBase': 1,
          'Mahout': 1,
          'MongoDB': 1,
          'MySQL': 1,
          'Postgres': 1,
          'Python': 1,
          'databases': 1,
          'decision trees': 1,
          'deep learning': 2,
          'neural networks': 2,
          'numpy': 1,
          'theory': 1}),
 Counter({'C++': 1,
          'Cassandra': 1,
          'HBase': 1,
          'Java': 2,
          'MongoDB': 1,
          'Postgres': 1,
          'Python': 2,
          'R': 2,
          'artificial intelligence': 1,
          'machine learning': 1,
          'mathematics': 1,
          'probability': 1,
         