In [1]:
from collections import Counter
import random

In [2]:
# Toy corpus: each inner list is one document, given as a list of tokens.
# Multi-word phrases such as "Big Data" are kept as single tokens.
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

# Number of topics; presumably the value intended for CustomLDA's nr_topics
# argument -- TODO confirm.
K=4

In [5]:
class CustomLDA(object):
    """Topic model over tokenised documents, fitted by a Gibbs-sampling-style loop.

    Every word occurrence carries exactly one topic assignment. ``train``
    repeatedly resamples each assignment with probability proportional to
    ``p(word | topic) * p(topic | document)``, where both factors are estimated
    (with additive smoothing) from the assignment counts of all *other* word
    occurrences.

    Attributes:
        documents: the corpus as passed in (a sequence of word sequences);
            stored by reference and never modified by this class.
        nr_topics: number of topics.
        seed: seed of the private random generator (``None`` = unseeded).
        distinct_words: set of all words occurring in the corpus.
        D: number of documents.
        W: number of distinct words.
        document_topic_counts: one ``Counter`` per document, topic -> count.
        topic_word_counts: one ``Counter`` per topic, word -> count.
        topic_counts: list with the number of words assigned to each topic.
        document_lengths: list with the number of words in each document.
        document_topics: per document, the topic of each word position.
        nr_iter: iteration count of the most recent ``train`` call.
        topic_names: labels passed to the most recent ``assign_topics`` call.
    """

    def __init__(self, documents, nr_topics, seed=0):
        """Set up the count tables and a random initial topic assignment.

        Args:
            documents: sequence of documents, each a sequence of hashable words.
            nr_topics: number of topics; must be at least 1.
            seed: seed for the model's private random generator. The default
                of 0 yields the same sequence as seeding the global ``random``
                module with 0. Pass ``None`` for a non-deterministic model.

        Raises:
            ValueError: if ``nr_topics`` is smaller than 1.
        """
        if nr_topics < 1:
            raise ValueError(
                "nr_topics must be at least 1, got {!r}".format(nr_topics))

        self.documents = documents
        self.nr_topics = nr_topics
        self.seed = seed
        # A private generator keeps the model reproducible without reseeding
        # (or being perturbed by) the process-wide ``random`` module.
        self._rng = random.Random(seed)

        self.count_distinct_words()
        self.D = len(self.documents)
        self.W = len(self.distinct_words)

        # Empty count tables first, then the random assignment, then the
        # tables are filled from that assignment.
        self.count_document_topic()
        self.count_topic_word()
        self.count_topic()
        self.get_document_lengths()
        self.init_document_topics()
        self.init_counts()

        self.nr_iter = None
        self.topic_names = None

    def count_document_topic(self):
        """Reset ``document_topic_counts`` to one empty Counter per document.

        Each Counter records how many times each topic is assigned within
        that document.
        """
        self.document_topic_counts = [Counter() for _ in self.documents]

    def count_topic_word(self):
        """Reset ``topic_word_counts`` to one empty Counter per topic.

        Each Counter records how many times each word is assigned to that topic.
        """
        self.topic_word_counts = [Counter() for _ in range(self.nr_topics)]

    def sample_from(self, weights):
        """Return index ``i`` with probability ``weights[i] / sum(weights)``.

        Args:
            weights: non-empty iterable of non-negative numbers.

        Returns:
            An integer index into ``weights``.

        Raises:
            ValueError: if ``weights`` is empty.
        """
        weights = list(weights)
        if not weights:
            raise ValueError("weights must not be empty")

        total = sum(weights)
        rnd = total * self._rng.random()  # uniform between 0 and total
        for i, w in enumerate(weights):
            rnd -= w
            if rnd <= 0:
                # smallest i such that weights[0] + ... + weights[i] >= rnd
                return i
        # Floating-point rounding can leave a tiny positive remainder after the
        # last subtraction; the draw then belongs to the final index rather
        # than falling through and returning None.
        return len(weights) - 1

    def count_topic(self):
        """Reset ``topic_counts`` (total words assigned to each topic) to zeros."""
        self.topic_counts = [0 for _ in range(self.nr_topics)]

    def get_document_lengths(self):
        """Store the number of words of every document in ``document_lengths``."""
        self.document_lengths = [len(document) for document in self.documents]

    def count_distinct_words(self):
        """Store the set of all words in the corpus in ``distinct_words``."""
        self.distinct_words = set(
            word for document in self.documents for word in document)

    def p_topic_given_document(self, topic, d, alpha=0.1):
        """Smoothed fraction of words in document ``d`` assigned to ``topic``.

        Args:
            topic: topic index.
            d: document index.
            alpha: additive smoothing term applied to every topic.
        """
        return ((self.document_topic_counts[d][topic] + alpha) /
                (self.document_lengths[d] + self.nr_topics * alpha))

    def p_word_given_topic(self, word, topic, beta=0.1):
        """Smoothed fraction of words assigned to ``topic`` that equal ``word``.

        Args:
            word: the word to score.
            topic: topic index.
            beta: additive smoothing term applied to every distinct word.
        """
        return ((self.topic_word_counts[topic][word] + beta) /
                (self.topic_counts[topic] + self.W * beta))

    def init_document_topics(self):
        """Assign a uniformly random topic to every word position.

        The private generator is restarted from ``self.seed`` first, so calling
        this again with a non-``None`` seed reproduces the same assignment.
        The count tables are not touched here; ``init_counts`` fills them.
        """
        self._rng = random.Random(self.seed)
        self.document_topics = [
            [self._rng.randrange(self.nr_topics) for _ in document]
            for document in self.documents
        ]

    def init_counts(self):
        """Add the current ``document_topics`` assignment to the count tables."""
        for d in range(self.D):
            for word, topic in zip(self.documents[d], self.document_topics[d]):
                self.document_topic_counts[d][topic] += 1
                self.topic_word_counts[topic][word] += 1
                self.topic_counts[topic] += 1

    def topic_weight(self, d, word, k):
        """Unnormalised weight of topic ``k`` for ``word`` in document ``d``."""
        return self.p_word_given_topic(word, k) * self.p_topic_given_document(k, d)

    def choose_new_topic(self, d, word):
        """Sample a topic for ``word`` in document ``d`` from the topic weights."""
        return self.sample_from([self.topic_weight(d, word, k)
                                 for k in range(self.nr_topics)])

    def train(self, nr_iter):
        """Resample the topic of every word occurrence ``nr_iter`` times.

        Updates ``document_topics`` and all count tables in place and records
        ``nr_iter`` on the instance.

        Args:
            nr_iter: number of full passes over the corpus.
        """
        self.nr_iter = nr_iter
        for _ in range(self.nr_iter):
            for d in range(self.D):
                for i, (word, topic) in enumerate(zip(self.documents[d],
                                                      self.document_topics[d])):

                    # remove this word / topic from the counts
                    # so that it doesn't influence the weights
                    self.document_topic_counts[d][topic] -= 1
                    self.topic_word_counts[topic][word] -= 1
                    self.topic_counts[topic] -= 1
                    self.document_lengths[d] -= 1

                    # choose a new topic based on the weights
                    new_topic = self.choose_new_topic(d, word)
                    self.document_topics[d][i] = new_topic

                    # and now add it back to the counts
                    self.document_topic_counts[d][new_topic] += 1
                    self.topic_word_counts[new_topic][word] += 1
                    self.topic_counts[new_topic] += 1
                    self.document_lengths[d] += 1

    def top_words_per_topic(self):
        """Print ``topic word count`` lines, most frequent words first."""
        for k, word_counts in enumerate(self.topic_word_counts):
            for word, count in word_counts.most_common():
                # Decrements during training leave zero-count keys behind.
                if count > 0:
                    print(k, word, count)

    def assign_topics(self, topic_names):
        """Print every document followed by its topic labels and counts.

        Args:
            topic_names: sequence of labels indexed by topic id. It is stored
                on the instance. An ``IndexError`` is raised if a topic with a
                positive count has no label.
        """
        self.topic_names = topic_names
        for document, topic_counts in zip(self.documents, self.document_topic_counts):
            print(document)
            for topic, count in topic_counts.most_common():
                if count > 0:
                    print(self.topic_names[topic], count)
            print()

In [6]:
# Build the model with the topic count configured in K rather than repeating
# the literal, so the number of topics has a single source.
lda_model = CustomLDA(documents, K)

In [7]:
# Run 1000 full resampling passes over the corpus; this updates lda_model's
# topic assignments and count tables in place.
lda_model.train(nr_iter=1000)

In [8]:
# Print "topic word count" lines for every topic, most frequent words first.
lda_model.top_words_per_topic()

0 Java 3
0 Big Data 3
0 Hadoop 2
0 Storm 1
0 HBase 1
0 deep learning 1
0 Cassandra 1
0 C++ 1
0 programming languages 1
0 Spark 1
0 MapReduce 1
1 neural networks 2
1 machine learning 2
1 Postgres 2
1 HBase 2
1 MongoDB 2
1 scipy 1
1 numpy 1
1 MySQL 1
1 deep learning 1
1 NoSQL 1
1 Cassandra 1
1 decision trees 1
1 artificial intelligence 1
1 databases 1
2 regression 3
2 Python 2
2 libsvm 2
2 R 2
2 scikit-learn 2
2 support vector machines 1
2 Haskell 1
2 Mahout 1
2 mathematics 1
3 probability 3
3 statistics 3
3 Python 2
3 pandas 2
3 R 2
3 statsmodels 2
3 theory 1
3 C++ 1
3 artificial intelligence 1


In [11]:
# Human-readable labels indexed by topic id (position i names topic i); they
# are passed to CustomLDA.assign_topics.
# NOTE(review): judging by the recorded top-words output, this order looks
# misaligned with the fitted topics (topic 1 is led by database terms, topic 2
# by regression/libsvm/scikit-learn, topic 3 by probability/statistics) --
# confirm and reorder after re-running.
topic_names = ["Big Data and programming languages",
               "Python and statistics",
               "databases",
               "machine learning"]

In [12]:
# Print each document followed by the labels and word counts of its topics.
lda_model.assign_topics(topic_names)

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
Big Data and programming languages 7

['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
Python and statistics 5

['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
Python and statistics 2
databases 2
machine learning 2

['R', 'Python', 'statistics', 'regression', 'probability']
machine learning 3
databases 2

['machine learning', 'regression', 'decision trees', 'libsvm']
Python and statistics 2
databases 2

['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages']
Big Data and programming languages 3
databases 3

['statistics', 'probability', 'mathematics', 'theory']
machine learning 3
databases 1

['machine learning', 'scikit-learn', 'Mahout', 'neural networks']
Python and statistics 2
databases 2

['neural networks', 'deep learning', 'Big Data', 'artificial intelligence']
Python and statistics 3
Big Data and programming languages 1

['Hadoop', 'Java', 'MapReduce', 'Big Data']
Big D

In [13]:
# Show the raw per-document Counters (topic id -> number of words assigned).
lda_model.document_topic_counts

[Counter({0: 7, 1: 0, 2: 0, 3: 0}),
 Counter({0: 0, 1: 5, 2: 0, 3: 0}),
 Counter({0: 0, 1: 2, 2: 2, 3: 2}),
 Counter({0: 0, 1: 0, 2: 2, 3: 3}),
 Counter({0: 0, 1: 2, 2: 2, 3: 0}),
 Counter({0: 3, 1: 0, 2: 3, 3: 0}),
 Counter({0: 0, 1: 0, 2: 1, 3: 3}),
 Counter({0: 0, 1: 2, 2: 2, 3: 0}),
 Counter({0: 1, 1: 3, 2: 0, 3: 0}),
 Counter({0: 4, 1: 0, 2: 0, 3: 0}),
 Counter({0: 0, 1: 0, 2: 0, 3: 3}),
 Counter({0: 1, 1: 0, 2: 0, 3: 3}),
 Counter({0: 0, 1: 0, 2: 0, 3: 3}),
 Counter({0: 0, 1: 5, 2: 0, 3: 0}),
 Counter({0: 0, 1: 0, 2: 3, 3: 0})]