In [1]:
from sklearn.datasets import make_multilabel_classification
# Build a synthetic dataset with a fixed seed (random_state=0). Only the first
# returned array (the feature matrix X) is kept; the second return value is
# discarded.
X, _ = make_multilabel_classification(random_state=0)

In [2]:
import numpy as np

class GibbsLDA:
    """Latent Dirichlet Allocation fitted with a collapsed Gibbs sampler.

    The corpus is given as a 2-D array of word ids: ``X[m, n]`` is the id of
    the n-th token of document m, so every document has the same number of
    tokens. Ids are truncated to integers and must be non-negative; the
    vocabulary size is ``max(id) + 1``.

    Parameters
    ----------
    K : int
        Number of topics.
    alpha : None, scalar or sequence of length K
        Dirichlet prior over topics per document. ``None`` means all ones.
    beta : None, scalar or sequence of length V
        Dirichlet prior over words per topic. ``None`` means all ones.
    sim_time : int
        Number of full Gibbs sweeps over all tokens.
    random_state : None or int
        ``None`` (default) draws from NumPy's global ``np.random`` state, so
        ``np.random.seed`` controls the result. An int seeds a private
        ``np.random.RandomState`` for this fit only.

    Attributes set by ``fit``
    -------------------------
    theta : ndarray of shape (M, K) -- document-topic proportions, rows sum to 1.
    phi : ndarray of shape (K, V) -- topic-word proportions, rows sum to 1.
    """

    def __init__(self,K,alpha=None,beta=None,sim_time=1000,random_state=None):
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.sim_time = sim_time
        self.random_state = random_state

    @staticmethod
    def _as_prior(value, size, name):
        """Return a Dirichlet prior as a float vector of length ``size``."""
        if value is None:
            return np.ones(size)
        prior = np.asarray(value, dtype=float)
        try:
            return np.broadcast_to(prior, (size,))
        except ValueError:
            raise ValueError(
                "{} must be a scalar or have length {}, got shape {}".format(
                    name, size, prior.shape
                )
            )

    def fit(self,X):
        """Run the Gibbs sampler on ``X`` and store ``theta`` and ``phi``.

        Parameters
        ----------
        X : array-like of shape (M, N)
            Word id of every token; see the class docstring.

        Raises
        ------
        ValueError
            If X contains non-finite or negative values, or if a prior has
            the wrong length.
        """
        raw = np.asarray(X)
        M, N = raw.shape
        if raw.size and not np.isfinite(raw).all():
            raise ValueError("X must contain only finite word ids")
        # astype(int) truncates toward zero, like int() on each entry.
        tokens = raw.astype(int)
        if tokens.size and tokens.min() < 0:
            raise ValueError("word ids in X must be non-negative")

        K = self.K
        # Counts are indexed by the word id itself, so the vocabulary must
        # span 0..max id (not merely the number of distinct ids).
        V = int(tokens.max()) + 1 if tokens.size else 0

        alpha = self._as_prior(self.alpha, K, "alpha")
        beta = self._as_prior(self.beta, V, "beta")
        beta_total = beta.sum()

        if self.random_state is None:
            rng = np.random  # global state: honours np.random.seed
        else:
            rng = np.random.RandomState(self.random_state)

        N_kv = np.zeros((K, V))   # topic-word counts
        N_mk = np.zeros((M, K))   # document-topic counts
        N_k = np.zeros(K)         # tokens per topic (row sums of N_kv)
        topic_mat = np.zeros((M, N), dtype=int)

        # Random initial topic for every token.
        for m in range(M):
            for n in range(N):
                word = tokens[m, n]
                topic = int(K * rng.random_sample())
                topic_mat[m, n] = topic
                N_mk[m, topic] += 1
                N_kv[topic, word] += 1
                N_k[topic] += 1

        for _ in range(self.sim_time):
            for m in range(M):
                for n in range(N):
                    word = tokens[m, n]
                    topic = topic_mat[m, n]
                    # Remove the token from the counts before resampling it.
                    N_mk[m, topic] -= 1
                    N_kv[topic, word] -= 1
                    N_k[topic] -= 1

                    # Full conditional over topics. The document term uses
                    # the whole alpha vector (one prior per candidate topic),
                    # not the prior of the token's previous topic.
                    prob = (
                        (N_kv[:, word] + beta[word])
                        / (N_k + beta_total)
                        * (N_mk[m] + alpha)
                    )
                    prob = prob / prob.sum()
                    topic = int(np.argmax(rng.multinomial(1, prob)))

                    topic_mat[m, n] = topic
                    N_mk[m, topic] += 1
                    N_kv[topic, word] += 1
                    N_k[topic] += 1

        theta = N_mk + alpha
        self.theta = theta / theta.sum(axis=1, keepdims=True)
        phi = N_kv + beta
        self.phi = phi / phi.sum(axis=1, keepdims=True)

    def transform(self):
        """Return ``theta`` from the last ``fit`` call.

        Raises AttributeError if ``fit`` has not been called yet, because
        ``theta`` is only set there.
        """
        return self.theta

In [3]:
# GibbsLDA samples from NumPy's global random generator, so seed it first to
# make this fit reproducible under Restart & Run All.
np.random.seed(0)

# Fit a 5-topic model with 1000 Gibbs sweeps over the corpus.
# NOTE(review): fit() reads every entry X[m, n] as a word id, not as a count.
# Confirm that this is the intended reading of the generated feature matrix.
clf = GibbsLDA(K=5,sim_time=1000)
clf.fit(X)

In [4]:
# Document-topic proportions (one row per document, one column per topic)
# for the last two documents.
clf.transform()[-2:]

array([[0.04, 0.16, 0.16, 0.48, 0.16],
       [0.44, 0.08, 0.2 , 0.16, 0.12]])