In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Multi-layer Perceptron is sensitive to feature scaling, so it is highly recommended to scale your data.
# For example, scale each attribute on the input vector X to [0, 1] or [-1, +1]. e.g. scaler = StandardScaler()
# Finding a reasonable regularization parameter  is best done using GridSearchCV,
# usually in the range 10.0 ** -np.arange(1, 7)
# Empirically, we observed that L-BFGS converges faster and with better solutions on small datasets.
# For relatively large datasets, however, Adam is very robust.

if __name__=='__main__':
    # X : array-like or sparse matrix, shape (n_samples, n_features)
    # y : array-like, shape (n_samples,) or (n_samples, n_outputs)
    X = [[0., 0., 0., 0., 0.], [1., 1., 1., 1., 1.]]
    y = [0, 1]
    
    X_train = [[0., 0., 0., 0., 0.], [1., 1., 1., 1., 1.]]
    X_test = [[0., 0., 0., 0., 0.], [1., 1., 2., 1., 1.]]
    scaler = StandardScaler() 
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test) 
    print X_test
    
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    clf.fit(X, y)
    print clf.predict([[1., 1., 1., 0., 0.], [0., 0., 0., 0., 1.]])
    print clf.predict_proba([[1., 1., 1., 0., 0.], [0., 0., 0., 0., 1.]])
    print clf.score(X, y)

[[-1. -1. -1. -1. -1.]
 [ 1.  1.  3.  1.  1.]]
[1 0]
[[ 0.10225897  0.89774103]
 [ 0.97116002  0.02883998]]
1.0


In [33]:
# LDA Gibbs sampling
import numpy as np
import random

def rndbias(p):
    if random.random() > p:
        return 1
    else:
        return 0

def samplp(pz):
    res = []
    for p in pz:
        res.append(rndbias(p))
    return res

if __name__=='__main__':
    # Input: words w ∈ documents d
    # words
    w = np.array([0, 1])
    # documents as doc/word matrix
    d = np.array([
        [1, 0],
        [0, 1]
    ])

    # begin randomly initialize z and increment counters
    # z(i) topic index assignment over available topics
    z = np.array([1,1]) # todo: random w/ prior

    # init increment counters
    cdt = np.random.randint(2, size=(2, 2)) # nd,top
    cwt = np.random.randint(2, size=(2, 2)) # nw,top

    N_D = d.shape[0]  # num of docs
    N_W = w.shape[0]  # num of words
    N_K = z.shape[0]  # num of topics

    # Dirichlet priors
    alpha = 0.5
    beta = 0.01

    pz = np.array([0.5, 0.5])

    for niter in range(15):
        # for i = 0 → N −1 do
        nk = N_K
        # for all documents m ∈ [1, M] d
        for d in range(N_D): # documents
            for i in range(N_W): # words
                word = w[i]
                topic = z[i]
                # nd,topic-=1; nword,topic-=1; ntopic-=1
                cdt[d][topic] -= 1 #  the number of words assigned to topic k in document d
                cwt[word][topic] -= 1 # the number of times word w is assigned to topic k
                nk -= 1 # the number of times word w is assigned to topic k
                # for k = 0 → K −1 do
                for k in range(N_K): # topics
                    # p(z = k|·) = (nd,k + αk) nk,w+βw nk+β×W
                    pz[k] = (cdt[d][k] + alpha) * (cwt[i][k] + beta) / (nk + N_W * beta)
                if min(pz)<0:
                    pz += -min(pz)
                pz = pz / sum(pz) # normalize
                # topic ← sample from p(z|·)
                topic = np.random.choice(N_K, p=pz)
                z[i] = topic
                # nd,topic+=1; nword,topic+=1; ntopic+=1
                nk += 1
                cdt[d][topic] += 1
                cwt[word][topic] += 1
    # Output: topic assignments z and counts nd,k,nk,w, and nk
    # return z, nd,k,nk,w,nk
    print pz

[ 0.99019608  0.00980392]


In [16]:
import random
import numpy as np

def rndbias(p):
    if random.random() > p:
        return 1
    else:
        return 0

if __name__=='__main__':
    for i in range(10):
        print rndbias(0.8)
    print np.arange(5)
    print np.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0])

0
0
0
0
0
0
1
0
0
0
[0 1 2 3 4]
[3 2 0]


In [17]:
# small LDA example for demo purposes
# example from Probabilistic Topic Models, M Steyvers, T Griths, Handbook of latent semantic analysis 
import random
import math
import time

def poissonrn(lda): # generate a poisson random number
    l = math.exp(-lda)
    k = 1.
    p = random.random()
    while p > l:
        k += 1.
        p *= random.random()
    return (k-1.)

def dirichlet(params): # sample dirichlet distribution
    sample = [random.gammavariate(a,1) for a in params]
    sample = [v/sum(sample) for v in sample]
    return sample

def partsct(doc): # find all parts of a document and count its respective occurance
    chklst = []
    ctlst = []
    for e in doc:
        if not e in chklst:
            chklst.append(e)
            ctlst.append(1)
        else:
            ctlst[chklst.index(e)] += 1
    return (chklst, ctlst)

def topic_parts(ntopics, dbt): # create a topics-part table randomly TODO: add prior distr
    res = [[0 for i in range(len(dbt))] for i in range(ntopics)]
    ie = 0
    for e in dbt:
        for p in e[1]:
            for k in range(p):
                res[random.randint(0,ntopics-1)][ie] += 1
        ie += 1
    return res

def doc_parts_tbl(docs): # build a document-parts table
    ptslst = []
    for d in docs:
        ptslst.append(partsct(d)) 
    return ptslst

def all_parts(dbt): # a list of all parts in all docs, uniquely, from doc parts table
    apts = []
    for d in dbt:
        for de in d[0]:
            if not de in apts:
                apts.append(de)
    return apts

def parts_per_doc(lda): # pick how many parts you want per each composite (doc) - from Poisson distribution
    return poissonrn(lda)

def parts_topics(parts, topics, beta): # build the parts-topics table: for each column draw from Dirichlet distr.(beta)
    res = []
    betalst = []
    for p in parts:
        betalst.append(beta)
    for t in topics:
        res.append(dirichlet(betalst))
    return res

def comp_topics(docs, topics, alpha): # build the composites(docs)-topics table: for each row draw from Dirichlet distr.(alpha)
    res = []
    alphalst = []
    for t in topics:
        alphalst.append(alpha)
    for d in docs:
        res.append(dirichlet(alphalst))
    return res

def smpl_topbydoc(ndoc, doctop): # sample document by topic
    maxtop = 0.
    i = 0
    imax = 0
    for p in doctop[ndoc]:
        if p > maxtop:
            maxtop = p
            imax = i
        i += 1
    return imax

def smpl_partsbytop(ntop, parttop): # sample parts by topic
    maxprt = 0.
    i = 0
    imax = 0
    for p in parttop[ntop]:
        if p > maxprt:
            maxprt = p
            imax = i
        i += 1
    return imax

if __name__=='__main__':
    docs = []
    docs.append('cc')
    docs.append('dd')
    #print docs
    dbt = doc_parts_tbl(docs) # document and parts table
    print dbt
    random.seed(time.clock())
    ntop = 3 # number of topics
    tpparts = topic_parts(ntop, dbt) # randomly assign a topic to each part
    print tpparts
    alpha = 0.2
    beta = 0.5
    #print dirichlet([beta, beta])
    apts = all_parts(dbt) # find all parts making up the documents
    parttop = parts_topics(apts, [0, 1, 2], beta) # create a random parts-topics distribution
    doctop = comp_topics(apts, docs, alpha) # create a random doc-topic distribution
    nthistop = 0
    maxtop = smpl_topbydoc(nthistop, doctop) # sample document by topic
    npt = smpl_partsbytop(maxtop, parttop) # sample parts by topc
    print 'most representative part of doc %d: index %d : part %s' % (nthistop, npt, apts[npt])

[(['c'], [2]), (['d'], [2])]
[[0, 1], [2, 1], [0, 0]]
most representative part of doc 0: index 1 : part d


In [4]:
# https://wiseodd.github.io/techblog/2017/09/07/lda-gibbs/
import numpy as np

# Words
W = np.array([0, 1, 2, 3, 4])

# D := document words
X = np.array([
    [0, 0, 1, 2, 2],
    [0, 0, 1, 1, 1],
    [0, 1, 2, 2, 2],
    [4, 4, 4, 4, 4],
    [3, 3, 4, 4, 4],
    [3, 4, 4, 4, 4]
])

N_D = X.shape[0]  # num of docs
N_W = W.shape[0]  # num of words
N_K = 2  # num of topics

# Dirichlet priors
alpha = 1
gamma = 1

# Z := word topic assignment
Z = np.zeros(shape=[N_D, N_W])

for i in range(N_D):
    for l in range(N_W):
        Z[i, l] = np.random.randint(N_K)  # randomly assign word's topic

# Pi := document topic distribution
Pi = np.zeros([N_D, N_K])

for i in range(N_D):
    Pi[i] = np.random.dirichlet(alpha*np.ones(N_K))

# B := word topic distribution
B = np.zeros([N_K, N_W])

for k in range(N_K):
    B[k] = np.random.dirichlet(gamma*np.ones(N_W))
    
for it in range(1000):
    # Sample from full conditional of Z
    # ---------------------------------
    for i in range(N_D):
        for v in range(N_W):
            # Calculate params for Z
            p_iv = np.exp(np.log(Pi[i]) + np.log(B[:, X[i, v]]))
            p_iv /= np.sum(p_iv)

            # Resample word topic assignment Z
            Z[i, v] = np.random.multinomial(1, p_iv).argmax()

    # Sample from full conditional of Pi
    # ----------------------------------
    for i in range(N_D):
        m = np.zeros(N_K)

        # Gather sufficient statistics
        for k in range(N_K):
            m[k] = np.sum(Z[i] == k)

        # Resample doc topic dist.
        Pi[i, :] = np.random.dirichlet(alpha + m)

    # Sample from full conditional of B
    # ---------------------------------
    for k in range(N_K):
        n = np.zeros(N_W)

        # Gather sufficient statistics
        for v in range(N_W):
            for i in range(N_D):
                for l in range(N_W):
                    n[v] += (X[i, l] == v) and (Z[i, l] == k)

        # Resample word topic dist.
        B[k, :] = np.random.dirichlet(gamma + n)
        
print B

[[ 0.00946214  0.02248632  0.03198897  0.23357924  0.70248334]
 [ 0.16259179  0.39564105  0.35115256  0.00778928  0.08282532]]
