In [2]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

import pickle
from preprocess import *

In [3]:
n_components = 51

In [4]:
def getQLPSortedIndexList(question, cv, tf, lda,cwlist, lam, u):
    # should return list of 
    simiList = []
    qes = cv.transform([question.content])[0].toarray()[0]
    for row in tf:
        prob = 1
        answerDis = lda.transform([row]).tolist()[0]
        for i in range(len(qes)):
            if(qes[i] == 0): continue
            pseudo = (row[i] + u * cwlist[i]) / (u + sum(row))
            
            # calculate plda
            plda = 0
            
            # traverse topic
            for topic_idx, topic in enumerate(lda.components_):
                plda += answerDis[topic_idx] * topic.item(i)*qes[i]
                
            prob *= lam*pseudo + (1-lam) * plda
        
        simiList.append(prob)
    # sort the similarity list, and return the index list.
    res = list(range(len(simiList)))
    return sorted(res, key = lambda i : simiList[i], reverse= True)
    

In [5]:
X, cv, answers, questions, word_ratio, answer_mapping = generate_count_vectorizer()
X = X.toarray()

In [6]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=300,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [7]:
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=300, mean_change_tol=0.001,
             n_components=51, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [8]:
row = X[2]
print(row)
d = lda.transform([row]).tolist()[0]
print(d)

[0 0 0 ... 0 0 0]
[0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.5286416063119729, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.030683403003062045, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.12170893900167384, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.00025799793601663577, 0.30684014869050935, 0.0002579

In [9]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

In [10]:
ques = answers[17]

In [17]:
def evalQuestion(index):
    l = getQLPSortedIndexList(questions[index], cv, X, lda, word_ratio, 0.3, 1)
    tar = answer_mapping[questions[index].peer_idx]
    # find the anser
    for i in range(len(l)):
        if tar == l[i]:
            return i

In [25]:
def evalCheck(index):
    ques = answers[index]
    l = getQLPSortedIndexList(ques, cv, X, lda, word_ratio, 0.3, 1)
    tar = answer_mapping[answers[index].idx]
    # find the anser
    for i in range(len(l)):
        if tar == l[i]:
            return i

In [30]:
for i in range(50, 80):
    print(evalQuestion(i))

172
0
24
151
451
590
41
2
541
495
555
50
96
14
420
0
10
547
469
321
464
17
537
33
579
302
392
18
124
486


In [33]:
for i in range(30, 50):
    print(evalCheck(i))

0
0
0
14
2
0
1
4
8
0
0
38
35
2
0
4
2
2
123
0


In [None]:
l = getQLPSortedIndexList(ques, cv, X, lda, word_ratio, 0, 1)
print(l)

In [None]:
sum(cv.transform([ques.content])[0].toarray()[0])

In [None]:
tf_feature_names = cv.get_feature_names()
print_top_words(lda, tf_feature_names, 10)

In [None]:
simiList = [3,2,1,5,9]
res = list(range(len(simiList)))
print(sorted(res, key = lambda i : simiList[i], reverse= True))