In [None]:
!pip install pattern

In [None]:
!pip install nltk

In [1]:
#generic scifeed document analyzer
#read in the discipline specific config file and it will load the data and go from there.

from pattern.web import Newsfeed
from pattern.web import cache
from pattern.en import parse, pprint, tag
from pattern.web import download, plaintext
import numpy as np
import nltk.stem
import pickle
import random
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
def load_docs(path, name):
    filename = path+name+".p"
    fileobj = open(filename, "rb")
    z = fileobj.read()
    lst = pickle.loads(str(z))
    titles = []
    sitenames = []
    abstracts = []
    for i in range(0, len(lst)):
        titles.extend([lst[i][0]])
        sitenames.extend([lst[i][1]])
        abstracts.extend([lst[i][2]])
        
    print("done loading "+filename)
    return abstracts, sitenames, titles

In [3]:
def read_config(subj, basepath):
    docpath =basepath+ "/config_"+subj+".json"
    with open(docpath, 'rb') as f:
        doc = f.read() 
    z =json.loads(doc)
    subject = z['subject']
    loadset = z['loadset']
    subtopics = []
    for w in z['supertopics']:
        subtopics.extend([(w[0], w[1])])
    return subject, loadset, subtopics


In [4]:
titles, sitenames, disp_title = load_docs("/home/jovyan/work/", 
                                     "phy_train")

done loading /home/jovyan/work/phy_train.p


In [21]:
subj, loadset, supertopics = read_config("phy", "/home/jovyan/work")
#special case to eliminate other-phy
supertopics = supertopics[0:7]

In [22]:
supertopic_names = [x[0] for x in supertopics]
print supertopic_names

[u'Astro', u'GenRelQGr', u'Cond Matter', u'high eng', u'MathPh', u'quantum', u'ed-physics']


In [23]:
#!pip install gensim

In [24]:
from sklearn.cluster import KMeans
from gensim import corpora, models, similarities


In [25]:
def buildVectorizer(bio):
    nounlist = []
    for doc in bio:
        st = ""
        for (word, pos) in tag(doc):
            if pos in ["JJ", "NNS", "NN", "NNP"]:
                st = st+word+" "
            else:
                if st!= "":
                    st = st[0:-1]+" "
                    #print "got one"
        nounlist.extend([st])
    sciencestopwords = set([u'model','according', 'data', u'models', 'function', 'properties', 'approach', 'parameters', 
                    'systems', 'number', 'order', u'data', 'analysis', u'information', u'journal',
                    'results','using','research', 'consumers', 'scientists', 'model', 'models', 'journal',
                    'researchers','paper','new','study','time','case', 'simulation', u'simulation', 'equation',
                    'based','years','better', 'theory', 'particular','many','due','much','set', 'studies', 'systems',
                    'simple', 'example','work','non','experiments', 'large', 'small', 'experiment', u'experiments',
                    'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    #now doing the new vectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    english = nltk.corpus.stopwords.words('english')
    newstop = english+list(sciencestopwords) 
    vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words=newstop, decode_error='ignore')
    X = vectorizer.fit_transform(nounlist)
    Xinv = vectorizer.inverse_transform(X)
        #X is a sparse matrix of docs x vocab size (7638). 
    #so X[doc_num] is the sparse vector of its words. 
    #the ||X[doc_num]|| = 1 there are 7638 unique words and 755 docs. with a total number of 38888 non-zeros.
    #Xinv[doc_num] is the list of words in the doc.
     
    return nounlist, vectorizer, X, Xinv


In [26]:
#nltk.download()

In [27]:
nounlist, vectorizer, X, Xinv = buildVectorizer(titles)

In [28]:
texts = [ item.tolist() for item in Xinv]
print len(texts)
print len(nounlist)

2000
2000


In [29]:
#returns a list consiting of tuples
#   (subtopic_name, [list of title-numbers in that subtopic])
def make_topic_lists_sets(nounlist, disp_title, supertopics):
    topic_lists = []
    for topic in supertopics:
        #print topic[0]
        subtop_list = []
        for i in range(0,len(nounlist)):
            t = disp_title[i]
            for j in topic[1]:
                x = t.find(j) 
                if x  > 0:
                    subtop_list.extend([i])
        topic_lists.extend([(topic[0], subtop_list)])
        #print subtop_list
    return topic_lists #returns the list of tuples for the training sets
    

# for each subtopic
#   (subtopic name, training set items, list of the ARXiV sub areas for this supertopic )
def fillTopicTables(nounlist, disp_title, supertopics ):
    toplsts = make_topic_lists_sets(nounlist, disp_title, supertopics)
    super_topics = []   
    for i in range(0,len(toplsts)):
        topl = toplsts[i]
        area = topl[0]
        items = topl[1]
        z = int(3.0*len(items)/4)
        tupletitles = []
        tuplenums = []
        for r in range(0,z):
            w = int(random.random()*len(items))
            tupletitles.extend([nounlist[items[w]]])
            tuplenums.extend([items[w]])
        tup = (area, tupletitles, supertopics[i][1])
        super_topics.extend([tup])
    
    for top in super_topics:
        print top[0] + " "+ str(len(top[1]))+ " " + str(top[2])

    return super_topics

In [30]:
def makeInvertedTrainingList(super_topics):
    #create list of all training set items
    #  (doc, docno, subtopicname, subtopic-index)
    lis = []
    n = 0
    for top in super_topics:
        items = top[1]
        for i in range(0, len(items)):
            lis.extend([(items[i], top[0], n)])
        n = n+1
    return lis
        

In [31]:
def compute_centroid(items, vectorizer):
    stmt = items[0]
    #stmt = titles[items[0]]
    count = len(items)
    #print stmt
    #print count
    vec = vectorizer.transform([stmt])[0]
    for i in range(1,count):
        stmt = items[i]
        vec2 = vectorizer.transform([stmt])[0]
        vec = vec+vec2
    z = np.linalg.norm(vec.toarray())
    return vec/z

In [32]:
#the X and titles used here should be for the training list
def makeguess(statement, km, vectorizer, lsi, dictionary, index_lsi, ldamodel, index_lda, X, titles):
    new_title_vec = vectorizer.transform([statement])
    new_title_label = km.predict(new_title_vec)
    similar_indicies = (km.labels_==new_title_label).nonzero()[0]
    similar = compdist(new_title_vec, similar_indicies, X, titles)
    kmeans_items = list(x[1] for x in similar)

    #now for lsi items
    new_title_vec_lsi = dictionary.doc2bow(statement.lower().split())
    new_title_lsi = lsi[new_title_vec_lsi]
    sims = index_lsi[new_title_lsi] # perform a similarity query against the corpus
    simlist = list(enumerate(sims))
    topten = sorted(simlist, key = lambda x: x[1], reverse = True)[0:30]
    lsi_items = list(x[0] for x in topten)

    #now do lda
    new_title_vec_lda = dictionary.doc2bow(statement.lower().split())
    new_title_lda = ldamodel[new_title_vec_lda]
    sims = index_lda[new_title_lda] # perform a similarity query against the corpus
    simlist = list(enumerate(sims))
    topten = sorted(simlist, key = lambda x: x[1], reverse = True)[0:30]
    lda_items = list(x[0] for x in topten)

    
    dist_lsi = compdist(new_title_vec, lsi_items, X, titles)
    #dist_lsi = []
    dist_km = compdist(new_title_vec, kmeans_items, X, titles)
    #dist_km = []
    #dist_lda = compdist(new_title_vec, lda_items, X, titles)
    dist_lda = []
    s = dist_lda + dist_km + dist_lsi
    #print s
    d1 = sorted(s, reverse=True)
    d = [x for x in d1 if x[0] > 0.00]

    notdups = []
    for x in d:
        if x not in notdups:
            notdups.extend([x])
    return notdups


    

In [33]:
from sklearn.cluster import KMeans
from gensim import corpora, models, similarities

def big_build_analizers( subj):    
    #subject, loadset, basepath, supertopics =read_config(doc_service, subj)
    # for pushing to blob storage set basepath = ""
    subject=subj
    basepath = "/home/jovyan/work/celery/"
    cvectpath = basepath+"count_vectorizer-"+subject+".p"
    tfidftranpath = basepath+"tfidf_transformer-"+subject+".p"
    RFpath = basepath+"random_forest-"+subject+".p"
    namspath = basepath+"supertopic_names-"+subject+".p"
    GBpath = basepath+"gradientboosting-"+subject+".p"
    vectpath = basepath+"vectorizer-"+subject+".p"
    lsimodpath = basepath+"lsimod-"+subject+".p"
    lsiindpath = basepath+"lsiind-"+subject+".p"
    ldamodpath = basepath+"ldamod-"+subject+".p"
    ldaindpath = basepath+"ldaind-"+subject+".p"
    kmpath =  basepath+"km-"+subject+".p"
    ncentpath = basepath+"ncent-"+subject+".p"
    Xpath = basepath+"Xtrain-"+subject+".p"
    trainsetpath = basepath+"Tset-"+subject+".p"
    dictpath = basepath+"Dict-"+subject+".p"
    
    supertopic_names = [ x[0] for x in supertopics]
    fo = open(namspath,'wb') 
    pickle.dump(supertopic_names,fo)
    fo.close()
    #doc_service.put_block_blob_from_bytes("sciml", namspath, pickled)
    
    #titles, sitenames, disp_title = load_data2(loadset)
    #print "data loaded"
    #create a version of the docs "nounlist" where each item is filtered
    #through the stoplist.  Vectorizer is built from that and Xinv is a list
    #where each element is the an array of the words in that doc.
    #nounlist, vectorizer, X, Xinv = buildVectorizer(titles)
    dictwords = [ item.tolist() for item in Xinv]
    dictionary = corpora.Dictionary(dictwords)
    print(dictionary)
    print "dictionary built"
    dictcorpus = [dictionary.doc2bow(text) for text in dictwords]
    tfidf = models.TfidfModel(dictcorpus)

    
    #create training set
    print "creating training set"
    trainingSets = fillTopicTables(nounlist, disp_title, supertopics)
    traininglist = makeInvertedTrainingList(trainingSets)
    traindocs = [tex[0] for tex in traininglist]
    trainlable = [tex[1] for tex in traininglist]
    traintarget = [tex[2] for tex in traininglist]
    corpus = [dictionary.doc2bow(text.lower().split()) for text in traindocs]
    corpus_tfidf = tfidf[corpus]
    print len(traindocs)
    
    #create lsi
    print "creating lsi model"
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10) # initialize an LSI transformation
    corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
    index_lsi = similarities.MatrixSimilarity(corpus_lsi)

    #create lda
    print "creating lda model"
    lda = models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes = 10, iterations = 500) # initialize an LSI transformation
    corpus_lda = lda[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
    index_lda = similarities.MatrixSimilarity(corpus_lda)
    
    #create km for full list of documents 
    num_clusters = 10
    km = KMeans(n_clusters=num_clusters, init='random', n_init=1, verbose=1, tol=.00001)
    Xtrain = vectorizer.fit_transform(traindocs)
    Xinvtrain = vectorizer.inverse_transform(Xtrain)
    print Xtrain.shape
    print Xtrain[1].shape
    km.fit(Xtrain)
    #print Xtrain
    print "k-means analizer built"
    #pickled = pickle.dumps(Xtrain)
    fo = open(Xpath,'wb') 
    pickle.dump(Xtrain,fo)
    fo.close()
    
    fo = open(trainsetpath,'wb') 
    pickle.dump(trainingSets,fo)
    fo.close()
    #pickled = pickle.dumps(trainingSets)
    #doc_service.put_block_blob_from_bytes("sciml", trainsetpath, pickled)
   
    #dumping km, lda, lsi
    fo = open(kmpath,'wb') 
    pickle.dump(km,fo)
    fo.close()
    #pickled = pickle.dumps(km)
    #doc_service.put_block_blob_from_bytes("sciml", kmpath, pickled)
    
    fo = open(lsimodpath,'wb') 
    pickle.dump(lsi,fo)
    fo.close()
    #pickled = pickle.dumps(lsi)
    #doc_service.put_block_blob_from_bytes("sciml", lsimodpath, pickled)
    
    fo = open(lsiindpath,'wb') 
    pickle.dump(index_lsi,fo)
    fo.close()
    #pickled = pickle.dumps(index_lsi)
    #doc_service.put_block_blob_from_bytes("sciml", lsiindpath, pickled)
    
    fo = open(ldamodpath,'wb') 
    pickle.dump(lda,fo)
    fo.close()
    #pickled = pickle.dumps(lda)
    #doc_service.put_block_blob_from_bytes("sciml", ldamodpath, pickled)
    
    
    fo = open(ldaindpath,'wb') 
    pickle.dump(index_lda,fo)
    fo.close()
    #pickled = pickle.dumps(index_lda)
    #doc_service.put_block_blob_from_bytes("sciml", ldaindpath, pickled)
    
    fo = open(vectpath,'wb') 
    pickle.dump(vectorizer,fo)
    fo.close()
    #pickled = pickle.dumps(vectorizer)
    #doc_service.put_block_blob_from_bytes("sciml", vectpath, pickled)
    #pickle.dump( vectorizer, open( vectpath, "wb" ) )
    
    #here is where we create the new centroid list
    new_centroids = []
    for ts in trainingSets:
        print "computing centroid for "+ ts[0]
        cent = compute_centroid(ts[1], vectorizer)
        new_centroids.extend([cent])
    print "dumping new centroids"
    
    fo = open(ncentpath,'wb') 
    pickle.dump(new_centroids,fo)
    fo.close()
    #pickled = pickle.dumps(new_centroids)
    #doc_service.put_block_blob_from_bytes("sciml", ncentpath, pickled)
       
    
    training_set_data = np.array(traindocs)
    training_set_target = traintarget
    print "total training set size = "+ str(len(training_set_data))
    
    
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(training_set_data)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    clfrf = RandomForestClassifier(n_estimators = 100)
    clfrf.fit(X_train_tfidf, training_set_target)
    
    print "dumping vectorizer, transformer and clfrf"
    fo = open(cvectpath,'wb') 
    pickle.dump(count_vect,fo)
    fo.close()
    #pickled = pickle.dumps(count_vect)
    #doc_service.put_block_blob_from_bytes("sciml", cvectpath, pickled)
    #pickle.dump( count_vect, open( cvectpath, "wb" ) )
    
    fo = open(tfidftranpath,'wb') 
    pickle.dump(tfidf_transformer,fo)
    fo.close()
    #pickled = pickle.dumps(tfidf_transformer)
    #doc_service.put_block_blob_from_bytes("sciml",  tfidftranpath, pickled)
    #pickle.dump( tfidf_transformer, open( tfidftranpath, "wb" ) )
    
    fo = open(RFpath,'wb') 
    pickle.dump(clfrf,fo)
    fo.close()
    #pickled = pickle.dumps(clfrf)
    #doc_service.put_block_blob_from_bytes("sciml", RFpath, pickled)
    #pickle.dump( clfrf, open( RFpath, "wb" ) )
    fo = open(dictpath,'wb') 
    pickle.dump(dictionary,fo)
    fo.close()
    
    return

In [34]:
print len(nounlist)

2000


In [35]:
len(disp_title)

2000

In [36]:
big_build_analizers( "phy")

Dictionary(11908 unique tokens: [u'writings', u'testbeds', u'rovelli', u'yellow', u'four']...)
dictionary built
creating training set
Astro 550 [u'astro-ph.CO', u'astro-ph.EP', u'astro-ph.GA', u'astro-ph.HE', u'astro-ph.IM', u'physics.space-ph', u'astro-ph.SR']
GenRelQGr 333 [u'gr-qc']
Cond Matter 105 [u'cond-mat.other', u'cond-mat.mes-hall', u'cond-mat.quant-gas', u'cond-mat.soft', u'cond-mat.stat-mech', u'physics.plasm-ph', u'cond-mat.str-el', u'cond-mat.mtrl-sci', u'cond-mat.supr-con']
high eng 221 [u'hep-ex', u'hep-ph', u'hep-th', u'nucl-th', u'physics.acc-ph', u'physics.ao-ph', u'physics.atm-clus', u'physics.atom-ph']
MathPh 105 [u'math-ph', u'physics.comp-ph', u'physics.data-an', u'physics.flu-dyn']
quantum 59 [u'quant-ph']
ed-physics 121 [u'physics.pop-ph', u'physics.soc-ph', u'physics.hist-ph', u'physics.ed-ph', u'physics.ins-det', u'physics.gen-ph']
1494
creating lsi model
creating lda model
(1494, 8685)
(1, 8685)
Initialization complete
Iteration  0, inertia 2755.161
Iteratio

In [37]:
#for an item in the titles list compare its nonlist vector to the list of centroid and return 
#the sorted list (closest first)
def cosdist(vectorizer, itemno, centroids, nounlist):
    new_title_vec = vectorizer.transform([nounlist[itemno]])
    #new_title_vec = vectorizer.transform([titles[itemno]])
    scores = []
    for i in range(0, len(centroids)):
        dist = np.dot(new_title_vec.toarray()[0], centroids[i].toarray()[0])
        scores.extend([(dist, i)])
    scores = sorted(scores,reverse=True) 
    return scores

#this version works for an arbitrary text string
def cosdistString(vectorizer, item, centroids):
    new_title_vec = vectorizer.transform([item])
    scores = []
    for i in range(0, len(centroids)):
        dist = np.dot(new_title_vec.toarray()[0], centroids[i].toarray()[0])
        scores.extend([(dist, i)])
    scores = sorted(scores,reverse=True) 
    return scores

#returns list of tuples (distance, sitename, itemno, abstract for item)
def compdist(new_title_vec, indexlist, X, titles):
    similar = []
    for i in indexlist:
        if np.linalg.norm(X[i].toarray()) != 0.0:
            #dist = np.linalg.norm((new_title_vec-X[i]).toarray())
            dist = np.dot(new_title_vec.toarray()[0],X[i].toarray()[0])
            similar.append((dist,i, titles[i]))
    similar = sorted(similar,reverse=True) 
    return similar

In [38]:
#subject, loadset, basepath, supertopics =read_config(doc_service, "all4")
    # for pushing to blob storage set basepath = ""
basepath = "/home/jovyan/work/celery/"
subject = "phy"
cvectpath = basepath+"count_vectorizer-"+subject+".p"
tfidftranpath = basepath+"tfidf_transformer-"+subject+".p"
rfpath = basepath+"random_forest-"+subject+".p"
namspath = basepath+"supertopic_names-"+subject+".p"
GBpath = basepath+"gradientboosting-"+subject+".p"
vectpath = basepath+"vectorizer-"+subject+".p"
lsimodpath = basepath+"lsimod-"+subject+".p"
lsiindpath = basepath+"lsiind-"+subject+".p"
ldamodpath = basepath+"ldamod-"+subject+".p"
ldaindpath = basepath+"ldaind-"+subject+".p"
kmpath =  basepath+"km-"+subject+".p"
ncentpath = basepath+"ncent-"+subject+".p"
Xpath = basepath+"Xtrain-"+subject+".p"
trainsetpath = basepath+"Tset-"+subject+".p"
dictpath = basepath+"Dict-"+subject+".p"
with open(cvectpath, 'rb') as f:
    cvecb = f.read()
with open(vectpath, 'rb') as f:
    vecb = f.read()
with open(tfidftranpath, 'rb') as f:
    tranb = f.read()
with open(namspath, 'rb') as f:
    groupb = f.read()
with open(rfpath, 'rb') as f:
    rfbb = f.read()
with open(lsimodpath, 'rb') as f:
    lsimodb = f.read()
with open(lsiindpath, 'rb') as f:
    lsib = f.read()
with open(ldamodpath, 'rb') as f:
    ldamodb = f.read()
with open(ldaindpath, 'rb') as f:
    ldab = f.read()
with open(kmpath, 'rb') as f:
    kmpathb = f.read()
with open(ncentpath, 'rb') as f:
    ncentb = f.read()
with open(Xpath, 'rb') as f:
    Xb = f.read()
with open(trainsetpath, 'rb') as f:
    trainingb = f.read()
with open(dictpath, 'rb') as f:
    dictb = f.read()

c_vectorizer = pickle.loads(str(cvecb))
tfidf_transformer = pickle.loads(str(tranb))
groupnames = pickle.loads(str(groupb))
clfrf = pickle.loads(str(rfbb))
vectorizer = pickle.loads(str(vecb))
new_centroids = pickle.loads(str(ncentb))
index_lsi = pickle.loads(str(lsib))
lsi = pickle.loads(str(lsimodb))
index_lda = pickle.loads(str(ldab))
lda = pickle.loads(str(ldamodb))
km = pickle.loads(str(kmpathb))
Xtrain = pickle.loads(str(Xb))
trainingSets = pickle.loads(str(trainingb))
dictionary = pickle.loads(str(dictb))

      


In [39]:
traininglist = makeInvertedTrainingList(trainingSets)
traindocs = [tex[0] for tex in traininglist]
trainlabel = [tex[1] for tex in traininglist]
traintarget = [tex[2] for tex in traininglist]


In [40]:
print traindocs[33]
print trainlabel[33]

non-detection \nu Green Bank Telescope observations ultra-faint dwarf spheroidal galaxy Segue non-negligible halo magnetic field Milky Way bounds particle dark matter properties model galaxy Einasto dark matter profile synchrotron flux dark matter annihilation function magnetic field strength B diffusion coefficient D_0 particle mass m_\chi different annihilation channels data disfavor annihilations m_\chi sensitive b \bar b channel fiducial B \sim Segue proximity Milky Way models annihilation \tau^+\tau ^ m_\chi intermediate value D_0 consistency data compelling limits WIMP annihilation \mu^+\mu ^ m_\chi confidence D_0 Milky Way value B 
Astro


In [41]:
statement = ""
statement = traindocs[33]
gl = makeguess(statement, km, vectorizer, lsi, dictionary, index_lsi, lda, index_lda, Xtrain, traindocs)

print len(gl)

112


In [42]:
for i in range(0,len(gl)):
    print str(gl[i][0])+ "  "+ str(gl[i][1])+ "  "+trainlabel[gl[i][1]]
    

1.0  523  Astro
1.0  473  Astro
1.0  108  Astro
1.0  33  Astro
0.333405502511  989  high eng
0.276095396983  277  Astro
0.276095396983  27  Astro
0.276095396983  8  Astro
0.244567490564  492  Astro
0.244567490564  78  Astro
0.195728070244  1208  high eng
0.172697944777  1184  high eng
0.172697944777  1131  high eng
0.168100883865  1174  high eng
0.164709975197  1040  high eng
0.164709975197  1036  high eng
0.164116095207  1169  high eng
0.149216749855  1071  high eng
0.149216749855  1046  high eng
0.142829297232  351  Astro
0.12957074053  449  Astro
0.128809571458  71  Astro
0.126327319563  1062  high eng
0.126327319563  1001  high eng
0.120333277516  867  GenRelQGr
0.118888489006  478  Astro
0.114896982065  245  Astro
0.112429159679  748  GenRelQGr
0.109364763421  463  Astro
0.109364763421  403  Astro
0.109364763421  292  Astro
0.105051499835  879  GenRelQGr
0.104909179466  1068  high eng
0.104272141967  484  Astro
0.103594373668  29  Astro
0.0996341331268  1033  high eng
0.0981179990

In [43]:
#now use random forest classifier

In [44]:
stmt = "Einsteins general theory of relativity explains gravity as curvature in spacetime"
#stmt = traindocs[33]
X_new_counts = c_vectorizer.transform([stmt])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predictedrf = clfrf.predict(X_new_tfidf)


In [45]:
print groupnames[predictedrf[0]]

GenRelQGr


In [46]:
#now do centroid

In [47]:
item = "genome dna rna gene sequence alignment express"
item = traindocs[33]
z = cosdistString(vectorizer, item, new_centroids)

In [48]:
print z[0:4]

[(0.20095832754718015, 0), (0.12056768576374675, 3), (0.089435731081336012, 1), (0.061377081514750839, 2)]


In [49]:
print groupnames[z[0][1]]

Astro


In [50]:
def bigpredict(statement, km, vectorizer, lsi, dictionary, index_lsi, lda, index_lda, 
               Xtrain, traindocs, c_vectorizer, clfrf, tfidf_transformer, new_centroids  ):
    
    bestof3l = makeguess(statement, km, vectorizer, lsi, dictionary, index_lsi, 
                        lda, index_lda, Xtrain, traindocs)
    if len(bestof3l)> 0:
        best = trainlabel[bestof3l[0][1]]
    else:
        best = "?"
    if len(bestof3l) > 1:
        nextb = trainlabel[bestof3l[1][1]]
    else:
        nextb = "?"
    
    X_new_counts = c_vectorizer.transform([statement])
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predictedrf = clfrf.predict(X_new_tfidf)
    rf = groupnames[predictedrf[0]]
    
    z = cosdistString(vectorizer, statement, new_centroids)
    cent = groupnames[z[0][1]]
    
    return rf, best, nextb, cent
    
    
    


In [51]:
print len(traindocs)

1494


In [52]:
statement = traindocs[33]
print trainlabel[33]
rf, best, nextb, cent = bigpredict(statement, km, vectorizer, lsi, dictionary, index_lsi, 
    lda, index_lda, Xtrain, traindocs, c_vectorizer, clfrf, tfidf_transformer, new_centroids  )
print "rf="+rf+" best="+best+" nextb="+nextb+" cent="+cent

Astro
rf=Astro best=Astro nextb=Astro cent=Astro


In [53]:
topl = []
for top in supertopics:
    s = " "+top[0].lower()+". "
    for x in top[1]:
        s = s + x + " "
    topl.extend([(top[0], s)])
topdict = {x[0]:x[1] for x in topl}  


In [54]:
def findTopic(topdict, groupnames, found):
    for nm in groupnames:
        if topdict[nm].find(found)> 0:
            return nm
    return "none"


In [55]:
loadset

[u'astro-ph.CO',
 u'astro-ph.EP',
 u'astro-ph.GA',
 u'astro-ph.HE',
 u'astro-ph.IM',
 u'astro-ph.SR',
 u'cond-mat.dis-nn',
 u'cond-mat.mes-hall',
 u'cond-mat.mtrl-sci',
 u'cond-mat.other',
 u'cond-mat.quant-gas',
 u'cond-mat.soft',
 u'cond-mat.stat-mech',
 u'cond-mat.str-el',
 u'cond-mat.supr-con',
 u'nucl-th',
 u'quant-ph',
 u'gr-qc',
 u'hep-ex',
 u'hep-ph',
 u'hep-th',
 u'math-ph',
 u'physics.acc-ph',
 u'physics.ao-ph',
 u'physics.atm-clus',
 u'physics.atom-ph',
 u'physics.bio-ph',
 u'physics.chem-ph',
 u'physics.class-ph',
 u'physics.comp-ph',
 u'physics.data-an',
 u'physics.ed-ph',
 u'physics.flu-dyn',
 u'physics.gen-ph',
 u'physics.geo-ph',
 u'physics.hist-ph',
 u'physics.ins-det',
 u'physics.med-ph',
 u'physics.optics',
 u'physics.plasm-ph',
 u'physics.pop-ph',
 u'physics.soc-ph',
 u'physics.space-ph']

In [56]:
def load_data2(titles, sitenames, disp_title, loadset):
    s = set(loadset)
    ti = []
    si = []
    di = []
    for i in range(len(titles)):
        if sitenames[i] in s:
            ti.append(titles[i])
            si.append(sitenames[i])
            di.append(disp_title[i])
    return ti, si, di    

In [57]:
titles, sitenames, disp_title = load_data2(titles, sitenames, disp_title, loadset)
print "data loaded"

data loaded


In [58]:
print len(titles)

2000


In [59]:

outTable = {gname:0 for gname in groupnames}
outCount = {gname:0 for gname in groupnames}
rfWin = {gname:0 for gname in groupnames}
bestWin = {gname:0 for gname in groupnames}
nextWin = {gname:0 for gname in groupnames}
centWin = {gname:0 for gname in groupnames}
falseposBes = {gname:0 for gname in groupnames}
falseposRf = {gname:0 for gname in groupnames}
bothAgr = {gname:0 for gname in groupnames}
listTable = {gname:[] for gname in groupnames}
num = 0.
correct = 0.
dataIn = []
ans = []
for i in range(0, len(titles)):
    statement = titles[i]
    rf, best, nextb, cent = bigpredict(statement, km, vectorizer, lsi, dictionary, index_lsi, 
    lda, index_lda, Xtrain, traindocs, c_vectorizer, clfrf, tfidf_transformer, new_centroids  )
    tup = (rf,best,nextb, cent, i, disp_title[i], titles[i], sitenames[i])
    #print "rf="+rf[0:5]+" best="+best[0:5]+" nextb="+nextb[0:5]+" cent="+cent[0:5]
    dt = disp_title[i]
    j = dt.find("[")
    if j > 0:
        jj = dt.find("]")
        q = dt[j+1:jj]
        #print q
    else:
        q = "none"
    if q != "none":
        p = findTopic(topdict, groupnames, q)
        #print "p ="+p
        if  p != "none":
            num = num+1.0
            outCount[p] = outCount[p]+1
            if p == rf or p == best:
                correct = correct+1
                outTable[p] = outTable[p]+1
                #prstr = "best="+best[0:4] + "\tsecond="+nextb[0:4]+ "\trb="+t[0:4]+"\tsec="+sec[0:4]
                #print prstr + "\tfound = "+ findTopic(topdict, groupnames, q)[0:4] +" "+str(i)
                if p == best:
                    bestWin[p] = bestWin[p]+1
                if p == rf:
                    rfWin[p] = rfWin[p]+1
                if p == cent:
                    centWin[p] = centWin[p]+ 1 
            #dataIn.extend([(nameIndex[best],nameIndex[nextb],nameIndex[t],nameIndex[sec])])
            #ans.extend([nameIndex[findTopic(topdict, groupnames, q)]])
            if( p == best and p == rf):
                bothAgr[p] = bothAgr[p]+ 1
            if (p != best):
                falseposBes[best] = falseposBes[best]+1 
            if (p != rf):
                falseposRf[rf] = falseposRf[rf]+1 
            # this will only work for the labled data.  save it under "true" category
            listTable[p].extend([tup])
    # the following is the way it should be done
    #listTable[best].extend([tup])
    #if best != nextb:
        #listTable[nextb].extend([tup])
print "correct rate = "+ str(100.0*correct/num)
for nam in groupnames:
    if outCount[nam]> 0:
        ans = nam[0:9] + " \t= "+ str(100.0*outTable[nam]/outCount[nam])[0:5]
        #ans = ans+  " \tboth agree ="+str(100.0*bothAgr[nam]/outCount[nam])[0:5]
        ans = ans+ " \t rf win = "+str(100.0*rfWin[nam]/outCount[nam])[0:5]
        ans = ans+ " \t best win = "+str(100.0*bestWin[nam]/outCount[nam])[0:5]
        ans = ans + "\t cent win " +str(100.0*centWin[nam]/outCount[nam])[0:5]
        #ans = ans+ " \t false pos rf = "+str(100.0*falseposRf[nam]/num)[0:5]
        #ans = ans+ " \t false pos best = "+str(100.0*falseposBes[nam]/num)[0:5]
        print ans
    #pickled = pickle.dumps(listTable[nam])
    #path = "labeled_dump_"+subj+"_subtopic_"+nam+".p"
    #path = "dump_"+subj+"_subtopic_"+nam+".p"
    #doc_service.put_block_blob_from_bytes("sciml", path, pickled)


correct rate = 88.1968859869
Astro 	= 97.94 	 rf win = 96.70 	 best win = 93.00	 cent win 88.75
GenRelQGr 	= 93.69 	 rf win = 80.40 	 best win = 85.36	 cent win 85.13
Cond Matt 	= 75.17 	 rf win = 41.84 	 best win = 75.17	 cent win 68.08
high eng 	= 77.62 	 rf win = 46.10 	 best win = 73.89	 cent win 57.62
MathPh 	= 74.46 	 rf win = 50.35 	 best win = 70.21	 cent win 65.24
quantum 	= 65.82 	 rf win = 48.10 	 best win = 65.82	 cent win 63.29
ed-physic 	= 82.71 	 rf win = 64.81 	 best win = 82.09	 cent win 74.69


In [None]:
i