# Jeopardy Topics

In [1]:
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')
import pandas as pd
pd.set_option('display.max_colwidth', 800)

In [2]:
JEOPARDY_CSV =  '../data/jeopardy/Jeopardy.csv'

questions = pd.read_csv(JEOPARDY_CSV)

questions.columns = [x.strip() for x in questions.columns]

In [3]:
questions = questions.dropna(subset=['Question'])

In [4]:
questions['Category'].nunique()

27995

In [7]:
file='../data/jeopardy/JQuestions.txt'
questions['Question'].sample(frac=0.04,replace=False,random_state=0).to_csv(file)

f=open(file,'r',encoding='utf-8')
text=f.read()
f.close()

doc=nlp(text)
pos_list=['NOUN']
preproc_text=[]
preproc_sent=[]

for token in doc:
    if token.text!='\n':
        if not(token.is_stop) and not(token.is_punct) and token.pos_ in pos_list:
            preproc_sent.append(token.lemma_)
    else:
        preproc_text.append(preproc_sent)
        preproc_sent=[]

preproc_text.append(preproc_sent) #last sentence

print(preproc_text)



In [30]:
import tomotopy as tp
NUM_TOPICS=1000

mdl = tp.LDAModel(k=NUM_TOPICS,seed=1234)

for line in preproc_text:
    mdl.add_doc(line)
    
mdl.train(10)
    
for k in range(mdl.k):
    print('Top 7 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=7))

Top 7 words of topic #0
[('edition', 0.046412043273448944), ('type', 0.03483796492218971), ('city', 0.011689815670251846), ('web', 0.011689815670251846), ('country', 0.011689815670251846), ('creature', 0.011689815670251846), ('age', 0.011689815670251846)]
Top 7 words of topic #1
[('state', 0.08959732204675674), ('square', 0.0224832221865654), ('wood', 0.0224832221865654), ('plate', 0.0224832221865654), ('moon', 0.011297539807856083), ('o', 0.011297539807856083), ('hotel', 0.011297539807856083)]
Top 7 words of topic #2
[('poem', 0.03483796492218971), ('song', 0.03483796492218971), ('lady', 0.03483796492218971), ('year', 0.02326389029622078), ('pig', 0.02326389029622078), ('god', 0.011689815670251846), ('citation', 0.011689815670251846)]
Top 7 words of topic #3
[('fame', 0.011556064710021019), ('limit', 0.011556064710021019), ('country', 0.011556064710021019), ('sect', 0.011556064710021019), ('butterfly', 0.011556064710021019), ('hospital', 0.011556064710021019), ('mph', 0.01155606471002

[('martini', 0.02439320646226406), ('chain', 0.02439320646226406), ('door', 0.012257282622158527), ('son', 0.012257282622158527), ('applause', 0.012257282622158527), ('mother', 0.012257282622158527), ('animal', 0.012257282622158527)]
Top 7 words of topic #561
[('degree', 0.05040242150425911), ('singer', 0.020221328362822533), ('liquor', 0.020221328362822533), ('day', 0.020221328362822533), ('fire', 0.020221328362822533), ('age', 0.010160966776311398), ('relativity', 0.010160966776311398)]
Top 7 words of topic #562
[('author', 0.04247881844639778), ('day', 0.03188559412956238), ('order', 0.03188559412956238), ('rest', 0.021292373538017273), ('mean', 0.021292373538017273), ('farmer', 0.021292373538017273), ('diagram', 0.010699152946472168)]
Top 7 words of topic #563
[('company', 0.022997712716460228), ('media/2006', 0.022997712716460228), ('son', 0.011556064710021019), ('woman', 0.011556064710021019), ('pass', 0.011556064710021019), ('cartoon', 0.011556064710021019), ('snake', 0.01155606

In [31]:
print('Log perplexity=',mdl.ll_per_word)

Log perplexity= -14.396450040387437


In [32]:
bag_of_words=[word for sent in preproc_text for word in sent]
doc_inst = mdl.make_doc(bag_of_words)
np.argsort(np.array(mdl.infer(doc_inst)[0]))[::-1]

array([461, 234, 640, 679, 852, 978, 686, 862, 844, 748, 450, 504, 377,
       404, 992, 310, 840, 128, 565, 827, 899, 109, 751, 367, 782, 559,
       428, 996,  93, 684, 163, 121, 731, 519, 922, 639, 726, 645,  91,
       320, 405, 270, 815, 786, 126, 500, 512, 944, 113, 929, 412, 765,
       954, 553, 397, 332, 671, 498, 875, 792,  68, 296, 974, 592, 236,
        50, 798, 237, 140, 704, 646, 133, 456, 744, 204, 244, 691, 550,
       773, 607, 242, 386, 695, 104, 957,  47, 543, 895,  81, 724, 960,
       334, 642, 528, 132, 573, 110, 352, 321,  46, 819, 610, 314, 415,
       739, 846, 877, 619,  71, 985, 372, 103, 921,  98, 276, 624, 894,
       987,  25, 520, 452, 298, 778, 737, 218, 523,  37, 223, 809, 828,
       284, 659, 424, 709, 290, 505, 228, 972, 538, 219, 579,   1, 589,
       229, 823, 402, 700, 915, 353,  74,  15, 328, 701, 813, 338, 427,
       758, 118, 950, 356, 946, 469, 226, 488, 258, 481,  65, 812,   9,
       442, 664, 584, 266, 602, 423, 631, 187, 581, 774, 491, 21

In [33]:
print(mdl.get_topic_words(461, top_n=7))

[('city', 0.15946216881275177), ('device', 0.02001992054283619), ('force', 0.02001992054283619), ('character', 0.02001992054283619), ('death', 0.010059761814773083), ('person', 0.010059761814773083), ('language', 0.010059761814773083)]


In [34]:
print(mdl.get_topic_words(234, top_n=7))

[('year', 0.09871795773506165), ('group', 0.02968442067503929), ('child', 0.019822485744953156), ('murder', 0.019822485744953156), ('field', 0.019822485744953156), ('writing', 0.009960552677512169), ('memorial', 0.009960552677512169)]


In [36]:
print(mdl.get_topic_words(186, top_n=7))

[('dragon', 0.027016131207346916), ('power', 0.013575269840657711), ('flying', 0.013575269840657711), ('line', 0.013575269840657711), ('process', 0.013575269840657711), ('crystal', 0.013575269840657711), ('freestyle', 0.013575269840657711)]
