## Winning Jeopardy

In [7]:
import pandas as pd
import numpy as np

In [8]:
jeopardy = pd.read_csv('jeopardy.csv')
new_columns = []
for i in jeopardy.columns:
    new_columns.append(i.strip())
    
jeopardy.columns = new_columns

In [9]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [10]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
Air Date       19999 non-null object
Round          19999 non-null object
Category       19999 non-null object
Value          19999 non-null object
Question       19999 non-null object
Answer         19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [11]:
import re
def normlz(txt):
    
    txt = re.sub("[^A-Za-z0-9\s]", "", txt).lower()
    return txt

In [12]:
jeopardy['clean_question']=jeopardy['Question'].apply(normlz)

In [13]:
jeopardy['clean_answer']=jeopardy['Answer'].apply(normlz)

In [14]:
jeopardy['clean_answer'].head()

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

In [15]:
jeopardy['Value']=jeopardy['Value'].str.replace('$','')
jeopardy['Value']=pd.to_numeric(jeopardy['Value'], errors='coerce')

In [16]:
jeopardy['clean_value']=jeopardy['Value'].fillna(0)

In [17]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [29]:
jeopardy.head(6)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question,question_overlap
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0.0,0.0,0.0
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,200.0,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200.0,0.0,0.0
19302,10,1984-09-21,Double Jeopardy!,1789,200.0,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200.0,0.0,0.0
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,200.0,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200.0,0.0,0.5
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,200.0,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200.0,0.0,0.0
19305,10,1984-09-21,Double Jeopardy!,HOMONYMS,200.0,Hindu hierarchy or a play's actors,a caste (cast),hindu hierarchy or a plays actors,a caste cast,200.0,0.333333,0.0


In [19]:
def deduction(row):
    split_answer = row['clean_answer'].split(" ")
    split_question = row['clean_question'].split(" ")
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer)==0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count/len(split_answer)

In [20]:
jeopardy["answer_in_question"] = jeopardy.apply(deduction, axis=1)

In [21]:
jeopardy["answer_in_question"].mean()

0.06049325706933587

In [37]:
question_overlap = []
terms_used = set([])

In [38]:
for i, row in jeopardy.sort_values('Air Date').iterrows():
    split_words = row['clean_question'].split(' ')
    split_words = [w for w in split_words if len(w)>5]
    #print(split_words)
    match_count = 0
    for word in split_words: # count unique words in question
        if word in terms_used: # count if this question's words were repeated previously. This however does not mean that the 
                                # exact question was repeated. Instead it counts if the word was repeated previously, which is 
                                # not a good measure.
            match_count += 1
    for word in split_words:
        terms_used.add(word)
    if len(split_words) > 0:
        match_count /= len(split_words) # percentage of how many words exist in the dictionary for this question
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.6876924200174069

Aroud 68% of the words in questions are repeated from previous questions. This is not an indication of repeated questions. This only means that the vocabulary used in questions are 68% similar.

In [55]:
def low_high(row):
    return (1 if row['clean_value']>800 else 0)     

In [56]:
jeopardy['high_value'] = jeopardy.apply(low_high, axis=1)

In [60]:
def counts(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        splitw = row['clean_question'].split(" ")
        if word in splitw:
            if row['high_value'] == 1: # if this word is in this question and question is of high value, then increase high_count
                high_count += 1
            else: 
                low_count += 1
    return high_count, low_count

In [63]:
observed_expected = []
comparison_terms = list(terms_used)[:5]
for i in comparison_terms:
    observed_expected.append(counts(i))

In [64]:
observed_expected

[(0, 1), (2, 12), (1, 0), (0, 1), (0, 1)]

In [65]:
comparison_terms

['orphanage', 'reported', 'yggdrasil', 'beatty', 'jesters']

In [68]:
high_values = jeopardy[jeopardy['high_value']==1]['high_value'].sum() # how many questions are high valued

In [74]:
low_values = len(jeopardy[jeopardy['high_value']==0]['high_value']) # how many questions are low valued

In [77]:
from scipy.stats import chisquare
chi_squared = []
for i in observed_expected: # a list of 5 elements each has 1x2 shape 
    total = sum(i)
    total_prop = total/jeopardy.shape[0] 
    exp_high = total_prop*high_values
    exp_low = total_prop*low_values
    observed = np.array([i[0], i[1]])
    expected = np.array([exp_high, exp_low])
    
    chi_squared.append(chisquare(observed, expected))

In [79]:
chi_squared

[Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378),
 Power_divergenceResult(statistic=0.838195592262166, pvalue=0.3599133427437923),
 Power_divergenceResult(statistic=3.022325020112631, pvalue=0.08212564786568953),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378)]