## Winning Jeopardy

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
jeopardy = pd.read_csv('../Datasets/jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.iloc[4, 5]

'Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States'

In [4]:
jeopardy.columns = [col.replace(' ', '') for col in jeopardy.columns]
jeopardy.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

## Normalizing Columns

In [5]:
import re

def normalize_text(text):
    text = text.lower()
    text = re.sub(r"\W", ' ', text)
    text = re.sub(r"\s+", " ", text)
    return text

In [6]:
normalize_text('Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States')

'signer of the dec of indep framer of the constitution of mass second president of the united states'

In [7]:
def normalize_text(text):
    text = text.lower()
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

def normalize_values(text):
    text = str(text)
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except ValueError:
        text = 0
    return text

In [8]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_values)

In [9]:
jeopardy['clean_value'].unique()

array([  200,   400,   600,   800,  2000,  1000,  1200,  1600,  3200,
           0,  5000,   100,   300,   500,  1500,  4800,  1800,  1100,
        2200,  3400,  3000,  4000,  6800,  1900,  3100,   700,  1400,
        2800,  8000,  6000,  2400, 12000,  3800,  2500,  6200, 10000,
        7000,  1492,  7400,  1300,  7200,  2600,  3300,  5400,  4500,
        2100,   900,  3600,  2127,   367,  4400,  3500,  2900,  3900,
        4100,  4600, 10800,  2300,  5600,  1111,  8200,  5800,   750,
        7500,  1700,  9000,  6100,  1020,  4700,  2021,  5200,  3389],
      dtype=int64)

In [10]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy['AirDate'])

In [11]:
# x = jeopardy['Value'].str.replace('[$,]', '', regex=True)

In [12]:
# pd.to_numeric(x).fillna(0).unique()

In [13]:
def count_matches(row):
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()
    match_count = 0

    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count / len(split_answer)


In [14]:
jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)

In [15]:
jeopardy['answer_in_question'].mean()

0.05900196524977763

In [16]:
question_overlap = []
terms_used = set()
jeopardy.sort_values('Air Date', inplace=True)

for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split()
    split_question = [word for word in split_question if len(word) > 5]
    match_count = 0

    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.6894031359073245

## Low Value vs. High Value Questions

There is about a `70%` overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases — it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

In [17]:
def determine_value(row):
    value = 0
    if row['clean_value'] > 800:
        value = 1
    return value

jeopardy['high_value'] = jeopardy.apply(determine_value, axis=1)

In [18]:
def count_usage(term):
    low_count = 0
    high_count = 0

    for index, row in jeopardy.iterrows():
        split_question = row['clean_question'].split()
        if term in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1

    return high_count, low_count
        
    

In [19]:
from random import choice, randint
comparison_terms = [choice(list(terms_used)) for _ in range(10)]
observed_expected = []

for term in comparison_terms:
    observed_expected.append(count_usage(term))

In [20]:
observed_expected

[(1, 0),
 (10, 15),
 (1, 0),
 (0, 1),
 (1, 0),
 (0, 1),
 (0, 2),
 (0, 1),
 (0, 1),
 (0, 1)]

In [21]:
from scipy.stats import chisquare

high_value_count = (jeopardy['high_value'] == 1).sum()
low_value_count = (jeopardy['high_value'] == 0).sum()

chi_squared = []

for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    expected_high_count = total_prop * high_value_count
    expected_low_count = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([expected_high_count, expected_low_count])
    chi_squared.append(chisquare(observed, expected))
    
    

In [22]:
chi_squared

[Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=1.568834083924655, pvalue=0.21037641379142114),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)]

## Chi-Squared Results

None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.