In [30]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import string

# Loading data:

In [2]:
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [4]:
jeopardy.columns = jeopardy.columns.str.strip(" ")

# Normalizing data:

In [5]:
def normalize(s):
    l =  s.lower()
    for i in string.punctuation:
        l = l.replace(i, '')
    return l.strip(' ')

jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)

In [6]:
def norm_dollar(s):
    for i in string.punctuation:
        s = s.replace(i, '')
    try:
        num = int(s)
    except Exception:
        num = 0
    return num
    
jeopardy["clean_value"] = jeopardy["Value"].apply(norm_dollar)

In [7]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

# Analizeing data:
    How often the answer is deducible from the question.
    How often new questions are repeats of older questions.


In [8]:
def match(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    for w in split_answer:
        if w in split_question:
            match_count += 1
    if len(split_answer) != 0:
        return float(match_count) / len(split_answer)
    else:
        return 0
jeopardy["answer_in_question"] = jeopardy.apply(match, axis=1)

In [9]:
jeopardy["answer_in_question"].mean()

0.06036527447973025

The questions and answers don't seem to have much words in common. So there is little chance to find the answer by lokking at the words of questions.

In [10]:
jeopardy.sort_values(by="Air Date", ascending=True, inplace=True)

In [11]:
question_overlap = []
terms_used = set()
for i in jeopardy.index:
    row = jeopardy.loc[i]
    split_question = row["clean_question"].split(" ")
    long_terms = [w for w in split_question if len(w)>=6]
    match_count = 0
    for w in long_terms:
        if w in terms_used:
            match_count += 1
        terms_used.add(w)
    if len(split_question) > 0:
        question_overlap.append(match_count / len(split_question))
    else:
        question_overlap.append(0)
jeopardy["question_overlap"] = pd.Series(question_overlap,index=jeopardy.index)

In [12]:
jeopardy["question_overlap"].mean()

0.21632769252362086

The overlap value increases over time, as the number of question accumulates. But we can not infer the high overlap comes from questions being recycled. Further analysis is needed.

In [13]:
def hi_low(row):
    if row["clean_value"] > 800:
        return 1
    else:
        return 0
    
jeopardy["high_value"] = jeopardy.apply(hi_low, axis=1)

In [20]:
def hl_count(word):
    low_count = 0
    high_count = 0
    for _, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[:5]
for term in comparison_terms:
    observed_expected.append(hl_count(term))

In [21]:
observed_expected

[(0, 2), (0, 1), (1, 0), (1, 5), (0, 3)]

In [26]:
high_value_count = np.sum(jeopardy["high_value"] == 1)
low_value_count = np.sum(jeopardy["high_value"] == 0)

In [36]:
chi_squared = []
p_value = []
for h, l in observed_expected:
    total = h + l
    total_prop = total / jeopardy.shape[0]
    expected_high = high_value_count * total_prop
    expected_low = low_value_count * total_prop
    chi2, p = stats.chisquare([expected_high, expected_low], [h, l])
    chi_squared.append(chi2)
    p_value.append(p)

In [37]:
print(chi_squared)
print(p_value)

[inf, inf, inf, 0.62257433087665182, inf]
[0.0, 0.0, 0.0, 0.43009225421889385, 0.0]


None of them are valid statistically significant results. Maybe we should computer more values.