In [1]:
import pandas as pd

In [2]:
jeopardy = pd.read_csv("jeopardy.csv")

In [4]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [8]:
jeopardy.columns = jeopardy.columns.str.strip()

In [10]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [14]:
jeopardy = jeopardy.sort_values("Air Date")

In [23]:
jeopardy["Value"] = jeopardy["Value"].str.replace(r"[$,]", "", regex=True).fillna(0).astype(float)

In [27]:
jeopardy["Clean Question"] = jeopardy.Question.str.lower().str.replace(r"[\W\s+]"," ")
jeopardy["Clean Answer"] = jeopardy.Answer.str.lower().str.replace(r"[\W\s+]"," ")

In [28]:
jeopardy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,Clean Question,Clean Answer
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,0.0,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,"adventurous 26th president, he was 1st to ride...",theodore roosevelt
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,200.0,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since '75,jimmy hoffa
19302,10,1984-09-21,Double Jeopardy!,1789,200.0,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,"washington proclaimed nov. 26, 1789 this first...",thanksgiving
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,200.0,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe' & the colorado river dug thi...,the grand canyon
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,200.0,"Depending on the book, he could be a ""Jones"", ...",Tom,"depending on the book, he could be a ""jones"", ...",tom
...,...,...,...,...,...,...,...,...,...
1953,6294,2012-01-19,Double Jeopardy!,WEAPONS OF WORLD WAR II,800.0,"Ships in the U.S. Navy's Casablanca class of ""...",aircraft carriers,"ships in the u.s. navy's casablanca class of ""...",aircraft carriers
1954,6294,2012-01-19,Double Jeopardy!,ACTING PRESIDENTS ON TV,800.0,Dennis Haysbert & D.B. Woodside as David & Way...,24,dennis haysbert & d.b. woodside as david & way...,24
1955,6294,2012-01-19,Double Jeopardy!,4 N,800.0,"""U"" know it means not deliberate; I'm sorry, t...",unintentional,"""u"" know it means not deliberate; i'm sorry, t...",unintentional
1945,6294,2012-01-19,Double Jeopardy!,AMERICAN HISTORY,400.0,In December 1974 this former New York governor...,Rockefeller,in december 1974 this former new york governor...,rockefeller


# Answer in Question

In [30]:
def count_matches(row):
    splitted_question = row["Clean Question"].split()
    splitted_answer = row["Clean Answer"].split()
    
    if "the" in splitted_answer:
        splitted_answer.remove("the")
    
    if len(splitted_answer) == 0:
        return 0
    
    count_match = 0
    for word in splitted_answer:
        if word in splitted_question:
            count_match += 1
            
    return count_match / len(splitted_answer)      

In [35]:
jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis = 1)

In [36]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,Clean Question,Clean Answer,answer_in_question
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,0.0,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,"adventurous 26th president, he was 1st to ride...",theodore roosevelt,0.0
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,200.0,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since '75,jimmy hoffa,0.0
19302,10,1984-09-21,Double Jeopardy!,1789,200.0,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,"washington proclaimed nov. 26, 1789 this first...",thanksgiving,0.0
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,200.0,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe' & the colorado river dug thi...,the grand canyon,0.0
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,200.0,"Depending on the book, he could be a ""Jones"", ...",Tom,"depending on the book, he could be a ""jones"", ...",tom,0.0


In [38]:
jeopardy["answer_in_question"].mean()*100

4.552247259382615

On average, the answer only makes up for about `4.55%` of the question. This isn't a huge number, and it means that we probably can't just hope that hearing a question will enable us to determine the answer. We'll probably have to study.

# Recycled Questions/Question Overlapped

In [41]:
question_overlap_percentage = []
term_used = set()

for i,row in jeopardy.iterrows():
    split_question = row["Clean Question"].split()
    split_question = [word for word in split_question if len(split_question)>5]
    
    match_count = 0
    for word in split_question:
        if word in term_used:
            match_count += 1
    
    for word in split_question:
        term_used.add(word)
        
    if len(split_question) > 0:
        match_count /= len(split_question)
        
    question_overlap_percentage.append(match_count)      

In [42]:
jeopardy["question_overlap"] = question_overlap_percentage

In [44]:
jeopardy["question_overlap"].mean()*100

77.52177907641111

# Low Value vs. High Value Questions

In [47]:
jeopardy["high_value"] = jeopardy["Value"].apply(lambda x: 0 if x < 800 else 1)

In [48]:
def count_usage(term):
    low_count = 0
    high_count = 0
    
    for i, row in jeopardy.iterrows():
        if term in row["Clean_question"].split():
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return low_count,high_count

In [50]:
from random import choice

terms_used_list = list(term_used)
comparison_terms = [choice(terms_used_list) for _ in range(10)]

observed_expected = []

for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

KeyError: 'clean_question'