# This is Jeopardy!

In [109]:
import pandas as pd
import random
pd.set_option('display.max_colwidth', None)

In [2]:
df = pd.read_csv('jeopardy.csv')

In [3]:
df.head()
df.info()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,"No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves",Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,"The city of Yuma in this state has a record average of 4,055 hours of sunshine each year",Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", this company served its billionth burger",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States",John Adams


In [98]:
#Rename Columns
df.columns = ['show_number', 'date', 'round','category', 'value', 'question', 'answer']

Unnamed: 0,show_number,date,round,category,value,question,answer
0,4680,2004-12-31,Jeopardy!,HISTORY,200.0,"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",Copernicus


In [6]:
#Clean value column
df = df.dropna(subset=['value'])
df['value'] = df.value.str.replace('[$,(None)]', '')
df['value'] = pd.to_numeric(df.value)

In [77]:
def filterQuestion(filterList):
    '''Filters the dataset for questions that contains all of the words in a list of words'''
    lower = lambda string : str(string).lower()
    filterList = tuple(map(lower, filterList))
    filter_by_question = lambda col : all([ele.lower() in col.lower().split(' ') for ele in filterList])
    return df[df.question.apply(filter_by_question)]

In [75]:
def find_avg_difficulty(*filterList):
    '''find average difficulty of a type of question'''
    df1 = filterQuestion(filterList)
    return df1.value.mean()

In [114]:
find_avg_difficulty('King')

818.8878000979912

In [12]:
def find_most_common(*filterList):
    '''find most common answers to questions containing words in filterList'''
    unique_answers = filterQuestion(filterList).drop_duplicates(subset=['answer'])['answer']
    freqs = {val : df[df.answer == val].answer.count() for val in unique_answers}
    return {k: v for k, v in sorted(freqs.items(), key=lambda item: item[1], reverse=True)}

In [13]:
find_most_common('King')

{'China': 216,
 'Australia': 215,
 'France': 193,
 'India': 185,
 'Canada': 176,
 'Spain': 171,
 'Alaska': 161,
 'Italy': 160,
 'Hawaii': 157,
 'Ireland': 136,
 'Brazil': 133,
 'London': 132,
 'Sweden': 130,
 'George Washington': 128,
 'Greece': 125,
 'Ronald Reagan': 123,
 'Egypt': 122,
 'Georgia': 118,
 'Norway': 111,
 'Denmark': 109,
 'Virginia': 107,
 'Napoleon': 106,
 'Louisiana': 101,
 'Scotland': 101,
 'Rome': 100,
 'Poland': 95,
 'Cleopatra': 95,
 'Africa': 94,
 'Maine': 94,
 'Antarctica': 93,
 'Venice': 93,
 'Hamlet': 89,
 'Pennsylvania': 89,
 'Henry VIII': 85,
 'Belgium': 84,
 'Portugal': 81,
 'Elvis Presley': 80,
 'England': 74,
 'Macbeth': 74,
 'Athens': 71,
 'Hungary': 70,
 'Jerusalem': 70,
 'Alexander the Great': 69,
 'David': 68,
 '4': 66,
 'Rhode Island': 66,
 'the Philippines': 65,
 'Solomon': 61,
 'Morocco': 61,
 'Gerald Ford': 60,
 'World War I': 60,
 'Maryland': 59,
 'Washington, D.C.': 58,
 'Queen Victoria': 57,
 'Islam': 57,
 'King Lear': 55,
 'Victoria': 55,
 'Ri

In [78]:
def filter_by_year(year, df):
    return df[df.date.str.match('^' + str(year))]

In [83]:
def find_usage(year, filter_list):
    df1 = filter_by_year(year, filterQuestion(filter_list))
    return df1.shape[0]

In [89]:
def compare_usage_by_year(*filter_list):
    '''Returns a dataframe containing the usage of words in questions present
    in filter_list between 1980s to 2010s '''
    usages = [0, 0, 0, 0]
    for i in range(1984, 2013):
        if i < 1990:
            usages[0] += find_usage(i, filter_list)
        elif i < 2000:
            usages[1] += find_usage(i, filter_list)
        elif i < 2010:
            usages[2] += find_usage(i, filter_list)
        else:
            usages[3] += find_usage(i, filter_list)
    return pd.DataFrame({'decade': ['80s', '90s', '00s', '10s'], 
                        'usage': usages})

In [90]:
compare_usage_by_year('King', 'England')

Unnamed: 0,decade,usage
0,80s,2
1,90s,28
2,00s,38
3,10s,6


In [95]:
def compare_by_round(*filter_list):
    df1 = filterQuestion(filter_list)
    return pd.DataFrame({'round':['single', 'double'],
                         'usage': [df1[df1['round'] == 'Jeopardy!'].shape[0],
                                   df1[df1['round'] == 'Double Jeopardy!'].shape[0]]})

In [99]:
compare_by_round('Literature')

Unnamed: 0,round,usage
0,single,42
1,double,68


In [112]:
def ask_question():
    choice = input("Filter by keywords? (y/n) ")
    filter_list = ()
    while choice == 'y':
        filter_list += (input("Enter keyword: "), )
        choice = input("Add more keywords? (y/n) ")
    if len(filter_list) == 0:
        print("Too many keywords. Enter fewer")
    df1 = filterQuestion(filter_list)
    loc = random.randrange(0, df1.shape[0])
    print("Question:")
    question = df1.iloc[loc]['question']
    print(question)
    print("Answer: (in lowercase)")
    answer = input()
    if answer == df1.iloc[loc]['answer']:
        print('Correct')
    else:
        print('Incorrect. The correct answer is\n' + df1.iloc[loc]['answer'])

In [113]:
ask_question()

Filter by keywords? (y/n) y
Enter keyword: England
Add more keywords? (y/n) n
Question:
This future president was the first U.S. ambassador to England in 1785
Answer: (in lowercase)
idk
Incorrect. The correct answer is
John Adams
