# This is Jeopardy!

In [1]:
import pandas as pd
import random
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the raw clue data. The path is relative, so jeopardy.csv must be in the
# notebook's working directory.
df = pd.read_csv('jeopardy.csv')

In [3]:
# Print the schema summary first and leave head() as the cell's final
# expression: a bare expression that is not the last line of a cell is
# evaluated and then discarded, so the preview was never rendered before.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  int64 
 1    Air Date    216930 non-null  object
 2    Round       216930 non-null  object
 3    Category    216930 non-null  object
 4    Value       216930 non-null  object
 5    Question    216930 non-null  object
 6    Answer      216928 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [4]:
#Rename Columns
# The assignment is positional: the seven names must stay in the same order as
# the columns of the CSV. It also replaces the raw headers, several of which
# appear to carry a leading space in the df.info() listing.
df.columns = ['show_number', 'date', 'round','category', 'value', 'question', 'answer']

In [5]:
#Clean value column
# Only the currency formatting ('$' and thousands separators) is stripped.
# regex=True is spelled out because pandas 2.0 changed the default of
# Series.str.replace to a literal match, under which a character-class pattern
# removes nothing and to_numeric then fails on values such as '$200'.
# Non-numeric placeholders (the previous pattern suggests the string 'None')
# are turned into NaN by errors='coerce' instead of relying on a character
# class that happened to delete the letters N, o, n and e.
# assign() builds a new frame, so no value is written into the dropna() result.
df = df.dropna(subset=['value']).assign(
    value=lambda frame: pd.to_numeric(
        frame['value'].str.replace(r'[$,]', '', regex=True),
        errors='coerce',
    )
)

In [6]:
def filterQuestion(filterList):
    '''Return the rows of the module-level ``df`` whose question contains every keyword.

    Each entry of ``filterList`` is converted with ``str`` and lower-cased. A
    question matches when every keyword equals one of the tokens obtained by
    lower-casing the question and splitting it on single spaces. Matching is
    therefore case-insensitive and whole-token only: punctuation attached to a
    word (for example "king,") stays part of the token and does not match
    "king". An empty ``filterList`` selects every row.
    '''
    keywords = tuple(str(word).lower() for word in filterList)

    def has_all_keywords(text):
        # Reject the question as soon as one keyword is missing from its tokens.
        for word in keywords:
            if word not in text.lower().split(' '):
                return False
        return True

    mask = df.question.apply(has_all_keywords)
    return df[mask]

In [7]:
def find_avg_difficulty(*filterList):
    '''Mean ``value`` of the questions that contain every given keyword.

    The keywords are passed positionally and forwarded as a tuple to
    ``filterQuestion``; the mean is taken over the ``value`` column of the
    rows it returns.
    '''
    matching_values = filterQuestion(filterList)['value']
    return matching_values.mean()

In [8]:
# Average value of the clues whose question contains the token "king" (matching is case-insensitive)
find_avg_difficulty('King')

818.8878000979912

In [9]:
def find_most_common(*filterList):
    '''Rank the answers of the questions that contain every given keyword.

    The candidate answers are the distinct answers of the rows returned by
    ``filterQuestion``. Each candidate is mapped to the number of rows in the
    whole module-level ``df`` that have exactly that answer (not only the
    keyword matches).

    Returns a dict ordered by descending count; candidates with equal counts
    keep the order in which they first appear among the matching questions.
    A missing (NaN) answer, if present among the candidates, is reported with
    a count of 0.
    '''
    candidates = filterQuestion(filterList)['answer'].drop_duplicates()
    # Tally every answer in one pass instead of rescanning the full dataset
    # once per candidate answer.
    totals = df['answer'].value_counts().to_dict()
    freqs = {answer: totals.get(answer, 0) for answer in candidates}
    # sorted() is stable, also with reverse=True, so ties keep candidate order.
    return dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))

In [10]:
#find_most_common('King')

In [11]:
def filter_by_year(year, df):
    '''Return the rows of ``df`` whose ``date`` string begins with ``year``.

    ``year`` is converted with ``str`` and used as a regular-expression prefix
    anchored at the start of each date value.
    '''
    year_prefix = '^' + str(year)
    starts_with_year = df['date'].str.match(year_prefix)
    return df[starts_with_year]

In [12]:
def find_usage(year, filter_list):
    '''Count the questions that contain every word in ``filter_list`` and are dated in ``year``.

    The keyword filter (``filterQuestion``) is applied first and its result is
    narrowed by ``filter_by_year``; the number of remaining rows is returned.
    '''
    keyword_matches = filterQuestion(filter_list)
    return len(filter_by_year(year, keyword_matches))

In [13]:
def compare_usage_by_year(*filter_list):
    '''Count, per decade, the questions that contain every given keyword.

    Only the years 1984 through 2012 are counted; questions dated outside that
    span are ignored. The buckets are '80s' (1984-1989), '90s' (1990-1999),
    '00s' (2000-2009) and '10s' (2010-2012).

    Returns a DataFrame with one row per decade and the columns ``decade`` and
    ``usage``.
    '''
    # The keyword filter scans every question in the dataset and does not
    # depend on the year, so run it once rather than once per year.
    matches = filterQuestion(filter_list)
    decade_years = (
        ('80s', range(1984, 1990)),
        ('90s', range(1990, 2000)),
        ('00s', range(2000, 2010)),
        ('10s', range(2010, 2013)),
    )
    usages = [
        sum(filter_by_year(year, matches).shape[0] for year in years)
        for _, years in decade_years
    ]
    return pd.DataFrame({'decade': [label for label, _ in decade_years],
                         'usage': usages})

In [14]:
# Per-decade counts of the questions that contain both "king" and "england" as tokens
compare_usage_by_year('King', 'England')

Unnamed: 0,decade,usage
0,80s,2
1,90s,28
2,00s,38
3,10s,6


In [15]:
def compare_by_round(*filter_list):
    '''Count the questions containing every given keyword in each of the two main rounds.

    Returns a DataFrame with the columns ``round`` and ``usage``: the row
    labelled 'single' counts matches whose round is 'Jeopardy!' and the row
    labelled 'double' counts matches whose round is 'Double Jeopardy!'.
    Matches from any other round are not reported.
    '''
    rounds = filterQuestion(filter_list)['round']
    round_labels = (('single', 'Jeopardy!'), ('double', 'Double Jeopardy!'))
    return pd.DataFrame({
        'round': [label for label, _ in round_labels],
        'usage': [int((rounds == round_name).sum()) for _, round_name in round_labels],
    })

In [16]:
# Single- vs. Double-Jeopardy counts of the questions that contain the token "literature"
compare_by_round('Literature')

Unnamed: 0,round,usage
0,single,42
1,double,68


In [21]:
def ask_question():
    '''Play one interactive quiz round on the console.

    Optionally collects keywords from the user, picks a random question that
    contains all of them (via ``filterQuestion``), prints it, reads the user's
    answer and reports whether it matches the stored answer. The comparison
    ignores letter case and surrounding whitespace. Prints a hint instead of a
    question when no usable question matches the keywords. Returns None.
    '''
    choice = input("Filter by keywords? (y/n) ")
    filter_list = ()
    # Accept 'Y' or ' y ' as well as a bare 'y'.
    while choice.strip().lower() == 'y':
        keyword = input("Enter keyword: ").strip()
        # A blank keyword would only match questions that happen to contain
        # consecutive spaces, so it is ignored rather than used as a filter.
        if keyword:
            filter_list += (keyword, )
        choice = input("Add more keywords? (y/n) ")
    # Rows without an answer cannot be scored; calling .lower() on the
    # missing value would raise, so they are excluded before sampling.
    df1 = filterQuestion(filter_list).dropna(subset=['answer'])
    if df1.shape[0] == 0:
        print("Too many keywords. Enter fewer")
    else:
        # Look the sampled row up once and reuse it for question and answer.
        row = df1.iloc[random.randrange(0, df1.shape[0])]
        correct = str(row['answer'])
        print("Question:")
        print(row['question'])
        print("Answer: (in lowercase)")
        answer = input()
        if answer.strip().lower() == correct.strip().lower():
            print('Correct')
        else:
            print('Incorrect. The correct answer is\n' + correct)

In [23]:
# Play one interactive round: prompts for optional keywords, then for an answer
ask_question()

Filter by keywords? (y/n) y
Enter keyword: Pakistan
Add more keywords? (y/n) n
Question:
...this 86,000-square mile region disputed by India & Pakistan that also extends into China
Answer: (in lowercase)
kashmir
Correct
