#### The goal of this project is to figure out some patterns in the questions that could help winning based on a dataset of Jeopardy questions.

In [22]:
import pandas as pd
import numpy as np
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy = jeopardy[:20000]
jeopardy.head(3)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona


In [23]:
jeopardy.columns = jeopardy.columns.str.strip()
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

Normalizing text

In [25]:
import string
def norm(x):
    """ 
    Converting the string to lowercase. Remove all punctuation in the string 
    so that Don't and don't aren't considered to be different words.
    """
    x = x.str.lower()
    x = [''.join(c for c in s if c not in string.punctuation) for s in x]
    x = [s for s in x if s]
    return x

jeopardy['clean_question'] = norm(jeopardy['Question'])
jeopardy['clean_answer'] = norm(jeopardy['Answer'])
jeopardy[['clean_question', 'clean_answer']].head(3)

Unnamed: 0,clean_question,clean_answer
0,for the last 8 years of his life galileo was u...,copernicus
1,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,the city of yuma in this state has a record av...,arizona


Normalizing columns

In [26]:
# normalize dollar values by Removing punctuations in the string and Converting the string to an integer.  
# If the conversion has an error, assign 0.
def conv_str(ins):
    outs = ins.replace('$','').replace(',', '')
    if outs == 'None':
        return 0 
    else:
        outs = int(outs)
        return outs
jeopardy['clean_value'] = jeopardy['Value'].apply(conv_str)
jeopardy[['clean_question', 'clean_answer', 'clean_value']].head(3)

Unnamed: 0,clean_question,clean_answer,clean_value
0,for the last 8 years of his life galileo was u...,copernicus,200
1,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,the city of yuma in this state has a record av...,arizona,200


In [27]:
# converting the Air Date column to a datetime column
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy[['Air Date', 'clean_question', 'clean_answer', 'clean_value']].head(3)

Unnamed: 0,Air Date,clean_question,clean_answer,clean_value
0,2004-12-31,for the last 8 years of his life galileo was u...,copernicus,200
1,2004-12-31,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,2004-12-31,the city of yuma in this state has a record av...,arizona,200


In order to figure out whether to study past questions, study general knowledge, or not study it all, it would be helpful to figure out,
    1. How often the answer is deducible from the question.
    2. How often new questions are repeats of older questions.


In [28]:
# split quetion and answer and remove 'the' since it is commonly found in answers and questions but not useful answers
# finding out how many words in answers are also found in quetions
def sp_col(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    match_count = 0
   
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    else:
        for item in split_answer:
            if item in split_question:
                match_count += 1
        return match_count/len(split_answer)
    
jeopardy['answer_in_question'] = jeopardy.apply(sp_col, axis = 1)
jeopardy['answer_in_question'].mean()

0.060349756216006266

The answer only appears in the question about 6% of the time. Hearing a question wont't help figuring out the answer.

In [29]:
# finding out how often new questions are repeats of older ones
question_overlap = []
terms_used = set()
for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    templs = []
    for word in split_question:
        if len(word) >= 6:
            templs.append(word)
    split_question = templs
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.6919565346637286

nearly 70% words were reused in quetions

#### which terms correspond to high-value questions?

In [None]:
# create high_value column
def val(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value

jeopardy['high_value'] = jeopardy.apply(val, axis=1)

In [32]:
# Determine high and low values for questions 

def high_low_counts(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_row = row['clean_question'].split(' ')
        if word in split_row:
            if row['high_value']==1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []   
comparison_terms = list(terms_used)[:5]
for item in comparison_terms:
    ls = high_low_counts(item)
    observed_expected.append(ls)
print(observed_expected)

[(1, 0), (0, 1), (0, 1), (0, 1), (1, 2)]


In [33]:
high_value_count = jeopardy[jeopardy['high_value']==1].shape[0]
low_value_count = jeopardy[jeopardy['high_value']==0].shape[0]
print(high_value_count, low_value_count)

5734 14266


In [35]:
# computing the chi-squared value and p-value given the expected and observed counts.
chi_squared =[]

from scipy.stats import chisquare
import numpy as np

for item in observed_expected:
    total = item[0]+item[1]
    total_prop = total / jeopardy.shape[0]
    high_counts = total_prop * high_value_count
    low_counts = total_prop * low_value_count
    
    observed = np.array([item[0], item[1]])
    expected = np.array([high_counts, low_counts])
    chi_square = chisquare(observed, expected)
    chi_squared.append(chi_square)
chi_squared

[Power_divergenceResult(statistic=2.4879665155214514, pvalue=0.11471986177699109),
 Power_divergenceResult(statistic=0.4019346698443852, pvalue=0.5260918005187468),
 Power_divergenceResult(statistic=0.4019346698443852, pvalue=0.5260918005187468),
 Power_divergenceResult(statistic=0.4019346698443852, pvalue=0.5260918005187468),
 Power_divergenceResult(statistic=0.03190173163299733, pvalue=0.8582435032724245)]

None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5.   This test is invalid since the observed or expected frequencies in each category are too small, should be at least 5.