In [1]:
import pandas as pd

jeopardy = pd.read_csv("jeopardy.csv")

jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [3]:
jeopardy.columns = [column.replace(" ", "") for column in jeopardy.columns]

In [4]:
jeopardy.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [5]:
import re
def normalize_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    return text

def normalize_value(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

In [6]:
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_value)
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)

In [7]:
jeopardy.head(5)

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_value,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,200,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,200,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,200,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,200,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,200,signer of the dec of indep framer of the const...,john adams


In [9]:
import pandas
jeopardy["Air Date"] = pandas.to_datetime(jeopardy["AirDate"])

In [10]:
jeopardy.dtypes

ShowNumber                 int64
AirDate                   object
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_value                int64
clean_question            object
clean_answer              object
Air Date          datetime64[ns]
dtype: object

In [12]:
def match_count(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(match_count, axis=1)

In [14]:
mean_of_match = jeopardy["answer_in_question"].mean()
mean_of_match

0.060493257069335872

In [21]:
question_overlap = []
terms_used = set()

for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [item for item in split_question if len(item) > 5]
    match_count = 0
    for term in split_question:
        if term in terms_used:
            match_count += 1
        else: 
            terms_used.add(term)
    if len(split_question) > 0:
        rate = match_count / len(split_question)
    question_overlap.append(rate)
            
jeopardy["question_overlap"] = question_overlap   

In [22]:
jeopardy["question_overlap"].mean()

0.7072847895011759

In [23]:
def value_evaluate(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(value_evaluate, axis = 1)

In [24]:
def count(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if term in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return low_count, high_count

observed_expected = []
comparison_terms = list(terms_used)[:5]

for term in comparison_terms:
    observed_expected.append(count(term))

In [26]:
observed_expected

[(1, 0), (0, 1), (21, 9), (1, 0), (0, 1)]

In [27]:
comparison_terms

['cataria', 'covent', 'screen', 'conventions', 'diligently']

In [29]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
lower_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []

for item in observed_expected:
    total = sum(item)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    lower_value_exp = total_prop * lower_value_count
    
    observed = np.array([item[0], item[1]])
    expected = np.array([high_value_exp, lower_value_exp])
    
    chi_squared.append(chisquare(observed, expected))
    
chi_squared

[Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=25.055843807319008, pvalue=5.5693709635218382e-07),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]