In [1]:
import pandas as pd, re
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
jeopardy = pd.read_csv('src/jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
cols_clean = [x.strip(' ') for x in jeopardy.columns]
print(cols_clean)

['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']


In [5]:
jeopardy.columns = cols_clean

In [6]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
Air Date       19999 non-null object
Round          19999 non-null object
Category       19999 non-null object
Value          19999 non-null object
Question       19999 non-null object
Answer         19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [8]:
# Remove punctuation and lower case
def str_normalise(str_):
    return re.sub(r'[^\w\s]', '', str_).lower()

In [9]:
jeopardy['clean_question'] = jeopardy['Question'].apply(str_normalise)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(str_normalise)
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [9]:
def dollar_norm(str_):
    try:
        return int(re.sub(r'[^\w\s]', '', str_))
    except ValueError:
        return 0

In [10]:
jeopardy['clean_value'] = jeopardy['Value'].apply(dollar_norm)
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


In [11]:
def word_analyser(row):
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    else:
        for i in split_answer:
            if i in split_question:
                match_count += 1
    return match_count/len(split_answer)

In [12]:
jeopardy['answer_in_question'] = jeopardy.apply(word_analyser, axis=1)

In [13]:
mean_answer_in_question = jeopardy.answer_in_question.mean()
print(mean_answer_in_question)

0.05900196524977763


It looks like there is a 6% recycling rate.

In [14]:
# Sorting by Air Date
jeopardy.sort_values(by = ['Air Date'], axis=0, inplace=True)

In [15]:
question_overlap = []
terms_used = set()

for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split()
    split_question = [i for i in split_question if len(i) < 6]
    match_count = 0
    for w in split_question:
        if w in terms_used:
            match_count += 1
        terms_used.add(w)
    if len(split_question) > 0:
        question_overlap.append(match_count/len(split_question))
    else:
        question_overlap.append(0)

In [16]:
print(jeopardy.shape)

(19999, 11)


In [17]:
print(len(question_overlap))

19999


In [18]:
jeopardy['question_overlap'] = question_overlap

In [19]:
print(jeopardy.question_overlap.mean())

0.9371340767458503


In [20]:
def is_high(row):
    if row['clean_value'] > 800:
        return 1
    else:
        return 0

In [21]:
jeopardy['high_value'] = jeopardy.apply(is_high, axis=1)

In [22]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question,question_overlap,high_value
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0.0,0.111111,0
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,$200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200,0.0,0.0,0
19302,10,1984-09-21,Double Jeopardy!,1789,$200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200,0.0,0.0,0
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200,0.0,0.125,0
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,$200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200,0.0,0.333333,0


In [31]:
def check_value(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [32]:
observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[:30]
print(comparison_terms)

['ufos', '3830', 'stipe', 'ours', 'huti', 'disco', 'roark', 'mansa', 'xrays', 'v05', 'ayn', 'elon', 'hbos', 'pogo', 'neigh', 'f117a', 'floor', 'javan', 'shops', 'ladd', 'toure', 'hunk', 'dale', '2844', 'aiken', 'pois', 'ok', 'corum', 'took', '29']


In [33]:
for t in comparison_terms:
    observed_expected.append(check_value(t))
    
print(observed_expected)

[(0, 1), (1, 0), (1, 1), (1, 4), (0, 1), (0, 1), (1, 0), (0, 1), (2, 0), (1, 0), (1, 0), (1, 0), (1, 1), (0, 1), (0, 1), (0, 1), (3, 15), (1, 0), (0, 5), (0, 2), (1, 0), (0, 2), (1, 0), (2, 0), (0, 1), (0, 1), (1, 4), (0, 1), (38, 82), (7, 14)]


In [34]:
total_rows = jeopardy.shape[0]
high_value_count = jeopardy.high_value.sum()
low_value_count = total_rows - high_value_count
print(high_value_count)
print(low_value_count)

5734
14265


In [35]:
from scipy.stats import chisquare

chi_squared = []
for l in observed_expected:
    total = sum(l)
    total_prop = total/total_rows
    exp_list = []
    exp_list.append(total_prop * high_value_count)
    exp_list.append(total_prop * low_value_count)
    chi_squared.append(chisquare(l, exp_list))

In [38]:
for rown, row in enumerate(chi_squared):
    print(comparison_terms[rown])
    print(row)

ufos
Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)
3830
Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047)
stipe
Power_divergenceResult(statistic=0.4448774816612795, pvalue=0.5047776487545996)
ours
Power_divergenceResult(statistic=0.18383953104516373, pvalue=0.6680941623250602)
huti
Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)
disco
Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)
roark
Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047)
mansa
Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)
xrays
Power_divergenceResult(statistic=4.97558423439135, pvalue=0.025707519787911092)
v05
Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047)
ayn
Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047)
elon
Power_divergenceResult(statistic=2.487792117195675, pvalue=0