In [20]:
import pandas as pd
import matplotlib as plt
import numpy as np
import re

In [21]:
dataset = pd.read_csv("jeopardy.csv")
dataset.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [22]:
print(dataset.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [23]:
name_list = list()
old_list = list()

for i in dataset.columns:
    old_list.append(i)
    if i.startswith(" "):
        name_list.append(i.replace(" ", "", 1))
    else:
        name_list.append(i)

new_names = dict(zip(old_list, name_list))
dataset.rename(index=str, columns=new_names, inplace=True)
print(old_list)
print(dataset.columns)

['Show Number', ' Air Date', ' Round', ' Category', ' Value', ' Question', ' Answer']
Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [24]:
def norm_text(text):
    pattern = re.compile(r"[^A-Za-z0-9\s]")
    text = text.split()
    text = [i.lower() for i in text]
    text = [re.sub(pattern,"", i) for i in text]
    text = " ".join(text)
    return text
test = "No. 2: 1912 Olympian; football star at"
hi = norm_text(test)
dataset['clean_question'] = dataset['Question'].apply(norm_text)
dataset['clean_answer'] = dataset['Answer'].apply(norm_text)


In [25]:
def norm_int(integer):
    pattern = re.compile(r"[^A-Za-z0-9\s]")
    integer = integer.split()
    integer = [re.sub(pattern,"", i) for i in integer]
    integer = " ".join(integer)
    
    try:
        integer = int(integer)
    except ValueError:
        integer = 0
    return integer

dataset['clean_value'] = dataset['Value'].apply(norm_int)

dataset['Air Date'] = pd.to_datetime(dataset['Air Date'],format="%Y/%m/%d")


In [26]:
def count_occurrences(row):
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()

    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for i in split_answer:
        if i in split_question:
            match_count +=1
    return match_count/len(split_answer)

answer_in_question = dataset.apply(count_occurrences, axis=1)
print(answer_in_question.mean())



0.0590019652498


There ** only 6%  that question would be asked again **.  
This is extremely low chance. We have to make sure that percentage of full   data our dataset represent.

In [27]:
question_overlap = list()
terms_used = set()
dataset.sort_values(by=['Air Date'], ascending=True)
cnt = 0
for k,v in dataset.iterrows():
    split_question = v['clean_question'].split()
    split_question = [i for i in split_question if len(i) >= 6]
    match_count = 0
    for i in split_question:
        if i in terms_used:
            match_count += 1
        terms_used.add(i)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    question_overlap.append(match_count)

dataset['question_overlap'] = question_overlap
print("mean ",dataset['question_overlap'].mean())
# print(dataset['question_overlap'].sum()/len(question_overlap))

mean  0.692596005734


The mean is about 65% that means we have a good chance to meet the question   in the future. Still it's only part of dataset, so we have to simulate the   other data.

In [28]:
 # HERE
def filter_value(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value
dataset['high_value'] = dataset.apply(filter_value, axis=1)

def count_low_high(word):
    low_count = 0
    high_count = 0
    for k,v in dataset.iterrows():
        clean_q = v['clean_question'].split()
        if word in clean_q:
            if v['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return low_count, high_count
observed_expected = list()
terms_used = list(terms_used)
comparison_terms = terms_used[:5]


for i in comparison_terms:
    observed_expected.append(count_low_high(i))
print(observed_expected)


[(1, 0), (1, 0), (1, 3), (1, 0), (0, 1)]


In [29]:
print(list(enumerate(dataset.columns)))

[(0, 'Show Number'), (1, 'Air Date'), (2, 'Round'), (3, 'Category'), (4, 'Value'), (5, 'Question'), (6, 'Answer'), (7, 'clean_question'), (8, 'clean_answer'), (9, 'clean_value'), (10, 'question_overlap'), (11, 'high_value')]


In [30]:
high_value_count = dataset[dataset['high_value'] == 1].shape[0]
low_value_count = dataset[dataset['high_value'] == 0].shape[0]
print(high_value_count,low_value_count)

5734 14265


In [31]:
from scipy.stats import chisquare
chi_squared = list()

for i in observed_expected:
    total = i[0] + i[1]
    tottal_prop = total/dataset.shape[0]
    expected_high = tottal_prop * high_value_count
    expected_low = tottal_prop * low_value_count
    chi_sq, p_value = chisquare([i[0],i[1]], [expected_high,expected_low])
    chi_squared.append((chi_sq,p_value))

In [32]:
chi_squared

[(2.4877921171956752, 0.11473257634454047),
 (2.4877921171956752, 0.11473257634454047),
 (0.026364433084407689, 0.87101348468892104),
 (2.4877921171956752, 0.11473257634454047),
 (0.40196284612688399, 0.52607729857054686)]

Definitly there is no statistically significant data that we are looking at.  

Null hypothesis was that we can't learn the answer for question to answer   correctly and it wasn't rejected.  

I want mention that it can depend from data cleaning. We dosen't study   cleaned text. So I can't say that it was good cleaned from useless sumbols  