# Jupyter notebook sample

In [8]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import gdown
import os
import string

In [7]:


# URL of the file
url = 'https://drive.google.com/uc?id=0BwT5wj_P7BKXUl9tOUJWYzVvUjA&export=download'  # Changed to correct download URL format
output = 'jeopardy.csv'  # Specify the output filename

# Download the file from Google Drive
# Check if the file already exists
if not os.path.exists(output):
    print(f"{output} not found, downloading the file...")
    # Download the file
    gdown.download(url, output, quiet=False)
else:
    print(f"{output} already exists, skipping download.")

# Read the dataset into a DataFrame
jeopardy = pd.read_csv(output)

# Print the first 5 rows of the DataFrame
print(jeopardy.head())

# Print the column names of the DataFrame
print(jeopardy.columns)

# Strip spaces from the column names
jeopardy.columns = jeopardy.columns.str.strip()

# Print the fixed column names
print(jeopardy.columns)

jeopardy.csv already exists, skipping download.
   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Val

In [11]:


# Define the normalization function
def normalize_string(text):
    # Convert the text to lowercase
    text = text.lower()

    # Remove all punctuation using string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text


# Apply normalization to both Question and Answer columns and create clean columns
jeopardy['clean_question'] = jeopardy['Question'].astype(str).apply(normalize_string)
jeopardy['clean_answer'] = jeopardy['Answer'].astype(str).apply(normalize_string)

# Display the first 5 rows of the normalized columns
print(jeopardy[['Question', 'clean_question', 'Answer', 'clean_answer']].head())

                                            Question  \
0  For the last 8 years of his life, Galileo was ...   
1  No. 2: 1912 Olympian; football star at Carlisl...   
2  The city of Yuma in this state has a record av...   
3  In 1963, live on "The Art Linkletter Show", th...   
4  Signer of the Dec. of Indep., framer of the Co...   

                                      clean_question      Answer clean_answer  
0  for the last 8 years of his life galileo was u...  Copernicus   copernicus  
1  no 2 1912 olympian football star at carlisle i...  Jim Thorpe   jim thorpe  
2  the city of yuma in this state has a record av...     Arizona      arizona  
3  in 1963 live on the art linkletter show this c...  McDonald's    mcdonalds  
4  signer of the dec of indep framer of the const...  John Adams   john adams  


In [13]:
# Function to normalize dollar values
def normalize_dollar_value(value):
    try:
        # Remove any punctuation (commas, dollar signs) and convert to integer
        value = value.translate(str.maketrans('', '', string.punctuation))
        return int(value)
    except (ValueError, TypeError):
        # Return 0 if there's a conversion error
        return 0


# Assuming 'jeopardy' is the DataFrame containing the data
# Normalize the Value column and create a clean_value column
jeopardy['clean_value'] = jeopardy['Value'].astype(str).apply(normalize_dollar_value)

# Convert the Air Date column to a datetime column
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'], errors='coerce')

# Print the first 5 rows of the cleaned DataFrame to see the changes
print(jeopardy[['Value', 'clean_value', 'Air Date']].head())

  Value  clean_value   Air Date
0  $200          200 2004-12-31
1  $200          200 2004-12-31
2  $200          200 2004-12-31
3  $200          200 2004-12-31
4  $200          200 2004-12-31


In [14]:

# Define the function to count how often words in the answer appear in the question
def answer_in_question(row):
    # Split clean_answer and clean_question into lists of words
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()

    # Create a variable to track matches
    match_count = 0

    # Remove the word 'the' from the answer, as it's too common
    if 'the' in split_answer:
        split_answer.remove('the')

    # Avoid division by zero, if the answer list is empty
    if len(split_answer) == 0:
        return 0

    # Count how many words in split_answer occur in split_question
    for word in split_answer:
        if word in split_question:
            match_count += 1

    # Return the ratio of matching words to the total words in split_answer
    return match_count / len(split_answer)


# Apply the function to each row in the DataFrame and create a new column answer_in_question
jeopardy['answer_in_question'] = jeopardy.apply(answer_in_question, axis=1)

# Calculate the mean of the answer_in_question column
mean_answer_in_question = jeopardy['answer_in_question'].mean()

# Print the result
print("Mean of words in the answer that appear in the question:", mean_answer_in_question)


Mean of words in the answer that appear in the question: 0.057891233559097824


Since very few words appear in previous questions we're better off studying general knowledge

In [15]:
# Create an empty list to store the question overlap ratios
question_overlap = []

# Create an empty set to store previously used terms
terms_used = set()

# Sort jeopardy by ascending air date
jeopardy = jeopardy.sort_values('Air Date')

# Iterate through each row in the DataFrame using iterrows
for index, row in jeopardy.iterrows():
    # Split the clean_question column into words
    split_question = row['clean_question'].split()

    # Remove words shorter than 6 characters
    split_question = [word for word in split_question if len(word) >= 6]

    # Initialize match_count to track matching words
    match_count = 0

    # Check for overlapping words with terms_used
    for word in split_question:
        if word in terms_used:
            match_count += 1

    # Add the words to the terms_used set
    for word in split_question:
        terms_used.add(word)

    # Avoid division by zero and calculate overlap ratio
    if len(split_question) > 0:
        question_overlap_ratio = match_count / len(split_question)
    else:
        question_overlap_ratio = 0

    # Append the overlap ratio to question_overlap
    question_overlap.append(question_overlap_ratio)

# Assign the overlap list to the question_overlap column in jeopardy
jeopardy['question_overlap'] = question_overlap

# Calculate and print the mean of the question_overlap column
mean_question_overlap = jeopardy['question_overlap'].mean()
print("Mean of question overlap:", mean_question_overlap)

Mean of question overlap: 0.871721036062983


Since the mean is high, terms in previous questions have been used before. It may be worth studying previous questions

In [17]:
import random


# Function to classify questions as high value (1) or low value (0)
def classify_value(row):
    return 1 if row['clean_value'] > 800 else 0


# Apply the function to each row to create a high_value column
jeopardy['high_value'] = jeopardy.apply(classify_value, axis=1)


# Function to count occurrences of a word in high and low value questions
def count_word_occurrences(word):
    low_count = 0
    high_count = 0

    # Iterate through each row in the DataFrame
    for index, row in jeopardy.iterrows():
        split_question = row['clean_question'].split()

        # Check if the word is in the question
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1

    return high_count, low_count


# Assuming terms_used is already defined with words from previous steps
# Randomly select 10 terms from terms_used
comparison_terms = random.sample(list(terms_used), 10)

# List to store observed counts for each word
observed_expected = []

# Loop through each term in comparison_terms
for term in comparison_terms:
    # Get the high and low counts for the term
    high_count, low_count = count_word_occurrences(term)

    # Append the result to the observed_expected list
    observed_expected.append([term, high_count, low_count])

# Print the observed_expected list with term counts
for term_data in observed_expected:
    term, high_count, low_count = term_data
    print(f"Term: {term}, High Count: {high_count}, Low Count: {low_count}")


Term: eduskunta, High Count: 1, Low Count: 0
Term: targetblankcornerstonea, High Count: 0, Low Count: 1
Term: frutti, High Count: 0, Low Count: 5
Term: dooryard, High Count: 4, Low Count: 3
Term: panchakosi, High Count: 0, Low Count: 1
Term: accompany, High Count: 4, Low Count: 13
Term: tenniel, High Count: 0, Low Count: 1
Term: recruits, High Count: 3, Low Count: 7
Term: fraudulent, High Count: 1, Low Count: 5
Term: supertournament, High Count: 0, Low Count: 1


In [18]:
from scipy.stats import chisquare

# Find the number of rows where high_value is 1 and 0
high_value_count = jeopardy[jeopardy['high_value'] == 1].shape[0]
low_value_count = jeopardy[jeopardy['high_value'] == 0].shape[0]

# Create an empty list to store chi-squared results
chi_squared = []

# Total number of rows in jeopardy
total_rows = jeopardy.shape[0]

# Loop through the observed counts
for term_data in observed_expected:
    term, high_count, low_count = term_data

    # Add the high and low counts to get the total count for the term
    total = high_count + low_count

    # Calculate the proportion of times the term appears in the dataset
    total_prop = total / total_rows

    # Calculate the expected counts for high and low value questions
    expected_high = total_prop * high_value_count
    expected_low = total_prop * low_value_count

    # Compute the chi-squared value and p-value
    observed = [high_count, low_count]
    expected = [expected_high, expected_low]

    chi2, p_value = chisquare(f_obs=observed, f_exp=expected)

    # Append the term, chi-squared value, and p-value to chi_squared list
    chi_squared.append([term, chi2, p_value])

# Print the chi-squared results
for result in chi_squared:
    term, chi2, p_value = result
    print(f"Term: {term}, Chi-Squared: {chi2}, p-value: {p_value}")

Term: eduskunta, Chi-Squared: 2.5317964247338085, p-value: 0.11157312838169751
Term: targetblankcornerstonea, Chi-Squared: 0.3949764642333513, p-value: 0.5296950912486695
Term: frutti, Chi-Squared: 1.974882321166757, p-value: 0.15993058334750937
Term: dooryard, Chi-Squared: 2.8662187105487287, p-value: 0.09045716995788881
Term: panchakosi, Chi-Squared: 0.3949764642333513, p-value: 0.5296950912486695
Term: accompany, Chi-Squared: 0.1917508971280775, p-value: 0.6614633388942732
Term: tenniel, Chi-Squared: 0.3949764642333513, p-value: 0.5296950912486695
Term: recruits, Chi-Squared: 0.014001457003849405, p-value: 0.9058079685768663
Term: fraudulent, Chi-Squared: 0.4010346717612653, p-value: 0.5265553925560026
Term: supertournament, Chi-Squared: 0.3949764642333513, p-value: 0.5296950912486695


Conclusion. It doesn't seem like the terms used have high chi-squared and most have a p-value above 0.1, so correlation is loose and could be due to chance