The goal of this project is to figure out some patterns in the questions that could help winning based on a dataset of Jeopardy questions.

In [22]:
import pandas as pd
import numpy as np
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy = jeopardy[:20000]
jeopardy.head(3)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona


In [23]:
jeopardy.columns = jeopardy.columns.str.strip()
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

Normalizing text

In [25]:
import string
def norm(x):
    """ 
    Converting the string to lowercase. Remove all punctuation in the string 
    so that Don't and don't aren't considered to be different words.
    """
    x = x.str.lower()
    x = [''.join(c for c in s if c not in string.punctuation) for s in x]
    x = [s for s in x if s]
    return x

jeopardy['clean_question'] = norm(jeopardy['Question'])
jeopardy['clean_answer'] = norm(jeopardy['Answer'])
jeopardy[['clean_question', 'clean_answer']].head(3)

Unnamed: 0,clean_question,clean_answer
0,for the last 8 years of his life galileo was u...,copernicus
1,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,the city of yuma in this state has a record av...,arizona


Normalizing columns

In [26]:
# normalize dollar values by Removing punctuations in the string and Converting the string to an integer.  
# If the conversion has an error, assign 0.
def conv_str(ins):
    outs = ins.replace('$','').replace(',', '')
    if outs == 'None':
        return 0 
    else:
        outs = int(outs)
        return outs
jeopardy['clean_value'] = jeopardy['Value'].apply(conv_str)
jeopardy[['clean_question', 'clean_answer', 'clean_value']].head(3)

Unnamed: 0,clean_question,clean_answer,clean_value
0,for the last 8 years of his life galileo was u...,copernicus,200
1,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,the city of yuma in this state has a record av...,arizona,200


In [27]:
# converting the Air Date column to a datetime column
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy[['Air Date', 'clean_question', 'clean_answer', 'clean_value']].head(3)

Unnamed: 0,Air Date,clean_question,clean_answer,clean_value
0,2004-12-31,for the last 8 years of his life galileo was u...,copernicus,200
1,2004-12-31,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,2004-12-31,the city of yuma in this state has a record av...,arizona,200


In order to figure out whether to study past questions, study general knowledge, or not study it all, it would be helpful to figure out,
    1. How often the answer is deducible from the question.
    2. How often new questions are repeats of older questions.


In [28]:
# split quetion and answer and remove 'the' since it is commonly found in answers and questions but not useful answers
# finding out how many words in answers are also found in quetions
def sp_col(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    match_count = 0
   
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    else:
        for item in split_answer:
            if item in split_question:
                match_count += 1
        return match_count/len(split_answer)
    
jeopardy['answer_in_question'] = jeopardy.apply(sp_col, axis = 1)
jeopardy['answer_in_question'].mean()

0.060349756216006266

The answer only appears in the question about 6% of the time. Hearing a question wont't help figuring out the answer.

In [29]:
# finding out how often new questions are repeats of older ones
question_overlap = []
terms_used = set()
for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    templs = []
    for word in split_question:
        if len(word) >= 6:
            templs.append(word)
    split_question = templs
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.6919565346637286

nearly 70% words were reused in quetions

#### which terms correspond to high-value questions?

In [None]:
# create high_value column
def val(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value

jeopardy['high_value'] = jeopardy.apply(val, axis=1)

In [32]:
# Determine high and low values for questions 

def high_low_counts(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_row = row['clean_question'].split(' ')
        if word in split_row:
            if row['high_value']==1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []   
comparison_terms = list(terms_used)[:5]
for item in comparison_terms:
    ls = high_low_counts(item)
    observed_expected.append(ls)
print(observed_expected)

[(1, 0), (0, 1), (0, 1), (0, 1), (1, 2)]


In [33]:
high_value_count = jeopardy[jeopardy['high_value']==1].shape[0]
low_value_count = jeopardy[jeopardy['high_value']==0].shape[0]
print(high_value_count, low_value_count)

5734 14266


### <b> Computing chi-squared value and p-value given the expected and observed counts.

In [35]:
chi_squared =[]

from scipy.stats import chisquare
import numpy as np

for item in observed_expected:
    total = item[0]+item[1]
    total_prop = total / jeopardy.shape[0]
    high_counts = total_prop * high_value_count
    low_counts = total_prop * low_value_count
    
    observed = np.array([item[0], item[1]])
    expected = np.array([high_counts, low_counts])
    chi_square = chisquare(observed, expected)
    chi_squared.append(chi_square)
chi_squared

[Power_divergenceResult(statistic=2.4879665155214514, pvalue=0.11471986177699109),
 Power_divergenceResult(statistic=0.4019346698443852, pvalue=0.5260918005187468),
 Power_divergenceResult(statistic=0.4019346698443852, pvalue=0.5260918005187468),
 Power_divergenceResult(statistic=0.4019346698443852, pvalue=0.5260918005187468),
 Power_divergenceResult(statistic=0.03190173163299733, pvalue=0.8582435032724245)]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
houses = pd.read_table('Raw_data\AmesHousing_1.txt')
houses['SalePrice'].plot.kde(xlim = (houses['SalePrice'].min(),
                                    houses['SalePrice'].max()))
st_dev = houses['SalePrice'].std(ddof = 0)
mean = houses['SalePrice'].mean()
plt.axvline(mean, color='black', label='Mean')
plt.axvline(mean+st_dev, color='red', label='Standard deviation')  
plt.axvline(220000, color='Orange', label='220000') 
plt.legend()
very_expensive = False

In [None]:
# a faster and more precise way to measure how far off a value is from the mean.  The standard score or the z-score = the number of standard deviations away from the mean

st_devs_away = (220000 - houses['SalePrice'].mean())/houses['SalePrice'].std(ddof = 0)
st_devs_away

### Z-score

In [None]:
min_val = houses['SalePrice'].min()
mean_val = houses['SalePrice'].mean()
max_val = houses['SalePrice'].max()

def find_z(n, arr, bessel = 0):
    import numpy as np
    m = sum(arr)/len(arr)
    s = np.std(arr)
    dis = n-m
    z = dis/s
    return z
min_z = find_z(min_val, houses['SalePrice'])
mean_z = find_z(mean_val, houses['SalePrice'])
max_z = find_z(max_val, houses['SalePrice'])
print(min_z, mean_z, max_z)

In [None]:
def z_score(value, array, bessel = 0):
    mean = sum(array) / len(array)
    from numpy import std
    st_dev = std(array, ddof = bessel)
    distance = value - mean
    z = distance / st_dev
    return z
north_ames = houses[houses['Neighborhood'] == 'NAmes']
college_cr = houses[houses['Neighborhood'] == 'CollgCr']
old_town = houses[houses['Neighborhood'] == 'OldTown']
edwards = houses[houses['Neighborhood'] == 'Edwards']
somerset = houses[houses['Neighborhood'] == 'Somerst']

north_ames_z = z_score(200000, north_ames['SalePrice'], bessel = 0)
college_cr_z = z_score(200000, college_cr['SalePrice'], bessel = 0)
old_town_z = z_score(200000, old_town['SalePrice'], bessel = 0)
edwards_z = z_score(200000, edwards['SalePrice'], bessel = 0)
somerset_z = z_score(200000, somerset['SalePrice'], bessel = 0)
print(north_ames_z, college_cr_z, old_town_z, edwards_z, somerset_z)

In [None]:
# Z-scores are often used to transform entire distributions by converting all the values to z-scores. For every distribution of z-scores, 
# the mean is always 0 and the standard deviation is always 1

mean = houses['SalePrice'].mean()
st_dev = houses['SalePrice'].std(ddof = 0)
houses['z_prices'] = houses['SalePrice'].apply(
    lambda x: ((x - mean) / st_dev)
    )
z_mean_price = houses['z_prices'].mean()
z_stdev_price = houses['z_prices'].std(ddof = 0)

mean = houses['Lot Area'].mean()
st_dev = houses['Lot Area'].std(ddof = 0)
houses['z_area'] = houses['Lot Area'].apply(
    lambda x: ((x - mean) / st_dev)
    )
z_mean_area = houses['z_area'].mean()
z_stdev_area = houses['z_area'].std(ddof = 0)
print(z_mean_area, z_stdev_area )

In [None]:
from numpy import std, mean
population = [0,8,0,8]
m = mean(population)
s = std(population)
pop_z = (population - m)/s
mean_z = mean(pop_z)
stdev_z = std(pop_z)
print(mean_z, stdev_z)

In [None]:
# the formula for the sample standard deviation (the formula containing Bessel's correction)

from numpy import std, mean
sample = [0,8,0,8]

x_bar = mean(sample)
s = std(sample, ddof = 1)

standardized_sample = (sample - x_bar)/s
stdev_sample = std(standardized_sample, ddof = 1)
print(standardized_sample, stdev_sample)

In [None]:
# Standardizing distributions can prove very useful when we need to compare values coming from different systems of measurement

mean_index1 = houses['index_1'].mean()
stdev_index1 = houses['index_1'].std(ddof = 0)
houses['z_1'] = houses['index_1'].apply(lambda x: 
                                      (x - mean_index1) / stdev_index1
                                     )

mean_index2 = houses['index_2'].mean()
stdev_index2 = houses['index_2'].std(ddof = 0)
houses['z_2'] = houses['index_2'].apply(lambda x: 
                                      (x - mean_index2) / stdev_index2
                                     )

print(houses[['z_1', 'z_2']].head(2))
better = 'first'


In [None]:
# use the formula x=z*theta + mu to convert z-scores to more intuitive values. 

x= houses['z_merged']*10 + 50
mean_transformed = x.mean()
stdev_transformed = x.std(ddof =0)

### Probability

In [None]:
# INITIAL CODE
from numpy.random import seed, randint

seed(1)

def coin_toss():
    if randint(0,2) == 1:
        return 'HEAD'
    else:
        return 'TAIL'
    
probabilities = []
heads = 0
for n in range(1, 10001):
    outcome = coin_toss()
    if outcome =='HEAD':
        heads += 1
    current_probability = heads/n
    probabilities.append(current_probability)
    
print(probabilities[0:10])

In [None]:
# sample space = the set of all possible outcomes

coin_toss_omega = ['HH', 'HT', 'TT', 'TH']

# Venn diagrams.  Events that don't intersect are called mutually exclusive.  Events that intersect are called mutually non-exclusive


### Solving Complex Probability Problems

In [None]:
#  two ads on the same web page (ad "A" and ad "B") to 100 users. At the end of the trial, they found: 12 users clicked on ad "A", 17 users clicked on ad "B", 3 users clicked on both ad "A" and ad "B"

p_a = 12/100
p_b = 17/100
p_a_and_b = 3/100
p_a_or_b = p_a + p_b - p_a_and_b
print(p_a_or_b)

In [None]:
# Event A (it takes three flips or more for a coin to land heads up)
# The opposite of event A=coin flipped twice lands heads up at least once: non-A={1,2}

# Find the probability that it takes four flips or more for a coin to land heads up (event "B").

p_non_b = 7/8
p_b = 1 - p_non_b

In [None]:
# multiplication rule of probability
P(H1 & H2) = P(H1) * P(H2)

# What is the probability of getting at least one 6 in four throws of a single six-sided die?  Event A is getting at least one 6 in four throws, 
# event AC is not getting any 6 in four throws. So event AC is equivalent to getting any of the outcomes {1, 2, 3, 4, 5} four times in a row. 
P(A^C) = (5/6)^4 = 0.4823
P(A) = 1-(5/6)^4 = 0.5177

# Find the probability of getting at least one double-six in 24 throws of two six-sided dice (the two dice are thrown simultaneously).
p_one_double_6 =  1 - (35/36)**24

In [None]:
#  find the probability of getting four aces in a row when drawing cards from a standard 52-card deck. The deck has four aces and a total of 52 cards.
P(Ace) = 4/52 # probability of drawing an ace
P(AAAA) = (4/52)^4 = 0.000035 # if put the card back

### Permutations and Combinations

In [None]:
# Consider the composite experiment E1E2, where E1 is rolling a fair six-sided die once, and E2 is rolling the same die again. One of the outcomes of E1E2 could be (1, 6), which means we get a 1 for the first roll and a 6 for the second one.
the total number of outcomes(n_outcomes) = 6*6 # the rule of product
the probability of getting a (6,6) = 1/n_outcomes
the probability of not getting a (5,5) = 1-1/n_outcomes

In [None]:
#  if we have an experiment E1 with a outcomes, followed by an experiment E2 with b outcomes, followed by an experiment En with z outcomes, the total number of outcomes for the composite experiment E1E2 ... En can be found by multiplying their individual outcomes.abs

# roll a fair six-sided die three times and then randomly draw a card from a standard 52-card deck. One of the outcomes could be (6, 6, 6, ace of diamonds), which means getting three 6's in a row when we roll the die, followed by drawing an ace of diamonds from the deck
total number of outcomes = 6 * 6 * 6 * 52
probability of getting (6, 6, 6, ace of diamonds) = 1/6*1/6*1/6*1/52

In [None]:
# probability of cracking a 4-digit PIN code using the code 8362 
total_outcomes_4_pin = 10**4
p_crack_4 = 1 / total_outcomes_4_pin
total_outcomes_6_pin = 10**6
p_crack_6 = 1/total_outcomes_6_pin

In [None]:
# permutation = a certain arrangement where the order of the individual elements matters.

In [None]:
def factorial(n):
    final_product = 1
    for i in range(n, 0, -1):
        final_product *= i
    return final_product

permutations_1 = factorial(6) 
permutations_2 = factorial(52)
print(permutations_1)
print(permutations_2)

In [None]:
# the number of permutations when taking only k objects from a group of n objects

def permutation(n, k):
    numerator = factorial(n)
    denominator = factorial(n-k)
    return numerator/denominator

A fictional mobile app to heap treating lottery addiction that will enalble user to build functions that enable users to answer probability questions about playing the 6/49 lottery games such as:

What is the probability of winning the big prize with a single ticket?
What is the probability of winning the big prize if we play 40 different tickets (or any other number)?
What is the probability of having at least five (or four, or three, or two) winning numbers on a single ticket?

In [None]:
def factorial(n):
    total = 1
    for i in range(n, 0, -1):
        total *= n
    return total

def combination(n, k):
    a = factorial(n)
    b = factorial(k)*factorial(n-k)
    return a/b

def one_ticket_probability(w_nums):
    l = len(w_nums)
    total_outcome = combination(49, l)
    p_win = 1/total_outcome
    print("Hey, you only got {} chance to win!".format(p_win))

one_ticket_probability([1,2,3,4,5,6])

In [None]:
import pandas as pd
df = pd.read_csv("Raw_data\649.csv")
print(df.shape)
df.head(3)

In [None]:
# find winning probabily of a given set of numbers
def extract_numbers(num_ls):
    for n in num_ls:
        df1 = df[df["NUMBER DRAWN 1"] == num_ls[0]]
        df2 = df1[df1["NUMBER DRAWN 2"] == num_ls[1]]
        df3 = df2[df2["NUMBER DRAWN 3"] == num_ls[2]]
        df4 = df3[df3["NUMBER DRAWN 4"] == num_ls[3]]
        df5 = df4[df4["NUMBER DRAWN 5"] == num_ls[4]]
        df6 = df5[df5["NUMBER DRAWN 6"] == num_ls[5]]
    prob_win = df6.shape[0] / df.shape[0]
    print("n_times_selected = {0}".format(df6.shape[0]))
    print("The probability you wining is {:.5f}".format(prob_win))

extract_numbers([3, 11,12,14,41,43])

In [None]:
# calculate probabilities based on number of tickets bought
def multi_ticket_probability(ls):
    total_outcome = combination(49,6)
    for i in ls:
        prob = i/total_outcome
        print("Your chance of winning with {} tickets is {:.5f}%".format(i, prob*100))

ls= [1, 10, 100, 10000, 1000000, 6991908, 13983816]
multi_ticket_probability(ls)

In [None]:
# calculate probability  of winning with less than 6 matching numbers
def probability_less_6(n):
    n_num_comb = combination(6,n)
    total_outcome = combination(49,6)
    total_n_num_comb = (49-n)*n_num_comb
    prob = total_n_num_comb/total_outcome
    print("Your chance of winning with {} matching numbers is {:.5f}%".format(n, prob*100))

for n in [2, 3, 4, 5]:
    probability_less_6(n)

None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5.   This test is invalid since the observed or expected frequencies in each category are too small, should be at least 5.