In [95]:
import pandas as pd

jeo = pd.read_csv('jeopardy.csv')
jeo.head()
#print(jeo.info())
jeo.columns = ['show_number', 'air_date', 'round', 'category', 'value', 'question', 'answer']


In [96]:
#we will do some cleaning on the text columns for questin & answer
import re

def clean_string(strings):

    clean = strings.lower()
    clean= re.sub("[^A-Za-z0-9\s]", "", clean)
    clean = re.sub("\s+", " ", clean)
    return clean

jeo['clean_question'] = jeo['question'].apply(clean_string)
jeo['clean_answer'] = jeo['answer'].apply(clean_string)

#we also need to convert the value column to numeric

def convert_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = float(text)
    except Exception:
        text = 0
    return text

jeo['value'] = jeo['value'].apply(convert_values)
jeo['air_date'] = pd.to_datetime(jeo['air_date'])

jeo.head()
jeo.info()
    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   show_number     19999 non-null  int64         
 1   air_date        19999 non-null  datetime64[ns]
 2   round           19999 non-null  object        
 3   category        19999 non-null  object        
 4   value           19999 non-null  float64       
 5   question        19999 non-null  object        
 6   answer          19999 non-null  object        
 7   clean_question  19999 non-null  object        
 8   clean_answer    19999 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 1.4+ MB


How often the answer can be used for a question?

To answer this question we can find out how many times words in the answer is repeated in the question

below is the logic which will use to find this:
    1. split the words in each row of clean_question into lists and assign the lists to a new column questions_list.
    2. split the words in each row of clean_answer into lists and assign the lists to a new column answers_list.
    3. create an empty list -- Counts n
    4. Now loop through each row in the answers_list column and do the following
        a. 'the' is very common, so we will remove it with the list.remove() method. For each list, count the number of occurences of the word 'the'. Since remove only removes the first occurence we have to repeat this step for the no of counts of the word.
        b. create a variable for storing the freq,now iterate over the words in the answers_list, for each word in answer thats also in question, add one to the variable.
        c. append the final count variable/ length of answer_list to the counts list
    5. Use the enumerate function to create a dictionary with the counts list. this will have the same length as the no of rows in the dataset.
    6. Turn the dictionary into a pd series and add this to the original dataset

In [97]:
jeo['questions_list'] = jeo['clean_question'].str.split()
jeo['answers_list'] = jeo['clean_answer'].str.split()


counts = []
index = 0

for row in jeo['answers_list']:
     
    length = len(row)
    if length != 0:
        repeat_words = 0  #no of times a word is repeated in answer and question
        no_of_the = 0
        for word in row:
            if word =='the':
                no_of_the += 1
        for i in range(no_of_the):
            row.remove('the')
        for entry in row:
            if entry in jeo['questions_list'].iloc[index]:
                repeat_words += 1
        
        counts.append(repeat_words/length)
        index += 1
    else:
        counts.append(0)
        index += 1
    
#print(counts, len(counts))

freq_dictionary = dict(enumerate(counts))
#print(freq_dictionary)

#now convert dictionary to pd series

freq_dict = pd.DataFrame.from_dict(freq_dictionary, orient = 'index', columns = ['repeat_freq'])

#freq_dict.head(20)

#now add to the jeo dataset

jeo_new = pd.concat([jeo,freq_dict], axis = 1)


#find the mean of the repeat_freq column

mean_repeat = jeo_new['repeat_freq'].mean()
mean_repeat

#the results (average mean of about 0.06) shows that answers are only repeated in questions about 6% of the time. 


0.05518310949957564

now lets investigate how many times questions are repeated

1. sort the jeo dataset by the air date
2. create a set that will contain all the unique words that are 6 and above letters in length
3. create an empty list that will save the frequency of repeating words (length >= 6) in each questions
4. Loop through the rows in the dataset['question_list'] and do the following:
    a. save each row into a list
    b. Loop through the list and remove all words less than 6 in length
    c. after removing words less than 6 in length from the lists, check the new length of the list
    d. if the length of the list is 0, update the freq list with 0
    e.otherwise, loop through the list.
        i. match_count = 0
        ii. for each word in the list, if the word is already in the unique_word set, match_count += 1. Else, add the word to unique set
        iii. update freq list with (match_count/length)
5. Use the enumerate function to create a dictionary with the counts list. this will have the same length as the no of rows in the dataset.
6. Turn the dictionary into a pd series and add this to the original dataset
    

In [98]:
jeopardy = jeo.sort_values('air_date')
unique_words = set()
repeat_word_freq = []

for entry in jeopardy['questions_list']:
    q_list = entry
    for word in q_list:
        if len(word) < 6:
            q_list.remove(word)
    length = len(q_list)
    if length == 0:
        repeat_word_freq.append(0)
    else:
        match_count = 0
        for term in q_list:
            if term in unique_words:
                match_count += 1
            else:
                unique_words.add(term)
        repeat_word_freq.append(match_count/length)
        
freq_d = dict(enumerate(repeat_word_freq))

freq_series = pd.DataFrame.from_dict(freq_d, orient = 'index', columns = ['repeat_word_freq'])

jeopardy_new = pd.concat([jeopardy,freq_series], axis = 1)

mean_repeat_q = jeopardy_new['repeat_word_freq'].mean()
mean_repeat_q

#There is about 70% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.



0.7989239490507691