In [None]:
# Dependencies.
import pandas as pd
import spacy
import gensim
import re

In [None]:
# Read the clustered clue table; the first CSV column is used as the row index.
clueDf = pd.read_csv(
    "clueClustered.csv",
    index_col=0,
    encoding="utf8",
)

In [None]:
# Load the large English spaCy pipeline used by every preprocessing step below.
# If it is not installed yet, run: python -m spacy download en_core_web_lg
nlp = spacy.load(name="en_core_web_lg")

In [None]:
# As an input into the Doc2Vec model, we look to use the same preprocessing as the clustering process.
# However, some answers consist solely of stop words. This is fine for clustering as the document is large,
# but for answer matching we need that original data. Thus we process the category and clue, but keep the
# answer as is (regardless, Doc2Vec doesn't need lemmatized input data).
clueDf["inputRaw"] = clueDf['category'] + ' ' + clueDf['clue']

# Coarse part-of-speech tags that are discarded together with stop words.
excludedPos = {"PUNCT", "PART", "CONJ", "CCONJ", "SPACE"}

# nlp.pipe streams the texts through the pipeline in batches, which is considerably faster
# than calling nlp() once per row; documents are yielded in the same order as the input.
texts = [
    " ".join(
        token.lemma_
        for token in docRaw
        if not token.is_stop and token.pos_ not in excludedPos
    )
    for docRaw in nlp.pipe(clueDf['inputRaw'])
]
clueDf['inputProcessed'] = texts

In [None]:
# Final model input = processed category/clue text followed by the untouched answer.
clueDf['input'] = clueDf['inputProcessed'].add(' ').add(clueDf['answer'])
# The two intermediate columns are no longer needed once 'input' exists.
clueDf.drop(["inputRaw", "inputProcessed"], axis="columns", inplace=True)

In [None]:
# Build the gensim corpus: one TaggedDocument per row, tagged with the row's position in clueDf['input'].
# Tokens of any length from 1 to 99 characters are kept.
corpus = [
    gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(text, min_len=1, max_len=99),
        [position],
    )
    for position, text in enumerate(clueDf['input'])
]

In [None]:
# Create the (untrained) Doc2Vec model. The hyperparameters below are largely arbitrary: there is no
# concrete way to measure performance for this use case (finding appropriate near-miss answers).
model = gensim.models.doc2vec.Doc2Vec(
    dm=0,
    vector_size=256,
    window=5,
    min_count=1,
    epochs=25,
)

In [None]:
# Train Doc2Vec: the vocabulary has to be built from the corpus first, then the model is
# trained on the same corpus using the document count and epoch count stored on the model.
model.build_vocab(corpus)
model.train(
    corpus,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

In [None]:
# Before building the near-miss answer set, we want to ensure we are not choosing duplicates.
# This is a slightly modified processing procedure, which removes all symbols and stopwords, sets to lowercase,
# and does not lemmatize (as the same word can have different lemmas based on capitalization).
def process(strr):
    """Normalize a string for duplicate detection.

    Every run of non-word characters is replaced by a single space, the result is parsed with
    the notebook-level spaCy pipeline ``nlp``, stop words and tokens tagged PUNCT, PART, CONJ,
    CCONJ or SPACE are dropped, and the lowercased text of the remaining tokens is joined with
    single spaces.

    Parameters
    ----------
    strr : str
        Raw text (a clue or an answer).

    Returns
    -------
    str
        The normalized text; empty if no token survives the filtering.
    """
    # `flags` must be given by keyword: the fourth positional parameter of re.sub is `count`,
    # so passing re.UNICODE (== 32) positionally only capped the number of replacements at 32.
    strr = re.sub(r'\W+', " ", strr, flags=re.UNICODE)
    docRaw = nlp(strr)
    excludedPos = ("PUNCT", "PART", "CONJ", "CCONJ", "SPACE")
    return " ".join(
        token.text.lower()
        for token in docRaw
        if not token.is_stop and token.pos_ not in excludedPos
    )

In [None]:
# Spaces are also added before and after the processed text as "in" will be used to determine duplicates,
# and we do not want to disqualify answers based on substring matching. These values will be used many times
# in the near-miss answer building process, so these columns are built in advance to minimize process function calls.
# The column values are iterated directly (in row order) instead of looking each row up with
# clueDf.loc[index][...]: that avoids building a full row Series twice per clue and keeps the
# resulting lists aligned with the rows they are assigned back to.
processedClueList = [" " + process(clue) + " " for clue in clueDf['clue']]
processedAnswerList = [" " + process(answer) + " " for answer in clueDf['answer']]
clueDf['processedClue'] = processedClueList
clueDf['processedAnswer'] = processedAnswerList

In [None]:
# Building the list of near-miss answers to each clue.
# All lookups go through clueDf, the frame that carries the processedClue / processedAnswer columns.
# (`gameDf` is not defined anywhere in this notebook, so referring to it fails on a fresh kernel.)
TOPN = 100                # neighbours requested per similarity query
INFER_EPOCHS = 50         # epochs passed to infer_vector
DOCUMENT_PASS_LIMIT = 6   # correct answer + up to 5 near misses from the whole-document query
TOTAL_LIMIT = 11          # correct answer + up to 10 near misses overall
# Fixed interleaving of the document-based (1-5) and answer-based (6-10) near misses;
# the correct answer always stays in the first column.
INTERLEAVE = [0, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10]

# Document tags were assigned with enumerate() when the corpus was built, i.e. they are row
# positions, so plain lists indexed by tag replace the per-candidate DataFrame lookups.
answerValues = clueDf['answer'].tolist()
processedClueValues = clueDf['processedClue'].tolist()
processedAnswerValues = clueDf['processedAnswer'].tolist()

# gensim 4 exposes the document vectors as `model.dv` and keeps `model.docvecs` only as a
# deprecated alias; older releases only have `docvecs`.
docVectors = model.dv if hasattr(model, "dv") else model.docvecs


def _addNearMisses(vector, limit, processedClue, matches, processedMatches):
    """Extend ``matches``/``processedMatches`` in place with near-miss answers.

    Walks the TOPN documents most similar to ``vector`` (most similar first) and accepts a
    document's answer when its processed form is not contained in the processed clue and is
    neither a substring nor a superstring of any processed answer accepted so far. Stops as
    soon as ``processedMatches`` holds ``limit`` entries.
    """
    for tag, _similarity in docVectors.most_similar([vector], topn=TOPN):
        if len(processedMatches) >= limit:
            break
        processedCandidate = processedAnswerValues[tag]
        if processedCandidate in processedClue:
            continue
        if any(processedCandidate in accepted or accepted in processedCandidate
               for accepted in processedMatches):
            continue
        matches.append(answerValues[tag])
        processedMatches.append(processedCandidate)


# The first entry is the header row; it is popped off again when the DataFrame is built.
matchesList = [[f"answer{i}" for i in range(1, TOTAL_LIMIT + 1)]]
for index in range(len(clueDf)):
    processedClue = processedClueValues[index]
    matches = [answerValues[index]]
    processedMatches = [processedAnswerValues[index]]

    # First pass: the documents most similar to the full document (category + clue + answer).
    # It is possible that fewer than 5 near misses are found here, but that doesn't happen in practice.
    documentVector = model.infer_vector(corpus[index].words, epochs=INFER_EPOCHS)
    _addNearMisses(documentVector, DOCUMENT_PASS_LIMIT, processedClue, matches, processedMatches)

    # Second pass: the documents most similar to just the answer, so that the emphasis is placed
    # on the structure of the answer as opposed to the clue's meaning.
    answerTokens = gensim.utils.simple_preprocess(answerValues[index], min_len=1, max_len=99)
    answerVector = model.infer_vector(answerTokens, epochs=INFER_EPOCHS)
    _addNearMisses(answerVector, TOTAL_LIMIT, processedClue, matches, processedMatches)

    # Too few candidates can survive when the clue itself is a list of answers for contestants
    # to choose from. Dropping the "not in processedClue" check would help there, but it greatly
    # increases the occurrence of duplicates. Instead, missing slots are padded with None so that
    # one such clue does not abort the whole run with an IndexError; they show up as empty cells.
    matches.extend([None] * (TOTAL_LIMIT - len(matches)))

    # Lastly, interleave the near-miss answers from the two passes.
    matchesList.append([matches[i] for i in INTERLEAVE])

In [None]:
# The first entry of matchesList is the header row; take it off and turn the rest into a DataFrame.
colNames = matchesList.pop(0)
answerDf = pd.DataFrame(data=matchesList, columns=colNames)
# Join the answer columns onto the clues by index, then discard the columns only needed for matching.
clueDf = pd.merge(clueDf, answerDf, left_index=True, right_index=True)
clueDf = clueDf.drop(columns=["answer", "input", "processedClue", "processedAnswer"])

In [None]:
# Attach the per-file metadata to every clue (joined on "filename") and write a single large file,
# both for further manual investigation and as the input of normalization.ipynb.
metadataDf = pd.read_csv(
    "metadata.csv",
    index_col=0,
    encoding="utf8",
)
clueDf = pd.merge(clueDf, metadataDf, on="filename")
clueDf.to_csv("clueAnswers.csv")