In [203]:
# https://preppindata.blogspot.com/2021/06/2021-week-22-answer-smash.html

import pandas as pd
import numpy as np

### Input the data

In [204]:
# Load the four worksheets with a single pass over the workbook: passing a list
# to `sheet_name` makes read_excel return a dict of DataFrames keyed by sheet name.
# The path uses a forward slash so it resolves on Windows, Linux and macOS alike
# (a backslash separator only works on Windows).
INPUT_WORKBOOK = 'data/PD 2021 Wk 22 input.xlsx'
input_sheets = pd.read_excel(
    INPUT_WORKBOOK,
    sheet_name=['Answer Smash', 'Names', 'Questions', 'Category'],
)

df_answer = input_sheets['Answer Smash']
df_names = input_sheets['Names']
df_questions = input_sheets['Questions']
df_cat = input_sheets['Category']

### The category dataset requires some cleaning so that Category and Answer are 2 separate fields

In [205]:
# Split "Category: Answer" into its two parts. n=1 splits on the first ': ' only,
# so an answer that itself contains ': ' stays intact instead of producing a
# third column (which would make the two-column assignment fail).
df_cat[['Category', 'Answer']] = df_cat['Category: Answer'].str.split(': ', n=1, expand=True)

# Lower-case and trim the answer so it can be compared with the lower-cased
# answer smash text later on.
df_cat['Answer'] = df_cat['Answer'].str.lower().str.strip()
df_cat

Unnamed: 0,Category: Answer,Category,Answer
0,Animals: Aardvark,Animals,aardvark
1,Companies: Amazon,Companies,amazon
2,Companies: Annies Burger Shack,Companies,annies burger shack
3,Science: Astrophysics,Science,astrophysics
4,Companies: Barnes & Noble,Companies,barnes & noble
5,Characters: Bert and Ernie,Characters,bert and ernie
6,Characters: Big bird,Characters,big bird
7,Science: Brain,Science,brain
8,Animals: Brown Bear,Animals,brown bear
9,Companies: Byron Burgers,Companies,byron burgers


### Join the datasets together, making sure to keep an eye on row counts 

In [206]:
# Attach each answer smash to its question; the inner join on 'Q No' keeps only
# question numbers present in both frames.
df_QA = df_questions.merge(df_answer, how='inner', on='Q No')
df_QA

Unnamed: 0,Q No,Category,Question,Answer Smash
0,1,Animals,Which mammal has the latin name panthera uncia?,Mo Hassnow leopard
1,2,Characters,Name the famous Sesame Street duo.,Kelly Gilbert and ernie
2,3,Science,What are joules or therms an example of?,Arsenergy units
3,4,Science,What parts of the body contain immune cells to...,Nicolas Mieszalymph nodes
4,5,Science,Which branch of space science applies the laws...,Amalia García-Vellido Santíastrophysics
5,6,Companies,What's the name of the American bookseller fou...,Owen Barnes & Noble
6,7,Companies,Which British cycle retailer was founded in 1921?,Simon Evans Cycles
7,8,Food,Which side dish consists mainly of shredded ra...,Donna Coleslaw
8,9,Characters,"Who is the a genius, billionaire, playboy, phi...",Will Suttony Stark
9,10,Animals,Which animal is frequently confused with the a...,Shahzad Ziaardvark


### Filter the data so that each answer smash is matched with the corresponding name and answer

In [207]:
def _first_name_in(smash):
    """Return the first value of df_names['Name'] contained in ``smash``, or None."""
    for candidate in df_names['Name']:
        if candidate in smash:
            return candidate
    return None


# The comparison is case-sensitive and runs against the smash text as loaded.
matches = df_QA['Answer Smash'].apply(_first_name_in)
df_QA['Name'] = matches
df_QA

Unnamed: 0,Q No,Category,Question,Answer Smash,Name
0,1,Animals,Which mammal has the latin name panthera uncia?,Mo Hassnow leopard,Mo Hassn
1,2,Characters,Name the famous Sesame Street duo.,Kelly Gilbert and ernie,Kelly Gilbert
2,3,Science,What are joules or therms an example of?,Arsenergy units,Arsene
3,4,Science,What parts of the body contain immune cells to...,Nicolas Mieszalymph nodes,Nicolas Mieszaly
4,5,Science,Which branch of space science applies the laws...,Amalia García-Vellido Santíastrophysics,Amalia García-Vellido Santías
5,6,Companies,What's the name of the American bookseller fou...,Owen Barnes & Noble,Owen Barnes
6,7,Companies,Which British cycle retailer was founded in 1921?,Simon Evans Cycles,Simon Evans
7,8,Food,Which side dish consists mainly of shredded ra...,Donna Coleslaw,Donna Coles
8,9,Characters,"Who is the a genius, billionaire, playboy, phi...",Will Suttony Stark,Will Sutton
9,10,Animals,Which animal is frequently confused with the a...,Shahzad Ziaardvark,Shahzad Zia


In [208]:
# Letter case is inconsistent, so normalise the smash text to lower case
# before it is compared with the answers.
smash_lowercase = df_QA['Answer Smash'].str.lower()
df_QA['Answer Smash'] = smash_lowercase

def find_matching_answer(x, category, names_df):
    """Return the first answer of the given category that occurs inside ``x``.

    Parameters
    ----------
    x : str
        Text to search in (the caller passes the lower-cased answer smash).
    category : str
        Category the matching answer must belong to.
    names_df : pandas.DataFrame
        Lookup table providing 'Answer' and 'Category' columns.

    Returns
    -------
    str or None
        The first answer, in row order, that is a substring of ``x`` and whose
        own row carries ``category``; None when no row qualifies or ``x`` is
        not a string.
    """
    # A missing smash value (None/NaN) cannot contain any answer.
    if not isinstance(x, str):
        return None

    # Walk both columns in lockstep so every answer is checked against the
    # category on its own row. Looking the category up by answer text would only
    # ever see the first row for an answer that is listed under several
    # categories, and would rescan the whole frame for each candidate.
    for answer, answer_category in zip(names_df['Answer'], names_df['Category']):
        # Missing answers cannot be substring-tested, and an empty string is
        # contained in every text, so neither is a meaningful match.
        if not isinstance(answer, str) or not answer:
            continue
        if answer_category == category and answer in x:
            return answer
    return None

def _answer_for_row(row):
    """Look up the answer for one df_QA row from its smash text and category."""
    return find_matching_answer(row['Answer Smash'], row['Category'], df_cat)


# Evaluate row by row and store the result in the 'Answer' column of df_QA.
matches = df_QA.apply(_answer_for_row, axis=1)
df_QA['Answer'] = matches
df_QA

Unnamed: 0,Q No,Category,Question,Answer Smash,Name,Answer
0,1,Animals,Which mammal has the latin name panthera uncia?,mo hassnow leopard,Mo Hassn,snow leopard
1,2,Characters,Name the famous Sesame Street duo.,kelly gilbert and ernie,Kelly Gilbert,bert and ernie
2,3,Science,What are joules or therms an example of?,arsenergy units,Arsene,energy units
3,4,Science,What parts of the body contain immune cells to...,nicolas mieszalymph nodes,Nicolas Mieszaly,lymph nodes
4,5,Science,Which branch of space science applies the laws...,amalia garcía-vellido santíastrophysics,Amalia García-Vellido Santías,astrophysics
5,6,Companies,What's the name of the American bookseller fou...,owen barnes & noble,Owen Barnes,barnes & noble
6,7,Companies,Which British cycle retailer was founded in 1921?,simon evans cycles,Simon Evans,evans cycles
7,8,Food,Which side dish consists mainly of shredded ra...,donna coleslaw,Donna Coles,coleslaw
8,9,Characters,"Who is the a genius, billionaire, playboy, phi...",will suttony stark,Will Sutton,tony stark
9,10,Animals,Which animal is frequently confused with the a...,shahzad ziaardvark,Shahzad Zia,aardvark


### Remove unnecessary columns

In [209]:
# Keep only the output fields, in their final order.
output_columns = ['Q No', 'Name', 'Question', 'Answer', 'Answer Smash']
df_QA = df_QA.loc[:, output_columns]
df_QA.head()

Unnamed: 0,Q No,Name,Question,Answer,Answer Smash
0,1,Mo Hassn,Which mammal has the latin name panthera uncia?,snow leopard,mo hassnow leopard
1,2,Kelly Gilbert,Name the famous Sesame Street duo.,bert and ernie,kelly gilbert and ernie
2,3,Arsene,What are joules or therms an example of?,energy units,arsenergy units
3,4,Nicolas Mieszaly,What parts of the body contain immune cells to...,lymph nodes,nicolas mieszalymph nodes
4,5,Amalia García-Vellido Santías,Which branch of space science applies the laws...,astrophysics,amalia garcía-vellido santíastrophysics


### Output the data

In [210]:
# index=False keeps the positional row index out of the file; by default to_csv
# writes it as an extra unnamed first column alongside the selected fields.
df_QA.to_csv(r'output/2021-week22-output.csv', index=False)