In [68]:
from random import choice, sample, seed

import pandas as pd

# Load data

In [45]:
# Read the interim dataset, remove exact duplicate rows and renumber the rows.
# reset_index() stores the old row labels in an 'index' column, which is then
# dropped together with three columns this notebook does not use.
df = (
    pd.read_csv('../data/interim/processed_dataset.csv')
    .drop_duplicates()
    .reset_index()
    .drop(columns=['index', 'question_subcategory', 'train_phase', 'question_description'])
)
df.head()

Unnamed: 0,question_id,question_keyword,question_type
0,benefit_search-preventive_care-1,Preventive care,1
1,benefit_search-preventive_care-2,Preventive care,2
2,benefit_search-preventive_care-1,Abdominal aortic aneurysm screening,1
3,benefit_search-preventive_care-2,Abdominal aortic aneurysm screening,2
4,benefit_search-preventive_care-1,Alcohol misuse screenings & counseling,1


# Load phrases

In [46]:
# Read one sheet of paraphrase templates per question type (1-4) and tag every
# row with its type. The frames are collected first and concatenated once,
# instead of growing a frame inside the loop from an empty seed frame.
phrase_frames = [
    pd.read_excel(
        '../data/external/paraphrased_questions.xlsx',
        sheet_name=f"question_type_{i}",
    ).assign(question_type=i)
    for i in range(1, 5)
]
# The per-sheet row labels are kept as they are, so index labels repeat across sheets.
phrase_df = pd.concat(phrase_frames)
# Normalise the single-character ellipsis to three dots so that every template
# marks its keyword slot the same way.
phrase_df['question'] = phrase_df['question'].str.replace('…', '...', regex=False)
# True when the template contains a '...' slot.
phrase_df['insert'] = phrase_df['question'].str.contains('...', regex=False)
# info() prints its summary itself and returns None, so it is not wrapped in print().
phrase_df.info()
phrase_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171 entries, 0 to 79
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   question       171 non-null    object
 1   question_type  171 non-null    int64 
 2   insert         171 non-null    bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 4.2+ KB
None


Unnamed: 0,question,question_type,insert
0,Which plans cover ...?,1,True
1,Which plan offers ...?,1,True
2,Which plan has ... coverage?,1,True
3,Which plans offer ... coverage?,1,True
4,plans offer ...,1,True


In [47]:
# Number of templates per question type, and how many templates contain a '...' slot.
(
    phrase_df['question_type'].value_counts(),
    phrase_df['insert'].value_counts(dropna=False),
)


(4    80
 2    43
 3    29
 1    19
 Name: question_type, dtype: int64,
 True     93
 False    78
 Name: insert, dtype: int64)

# Merge data

In [226]:
def uncapitalize(s):
    """Return `s` with its first character lower-cased.

    The rest of the string is left untouched. Any falsy input (empty string,
    None) yields the empty string.
    """
    if not s:
        return ''
    head, tail = s[:1], s[1:]
    return head.lower() + tail

def keyword_aug(s):
    """Return a randomly perturbed copy of the keyword `s`.

    Two independent coin flips are drawn from the global `random` generator:

    1. If the first flip succeeds and the keyword has more than five words, it
       is cut to its first 6-14 words (length chosen uniformly) and re-joined
       with single spaces.
    2. If the second flip succeeds the whole keyword is lower-cased. Otherwise
       only its first character is lower-cased, unless the second character
       is upper-case, in which case the keyword is returned unchanged.

    Keywords shorter than two characters (including the empty string) are
    handled without raising.
    """
    words = s.split()
    # The coin is flipped before the length test, so the random stream is
    # consumed the same way for short and long keywords.
    if choice([0, 1]) and len(words) > 5:
        range_top = choice(range(6, 15))
        s = " ".join(words[:range_top])
    if choice([0, 1]):
        return s.lower()
    # s[1:2] is '' when there is no second character, and ''.isupper() is
    # False, so one-character and empty keywords fall through to uncapitalize
    # instead of raising IndexError as s[1] would.
    if s[1:2].isupper():
        return s
    return uncapitalize(s)

In [227]:
# NOTE(review): `merged_dataset` is not referenced again in the visible part of
# the notebook; it is kept only so that a later cell relying on the name still runs.
merged_dataset = pd.DataFrame({'data':[],'label':[]})

# keyword_aug and the phrase pairing below draw from the global random
# generator; a fixed seed makes a fresh "Restart & Run All" regenerate the
# same dataset.
RANDOM_SEED = 42
seed(RANDOM_SEED)


def _make_record(question, label, question_type):
    """Return one output row as a dict keyed by the columns of the final dataset."""
    return {'question': question, 'label': label, 'question_type': question_type}


dataset_list = []
for type_idx in range(5):
    # Rows of this question type that actually carry a keyword.
    mask = (df.question_type == type_idx) & df.question_keyword.notnull()
    target_questions = df[mask]
    keyword_label_pairs = list(
        zip(target_questions['question_keyword'], target_questions['question_id'])
    )

    if type_idx > 0:
        mask = phrase_df.question_type == type_idx
        target_phrases = phrase_df[mask].question.values
        # The flag comes from the 'insert' column (template contains '...').
        insert_flags = phrase_df[mask]['insert'].values

        for k, l in keyword_label_pairs:
            # The bare keyword is a sample of its own.
            dataset_list.append(_make_record(k, l, type_idx))

            for phrase, insert_flag in zip(target_phrases, insert_flags):
                # Only templates with a slot need an augmented keyword. A
                # template without one is emitted verbatim, exactly as the
                # no-op replace would have left it.
                # NOTE(review): such templates yield the same text for every
                # keyword of this type, possibly under different labels --
                # confirm this is intended.
                q = phrase.replace('...', keyword_aug(k)) if insert_flag else phrase
                dataset_list.append(_make_record(q, l, type_idx))

    else:
        # type_idx == 0: a type-1 template carrying the keyword is joined with
        # ' and ' to a randomly chosen, lower-cased type-2 template.
        mask = phrase_df.question_type == 1
        target_phrases_left = phrase_df[mask].question.values

        mask = phrase_df.question_type == 2
        target_phrases_right = phrase_df[mask].question.str.lower().values

        for k, l in keyword_label_pairs:
            dataset_list.append(_make_record(k, l, type_idx))

            for phrase_left in target_phrases_left:
                phrase_right = choice(target_phrases_right)
                # The left part loses its question marks; the slot of the right
                # part is filled with a pronoun/article (or nothing) instead of
                # repeating the keyword.
                left_part = phrase_left.replace('...', keyword_aug(k)).replace('?','')
                right_part = phrase_right.replace('...', choice(['it', 'this', 'the', '']))
                q = left_part + ' and ' + right_part
                dataset_list.append(_make_record(q, l, type_idx))
            

In [228]:
# Collect the generated records into a frame and drop exact duplicate rows.
# The original row labels are kept, so the index has gaps after deduplication.
dataset_df = pd.DataFrame(dataset_list).drop_duplicates()
# info() prints its summary itself and returns None, so it is not wrapped in print().
dataset_df.info()
dataset_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297314 entries, 0 to 300287
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   question       297314 non-null  object
 1   label          297314 non-null  object
 2   question_type  297314 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.1+ MB
None


Unnamed: 0,question,label,question_type
0,LYUMJEV,benefit_search-pharmacy-1,0
1,Which plans cover LYUMJEV and what is the pric...,benefit_search-pharmacy-1,0
2,Which plan offers lyumjev and price list for it,benefit_search-pharmacy-1,0
3,Which plan has lyumjev coverage and this cost,benefit_search-pharmacy-1,0
4,Which plans offer lyumjev coverage and is it f...,benefit_search-pharmacy-1,0


In [229]:
# Persist the deduplicated dataset; the row labels are not written to the file.
dataset_df.to_csv(path_or_buf='../data/processed/dataset.csv', index=False)

In [209]:
# Number of generated samples per label, most frequent first.
dataset_df['label'].value_counts()

benefit_search-pharmacy-1                              109440
benefit_search-lab_services-2                           74354
benefit_search-lab_services-1                           34156
benefit_search-outpatient_X-rays-2                       6791
benefit_search-diagnostic_radiology-2                    5171
                                                        ...  
benefit_search-blood_sugar_meters-1                        20
benefit_search-blood_sugar_test_strips-1                   20
benefit_search-equipment,_prosthetics,_&_supplies-1        20
benefit_search-canes-1                                     20
benefit_search-hearing_aids-1                              20
Name: label, Length: 296, dtype: int64