In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

DATA_FOLDER = 'data/'
TRAIN = 'train_data.csv'
TEST = 'test_data.csv'

In [2]:
from string import punctuation
EXPRESSIONS_TO_REMOVE = ["\\"+x for x in list(punctuation)]


def prepare_data(data:pd.DataFrame) -> pd.DataFrame:
    data.dropna(inplace=True)
    data = data[['answer','clue']]   #Keep only relevant columns
    prepare_noun_required(data)
    prepare_fill_blank(data)
    clean_strings(data)
    data.drop_duplicates(['answer','clue'],keep='first', inplace=True)  #Drop repetitions of same answer/clue pairs
    return data

def prepare_noun_required(data:pd.DataFrame) -> pd.DataFrame:
    data['noun_required'] = data['clue'].str.contains('[A-Z].*[A-Z]',regex=True)
    return data

def prepare_fill_blank(data:pd.DataFrame) -> pd.DataFrame:
    data['fill_blank'] = data['clue'].str.contains('_', regex=False)
    return data

def clean_strings(data:pd.DataFrame) -> pd.DataFrame:
    data['answer'] = data['answer'].str.lower().str.strip()  
    data['clue'] = data['clue'].str.lower().str.strip()
    data.replace('$', ' money ',            regex=False,    inplace=True)
    data.replace('``', '"',                 regex=False,    inplace=True)
    data.replace(r'\b\w{1,1}\b','',         regex=True,     inplace=True) 
    data.replace(EXPRESSIONS_TO_REMOVE, ' ',regex=True,    inplace=True)
    data.replace('\d+', '',                 regex=True,     inplace=True)
    data.replace('  ', ' ',                 regex=False,    inplace=True)
    return data
    

### Clean data, split to train and test sets, save files to data folder 

In [None]:
df = pd.read_table('data/clues.tsv')
df = prepare_data(df)
train, test = train_test_split(df, test_size=0.3, random_state=42)
train.to_csv(DATA_FOLDER+TRAIN,index=False)
test.to_csv(DATA_FOLDER+TEST,index=False)

In [4]:
df_train = pd.read_csv(DATA_FOLDER+TRAIN)
df_test = pd.read_csv(DATA_FOLDER+TEST)

In [6]:
df_train.head(5)

Unnamed: 0,answer,clue,noun_required,fill_blank
0,reed,oboe or clarinet,False,False
1,lor,relative of lawd,True,False
2,rem,sleep phase that michael jackson allegedly wen...,True,False
3,aegean,sea of the cyclades,True,False
4,pidgeoncoupe,walter transportation,False,False


In [7]:
df_test.head(5)

Unnamed: 0,answer,clue,noun_required,fill_blank
0,rungs,ladder parts,False,False
1,cellist,orchestra member whose instrument rests on the...,False,False
2,astor,last name in fur,False,False
3,erasure,obliteration,False,False
4,vera,see across,True,False
