In [2]:
import pandas as pd
import numpy as np


In [3]:
df = pd.read_table('data/clues.tsv')

In [4]:
df.head(5)

Unnamed: 0,pubid,year,answer,clue
0,atc,1997,,
1,atc,1997,ABA,Litigator's group
2,atc,1997,ACTOR,Thespian
3,atc,1997,ADES,Summer coolers
4,atc,1997,ADS,Newspaper revenue


In [5]:
# Keep only the answer/clue columns and discard rows missing either value.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore'
# tolerates the columns already being gone, so re-running this cell no longer
# raises KeyError.
df = df.drop(columns=['pubid', 'year'], errors='ignore').dropna()
df.shape

(6459190, 2)

In [6]:
df['answer'].nunique()

315686

In [7]:
df['clue'].nunique()

2650153

In [8]:
df['clue'].nunique() / df['answer'].nunique()

8.394901896187985

In [9]:
(df['answer'] + df['clue']).nunique()

3141343

In [10]:

def sample_strings_containing(containing:list,  data:pd.DataFrame, column:str, n_samples:int=1) -> dict:
    """Searches a column within a dataframe and returns random samples of entries
    containing each of the given key words.

    Keys are matched as literal substrings (not regular expressions). Missing
    values in the column never match.

    Args:
        containing (list): Key words to look for. Any iterable of strings works,
            e.g. a plain string is searched character by character.
        data (pd.DataFrame): Dataframe to search.
        column (str): Column to search within the dataframe.
        n_samples (int, optional): Number of distinct rows to sample for each key.
            If a key has fewer matching rows than this, all of them are
            returned. Defaults to 1.

    Returns:
        dict: Maps each key that matched at least one row to a list of sampled
        values from ``column``. Keys without any match are omitted.
    """
    samples = {}
    for key in containing:
        # Build the mask from the frame that was passed in, not from a notebook global.
        mask = data[column].str.contains(key, regex=False, na=False)
        matches = data.loc[mask, column]
        n_matches = matches.shape[0]
        if n_matches == 0:
            continue
        # Cap the request so sampling without replacement cannot raise.
        size = min(n_samples, n_matches)
        positions = np.random.choice(n_matches, size, replace=False)
        samples[key] = [matches.iloc[pos] for pos in positions]
    return samples

In [11]:
from string import punctuation
punctuation_samples = sample_strings_containing(punctuation,df,'clue',n_samples=5)

In [12]:
pd.DataFrame(punctuation_samples).T

Unnamed: 0,0,1,2,3,4
!,It'll never happen!,"Slangy ""certainly!""",Party!,Pronto!,[Brrrr!]
"""","Prince ""Get ___""","Cereal ""for kids""","He ""was here""","The ""Say Hey Kid""","Kay Kyser's ""_____ Reveille"""
#,1986 #1 hit for Whitney Houston,"Word accompanying ""Much,"" ""Little"" and ""Late"" ...",Singer with the 2009 #1 hit 'Tik Tok',"2019 #1 album by Tyler, the Creator","Rose ___, group with the 1977 #1 hit ""Car Wash"""
$,"$2.50 per 1/5 mile, e.g.",$200 Monopoly props.,"Baseball's $252 million man, for short","Word for the symbols ""@#$%&!"" used in comic st...","$5 bill, informally"
%,50% to start?,Super Bowl winner more than 50% of the time: A...,They represent over 40% of Scrabble tiles,Give the server 30%,"Say ""@#$%"" to"
&,"AT&T Worldnet, for one",+ & #,Old AT&T rival,"Geena's ""Thelma & Louise"" co-star",Field & __ Magazine
','60s-'80s Brit. sports car,Pothead's purchase,Stern's opposite,Cabooses' spot,Gin's companion.
(,Mai ___ (rum-based beverage),Run ___ (accumulate debt at the bar),___ khan (tiger),Covert ___ (military assignment),"___ precedent (establishes usage, perhaps)"
),Deprive (of),Hinge (on),Id ___ (that is),___ Spring (revolutionary movement of the 2010s),Mai ___ (cocktail)
*,*Rocking good time,*Common military uniform color (3 to 7),"*Singer often called ""The Queen of Country""","*""Have a nice day"" graphic","*Mountain, for Boulder (see letters 3 to 9)"


In [13]:
df[df['clue'].str.contains('Nobel')].shape

(3823, 2)

In [14]:
df[df['clue'].str.contains('actor')].shape

(9693, 2)

In [15]:
df[df['clue'].str.contains('slang')].shape

(7147, 2)

In [16]:
df[df['clue'].str.contains('%')].shape

(1242, 2)

In [17]:
df[df['clue'].str.contains('Across')].shape

(27728, 2)

In [18]:
df[df['clue'].str.contains('-Down')].shape

(11317, 2)

In [19]:
df[df['clue'].str.contains('-Across')].shape

(19302, 2)

In [20]:
df[df['clue'].str.contains('(',regex=False)].shape

(111995, 2)

In [21]:
df[df['clue'].str.contains('Abbr.',regex=False)].shape

(49424, 2)

In [22]:
df[df['clue'].str.contains(';',regex=False)].shape

(1940, 2)

In [23]:
df[df['clue'].str.contains('@',regex=False)].shape

(174, 2)

In [24]:
df[df['clue'].str.contains('[',regex=False)].shape

(2063, 2)

In [25]:
df[df['clue'].str.contains('^',regex=False)].shape

(285, 2)

In [26]:
df[df['clue'].str.contains('_',regex=False)].shape

(382745, 2)

# Cleaning / Handling Strategies
#### Brainstorming ideas for how to handle the variety of different questions and also clean strings so they are readable by the algorithm. Bullet points are separate ideas that might be used individually or in conjunction with some/all of the others.



Capital letters not at the start of sentence:
* flag as involving a noun
* flag as requiring a noun
* convert into ____
* use in generating a topic

Numbers:
* remove
* flag as involving number

Single characters:
* remove

Containing "-Across" or "-Down"
* remove from dataset (only of use in a later iteration of the model that solves an entire puzzle rather than single questions)

!
* If at the end of a sentence, separate it with a space and use it as a word; otherwise delete it

"
* Remove " from string
* Remove text outside of " " 
* Remove text inside of " " but use it for generating a topic as a new value

\#
* Remove

$
* Replace with " money " or something similar
* Remove

%
* Remove

&
* Remove

'
* Remove
* If followed by an 's' remove that too

( )
* Remove

\*

* Remove
* an asterisk at the start of a clue might be useful for a later iteration of the model that solves an entire puzzle rather than a single question

\+

* Remove
* Separate into multiple questions using + as the separator and create a new clue composed of the solutions of each segment

,

* Remove

\-
* Remove

.	
* Remove

/
* Remove

:
* Remove
* Use text before to generate topic and text after as clue

;
* Remove

<>
* Remove

=
* Remove
* Use text after to generate topic and text before as clue

?
* Remove

@
* Remove

[]
* Remove

\
* Remove

^
* Remove
* Replace with "squared" or something similar

_
* Flag for requiring a blank within sentence
* Remove

`
* Remove
* If double, treat as "

{}
* Remove

|
* Remove

~
* Remove


### Features:
* answer
* clue_vectorized (sparse vector)
* topic  (bool for each topic)
* requires_noun (bool)
* fill_blank (bool)

In [56]:
# Regex patterns earmarked for removal from text (currently only runs of digits).
# Written as raw strings so backslash escapes such as \d reach the regex engine
# without triggering Python's invalid-escape-sequence warning.
EXPRESSIONS_TO_REMOVE = [r'\d+',]

def prepare_data(data:pd.DataFrame) -> pd.DataFrame:
    """Add the feature columns to ``data`` and drop repeated answer/clue pairs.

    The frame is modified in place: each feature helper adds its column, then
    rows that duplicate an earlier (answer, clue) pair are removed.

    Args:
        data (pd.DataFrame): Frame with 'answer' and 'clue' columns.

    Returns:
        pd.DataFrame: The same frame that was passed in.
    """
    for add_feature in (prepare_noun_required, prepare_fill_blank):
        add_feature(data)
    # Keep the first occurrence of every (answer, clue) pair.
    data.drop_duplicates(subset=['answer', 'clue'], keep='first', inplace=True)
    return data

def prepare_noun_required(data:pd.DataFrame) -> pd.DataFrame:
    """Add a boolean 'noun_required' column to ``data`` in place.

    A clue is flagged when it contains an uppercase ASCII letter followed,
    later on the same line, by another uppercase ASCII letter.

    Args:
        data (pd.DataFrame): Frame with a 'clue' column of strings.

    Returns:
        pd.DataFrame: The same frame, now carrying 'noun_required'.
    """
    two_capitals = '[A-Z].*[A-Z]'
    has_two_capitals = data['clue'].str.contains(two_capitals, regex=True)
    data['noun_required'] = has_two_capitals
    return data

def prepare_fill_blank(data:pd.DataFrame) -> pd.DataFrame:
    """Add a boolean 'fill_blank' column to ``data`` in place.

    A clue is flagged when it contains a literal underscore character.

    Args:
        data (pd.DataFrame): Frame with a 'clue' column of strings.

    Returns:
        pd.DataFrame: The same frame, now carrying 'fill_blank'.
    """
    clues = data['clue']
    data['fill_blank'] = clues.str.contains('_', regex=False)
    return data

def clean_strings(data:pd.DataFrame) -> pd.DataFrame:
    """Lower-case the text columns and normalise symbols in the clues, in place.

    'answer' and 'clue' are lower-cased. Within 'clue', every '$' becomes
    ' money ' and every doubled backtick becomes a double quote. The
    substitutions are literal substring replacements applied to the clue text
    only; answers are left otherwise untouched.

    Args:
        data (pd.DataFrame): Frame with 'answer' and 'clue' string columns.

    Returns:
        pd.DataFrame: The same frame that was passed in.
    """
    data['answer'] = data['answer'].str.lower()
    # DataFrame.replace only matches whole cell values and returns a new frame,
    # so the substitutions go through the .str accessor and are assigned back.
    data['clue'] = (
        data['clue']
        .str.lower()
        .str.replace('$', ' money ', regex=False)
        .str.replace('``', '"', regex=False)
    )
    return data
    

In [57]:
prepare_data(df)

Unnamed: 0,answer,clue,noun_required,fill_blank
1,ABA,Litigator's group,False,False
2,ACTOR,Thespian,False,False
3,ADES,Summer coolers,False,False
4,ADS,Newspaper revenue,False,False
5,AFRO,Head of hair,False,False
...,...,...,...,...
6535227,ZERO,"Love, in courts",False,False
6535228,ZEST,Citrus peel used for flavor,False,False
6535229,ZEUS,Chainer of Prometheus,True,False
6535231,ZOOMCLASS,Computer training?,False,False
