In [56]:
import pandas as pd
import numpy as np


In [82]:
# Load the tab-separated clue table (path is relative to the current working directory).
df = pd.read_table('data/clues.tsv')

In [83]:
# Preview the first five rows to see which columns were loaded.
df.head(5)

Unnamed: 0,pubid,year,answer,clue
0,atc,1997,,
1,atc,1997,ABA,Litigator's group
2,atc,1997,ACTOR,Thespian
3,atc,1997,ADES,Summer coolers
4,atc,1997,ADS,Newspaper revenue


In [84]:
# Drop the publication id / year columns and discard rows with any missing value.
# Rebinding `df` (rather than mutating with inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=['pubid', 'year'], errors='ignore').dropna()
df.shape

(6459190, 2)

In [85]:
# Number of distinct values in the answer column.
df['answer'].nunique()

315686

In [87]:
# Average number of rows (clues) per distinct answer.
df.shape[0] / df['answer'].nunique()

20.460805990762974

In [99]:

def sample_strings_containing(containing:list,  data:pd.DataFrame, column:str, n_samples:int=1) -> dict:
    """Search a column of a dataframe and return random samples of the entries
    that contain each of the given key words.

    Args:
        containing (list): Key words to look for. Any iterable of strings works;
            iterating a plain string yields one key per character.
        data (pd.DataFrame): Dataframe to search.
        column (str): Column of ``data`` to search.
        n_samples (int, optional): Number of samples to return for each key. Defaults to 1.

    Returns:
        dict: Maps every key word that has at least one match to a list of
        values taken from distinct matching rows. Each list holds ``n_samples``
        values, or fewer when the key has fewer matching rows. Key words
        without any match are omitted.
    """
    samples = {}
    # Search the frame that was passed in (not a notebook-level global); the
    # column is the same for every key, so look it up once.
    values = data[column]
    for key in containing:
        # Literal substring match; na=False makes missing entries count as
        # non-matching instead of producing an invalid boolean mask.
        matches = values[values.str.contains(key, regex=False, na=False)]
        n_matches = matches.shape[0]
        if n_matches > 0:
            # Draw distinct row positions; clamp the request so that a key with
            # fewer matches than n_samples does not make choice() raise.
            picks = np.random.choice(n_matches, min(n_samples, n_matches), replace=False)
            samples[key] = [matches.iloc[i] for i in picks]
    return samples

In [62]:
from string import punctuation
# `punctuation` is a str, so the helper's loop over `containing` sees one key per
# punctuation character; up to five clues are sampled for each character.
punctuation_samples = sample_strings_containing(punctuation,df,'clue',n_samples=5)

In [63]:
# One row per punctuation key, one column per sample. from_dict(orient='index')
# builds that layout directly (no transpose needed) and pads with missing values
# when the per-key lists differ in length, where pd.DataFrame(dict) would raise.
pd.DataFrame.from_dict(punctuation_samples, orient='index')

Unnamed: 0,0,1,2,3,4
!,Neener-neener!,"Slangy lead-in for ""boy!""",Nonsense!,"Out, you!",Give me just a ___!
"""",Palace used as police headquarters on the orig...,"Baseball's ""Little Giant""","Narrator of ""Moby Dick.""","Milne's ""The House at __ Corner""","Head of state known to his people as ""Dear Lea..."
#,Rejected scratch 'n' sniff sticker #5,"Song that knocked ""Good Vibrations"" out of the...","The Heat's #32, to fans",1973 #1 hit for the Rolling Stones,1972 #1 hit
$,$$$,"$5 bill, slangily",Like a $2 bill,$50 bill depiction,"$1,000, slangily"
%,"Item at a 95% markdown, say",What about 40% of the U.S. corn crop is used for,"!@#$%, in comics",10% of a quarter of a quarter of a quarter sec...,60% of V
&,Henry & June diarist,"Pam of ""Mork & Mindy""",'Law & Order: --',Texas A & M athlete,Procter & Gamble brand with a paw print in the...
',Perfumer's compounds,Franciscus TV drama of the 60's,Communists' means of quelling opposition.,Motorist's protest,Kite's weapon
(,Take ___ (doze),___ out (rationed),Search (out),Maker of fine English china (1754–1827).,"Elaine ___ (""Seinfeld"" role)"
),___-in (like some mailing lists),Commander of the Constitution (War of 1812),___ Khan (Sir Tiger).,Agnus ___ (pre-communion prayer),Lavish affection (on)
*,*War and pinochle,"*""Anatomy of a Murder"" actor, 1959",*Submerged,*Precocious lad,**Deliberately delay


In [65]:
# Rows whose clue contains 'Nobel'; .shape is (row count, column count).
# The pattern is treated as a regex by default, but it has no metacharacters.
df[df['clue'].str.contains('Nobel')].shape

(3823, 2)

In [68]:
# Rows whose clue contains the substring 'actor' (case-sensitive).
df[df['clue'].str.contains('actor')].shape

(9693, 2)

In [72]:
# Rows whose clue contains 'slang'; as a substring match this also hits words such as 'slangily'.
df[df['clue'].str.contains('slang')].shape

(7147, 2)

In [73]:
# Rows whose clue contains '%' ('%' has no special meaning in a regex, so the default regex mode is fine).
df[df['clue'].str.contains('%')].shape

(1242, 2)

In [74]:
# Rows whose clue contains 'Across'; being a substring match, this includes every '-Across' clue as well.
df[df['clue'].str.contains('Across')].shape

(27728, 2)

In [75]:
# Rows whose clue contains '-Down' (presumably references to another entry of the same puzzle — TODO confirm).
df[df['clue'].str.contains('-Down')].shape

(11317, 2)

In [76]:
# Rows whose clue contains '-Across' (presumably references to another entry of the same puzzle — TODO confirm).
df[df['clue'].str.contains('-Across')].shape

(19302, 2)

In [90]:
# Rows whose clue contains '('. regex=False is required: a lone '(' is not a valid regular expression.
df[df['clue'].str.contains('(',regex=False)].shape

(111995, 2)

In [93]:
# Rows whose clue contains 'Abbr.'. regex=False makes the '.' a literal period instead of "any character".
df[df['clue'].str.contains('Abbr.',regex=False)].shape

(49424, 2)

In [94]:
# Rows whose clue contains a literal ';'.
df[df['clue'].str.contains(';',regex=False)].shape

(1940, 2)

In [95]:
# Rows whose clue contains a literal '@'.
df[df['clue'].str.contains('@',regex=False)].shape

(174, 2)

In [96]:
# Rows whose clue contains '['. regex=False is required: a lone '[' is an unterminated character set in a regex.
df[df['clue'].str.contains('[',regex=False)].shape

(2063, 2)

In [97]:
# Rows whose clue contains a literal '^'. regex=False matters: as a regex, '^' anchors to the start and would match every clue.
df[df['clue'].str.contains('^',regex=False)].shape

(285, 2)

In [98]:
# Rows whose clue contains a literal '_' (the sampled clues above use runs of underscores as blanks).
df[df['clue'].str.contains('_',regex=False)].shape

(382745, 2)

# Cleaning / Handling Strategies
#### Brainstorming ideas for how to handle the variety of different questions and how to clean the strings so they are readable by the algorithm. Bullet points are separate ideas that might be used individually or in conjunction with some/all of the others.



Capital letters not at the start of sentence:
* flag as involving a noun
* flag as requiring a noun
* convert into ____
* use in generating a topic

Numbers:
* remove
* flag as involving number

Single characters:
* remove

Containing "-Across" or "-Down"
* remove from dataset (only of use in a later iteration of the model that solves an entire puzzle rather than single questions)

!
* If at the end of a sentence, separate it with a space and use it as a word; otherwise delete it

"
* Remove " from string
* Remove text outside of " " 
* Remove text inside of " " but use it for generating a topic as a new value

\#
* Remove

$
* Replace with " money " or something similar
* Remove

%
* Remove

&
* Remove

'
* Remove
* If followed by an 's' remove that too

( )
* Remove

\*

* Remove
* an asterisk at the start of a clue might be useful for a later iteration of the model that solves an entire puzzle rather than a single question

\+

* Remove
* Separate into multiple questions using + as the separator and create a new clue composed of the solutions of each segment

,

* Remove

\-
* Remove

.	
* Remove

/
* Remove

:
* Remove
* Use text before to generate topic and text after as clue

;
* Remove

<>
* Remove

=
* Remove
* Use text after to generate topic and text before as clue

?
* Remove

@
* Remove

[]
* Remove

\
* Remove

^
* Remove
* Replace with "squared" or something similar

_
* Flag for requiring a blank within sentence
* Remove

`
* Remove
* If double, treat as "

{}
* Remove

|
* Remove

~
* Remove
