In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go


In [18]:
# keep_default_na=False stops pandas from converting strings such as "NA" or
# "null" into NaN, so every cell is read back as the text stored in the file.
df = pd.read_csv(
    'data/clean_data.csv',
    keep_default_na=False,
)

In [19]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,answer,clue,noun_required,fill_blank
0,aba,litigator group,False,False
1,actor,thespian,False,False
2,ades,summer coolers,False,False
3,ads,newspaper revenue,False,False
4,afro,head of hair,False,False


# Proportions of unique clues and answers

In [20]:
# Distinct counts are computed once and reused for the ratio below.
n_unique_clues = df['clue'].nunique()
n_unique_answers = df['answer'].nunique()

proportions = { 'clues': n_unique_clues,
                'answers': n_unique_answers,
                'clues per answer': n_unique_clues / n_unique_answers,
                # Count distinct (answer, clue) rows directly. Concatenating the two
                # strings without a separator would merge different pairs that happen
                # to join into the same text, e.g. ('ab', 'c') and ('a', 'bc').
                'pairings': len(df[['answer', 'clue']].drop_duplicates())
              }

print(f"""
      Unique Clues:                       {proportions['clues']} 
      Unique Answers:                     {proportions['answers']}
      Unique Answer + Clue Pairings:      {proportions['pairings']}
      
      Clues per Answer: {proportions['clues per answer']}
      """)





      Unique Clues:                       2579749 
      Unique Answers:                     315116
      Unique Answer + Clue Pairings:      3103325
      
      Clues per Answer: 8.186664593356097
      


In [21]:
# Bar chart comparing the three distinct counts computed in `proportions`.
labels = [  'Unique Clue + Answer pairings',
            'Unique Clues',
            'Unique Answers'
            ]
values = [  proportions['pairings'],
            proportions['clues'],
            proportions['answers']
            ]

fig = go.Figure(data=[go.Bar(
            x=labels, 
            y=values,
            text=values,          # print the exact count on each bar
            textposition='auto',
        )])

# Title and axis label so the figure can be read on its own when skimming.
fig.update_layout(
    title_text='Unique clue/answer pairings, clues and answers',
    yaxis_title='Count',
)

fig.show()

# Frequency of repeated answers

In [22]:
# Number of rows per distinct answer, computed once and reused for every bucket.
answer_counts = df['answer'].value_counts()

# The buckets must be contiguous and non-overlapping so that each distinct answer
# is counted exactly once: 1 | 2-10 | 11-100 | more than 100.
# The upper bound of '11-100' is inclusive; a strict `< 100` would leave answers
# that occur exactly 100 times out of every bucket.
frequencies = { '100+':    (answer_counts > 100).sum(), 
                '11-100':  ((answer_counts <= 100) & (answer_counts > 10)).sum(),
                '2-10':     ((answer_counts <= 10) & (answer_counts > 1)).sum(),
                '1':        (answer_counts == 1).sum()
              }
print(f"""
      Answers with more than 100 occurances:        {frequencies['100+']}
      Answers with between 11 and 100 occurances:   {frequencies['11-100']}
      Answers with between 2 and 10 occurances:     {frequencies['2-10']}
      Answers with only one occurance:              {frequencies['1']}
      """)




      Answers with more than 100 occurances:        5991
      Answers with between 11 and 99 occurances:    38857
      Answers with between 2 and 10 occurances:     104758
      Answers with only one occurance:              165444
      


In [23]:
# Pie chart of how many distinct answers fall into each occurrence-count bucket.
labels = list(frequencies.keys())
values = list(frequencies.values())

fig = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='label+percent',
                             insidetextorientation='radial'
                            )])
# The slice labels ('1', '2-10', ...) are only meaningful with a title.
fig.update_layout(title_text='Share of distinct answers by number of occurrences')
fig.show()

# Samples of punctuation occurring within clues (using uncleaned data)

In [39]:

def sample_strings_containing(containing:list,  data:pd.DataFrame, column:str, n_samples:int=1) -> dict:
    """Return random samples of the entries in a column that contain each key.

    Each key is matched as a literal substring (not a regular expression), so
    punctuation such as "(" or "$" can be passed directly.

    Args:
        containing (list): List of key strings to look for.
        data (pd.DataFrame): Dataframe to search.
        column (str): Name of the string column to search within the dataframe.
        n_samples (int, optional): Number of samples to return for each key.
            Defaults to 1. If fewer entries match a key, all of them are returned.

    Returns:
        dict: Maps each key that had at least one match to a list of sampled
        entries, drawn without replacement. Keys with no match are omitted.
    """
    samples = {}
    for key in containing:
        # na=False treats missing values as non-matching, so the mask stays
        # strictly boolean and can be used for row selection.
        mask = data[column].str.contains(key, regex=False, na=False)
        candidates = data.loc[mask, column]
        if candidates.empty:
            continue
        # Never ask for more rows than exist: np.random.choice raises when
        # replace=False and the requested size exceeds the population.
        n_picks = min(n_samples, len(candidates))
        positions = np.random.choice(len(candidates), size=n_picks, replace=False)
        # Use the drawn positions directly; re-drawing an index here would
        # reintroduce sampling with replacement (duplicate samples).
        samples[key] = [candidates.iloc[pos] for pos in positions]
    return samples

In [40]:
from string import punctuation

# Load the raw (uncleaned) clue/answer pairs. dropna() is chained so a new frame
# is produced, rather than mutating a column-subset in place (which can trigger
# pandas' SettingWithCopyWarning and makes the cell non-idempotent).
original_df = (
    pd.read_table('data/clues.tsv')[['clue', 'answer']]
    .dropna()
)

# Up to five random raw clues for each ASCII punctuation character.
punctuation_samples = sample_strings_containing(list(punctuation),original_df,'clue',n_samples=5)

# One row per punctuation character. orient='index' yields the same table as
# DataFrame(...).T but also tolerates sample lists of different lengths.
pd.DataFrame.from_dict(punctuation_samples, orient='index')


Unnamed: 0,0,1,2,3,4
!,Too bad!,___ be an honor!,___ roll!,"*""Everyone off!""",Ciao! cousin
"""","Actress Taylor of ""The Nanny""","Like the accent in ""crème""","Alex voices her on ""Family Guy""","Irish air ""The Rose of ___""","One of Chekhov's ""Three Sisters"""
#,Cash-strapped college student's survival ploy #5,#1,__ Go Again: Whitesnake #1 song,Boom Boom ___ (#1 song by the Black Eyed Peas),1958 Presley #1 hit
$,@#$%! This wake is getting totally out of hand!,Change for a $20 bill,$200 Monopoly props.,"His portrait is on $10,000 bills.","$ star, 1971"
%,100% convinced,Article of clothing than an estimated 80% wear...,Say '#@%!',Like about 25% of legal U.S. immigrants,20% of diez
&,"B. & O., Santa Fe, etc.","Some shoe purchases from the ""Big & Tall"" store",___ Jay Lerner of Lerner & Loewe,"Piece of wood that's ""better than bad, it's go...",Alternative to rap and R&B
',___ d'oeuvres,Don't overdo it,TV Hulk actor's first name,"*""Midnight's Children"" author",Didn't pay yet
(,Two-__ (kind of car),Mai ___ (cocktail),Out of ___ (uncoordinated),___-mell (helter-skelter),401(k) supplement
),Ich ___ dich (German term of endearment),__ about (circa),He's ___ nowhere man (Beatles),"'Til ___ (""The Producers"" song)",Burkina ___ (African land)
*,*___ soup,*Not sportsmanlike,M*A*S*H drink,W*D,*Abraham Lincoln


In [10]:
# Counted on the raw clues (original_df), as this section's heading states; after a
# fresh run `df` holds the cleaned data. regex=False makes this a literal match.
original_df[original_df['clue'].str.contains('Nobel', regex=False)].shape

(3823, 2)

In [11]:
# Raw clues (original_df) containing the literal text 'actor'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('actor', regex=False)].shape

(9693, 2)

In [12]:
# Raw clues (original_df) containing the literal text 'slang'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('slang', regex=False)].shape

(7147, 2)

In [14]:
# Raw clues (original_df) containing a literal '%'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('%', regex=False)].shape

(1242, 2)

In [15]:
# Raw clues (original_df) containing the literal text 'Across'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('Across', regex=False)].shape

(27728, 2)

In [16]:
# Raw clues (original_df) with a cross-reference such as '12-Down'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('-Down', regex=False)].shape

(11317, 2)

In [17]:
# Raw clues (original_df) with a cross-reference such as '12-Across'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('-Across', regex=False)].shape

(19302, 2)

In [18]:
# Raw clues (original_df) containing a literal '('; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('(',regex=False)].shape

(111995, 2)

In [19]:
# Raw clues (original_df) containing the literal text 'Abbr.'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('Abbr.',regex=False)].shape

(49424, 2)

In [20]:
# Raw clues (original_df) containing a literal ';'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains(';',regex=False)].shape

(1940, 2)

In [21]:
# Raw clues (original_df) containing a literal '@'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('@',regex=False)].shape

(174, 2)

In [22]:
# Raw clues (original_df) containing a literal '['; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('[',regex=False)].shape

(2063, 2)

In [23]:
# Raw clues (original_df) containing a literal '^'; `df` is the cleaned data.
original_df[original_df['clue'].str.contains('^',regex=False)].shape

(285, 2)

In [24]:
# Raw clues (original_df) containing a literal '_' (blank marker); `df` is the cleaned data.
original_df[original_df['clue'].str.contains('_',regex=False)].shape

(382745, 2)

# Cleaning / Handling Strategies
#### Brainstorming ideas for how to handle the variety of different questions and also clean strings so they are readable by the algorithm. Bullet points are separate ideas that might be used individually or in conjunction with some/all of the others



Capital letters not at the start of sentence:
* flag as involving a noun
* flag as requiring a noun
* convert into ____
* use in generating a topic

Numbers:
* remove
* flag as involving number

Single characters:
* remove

Containing "-Across" or "-Down"
* remove from dataset (only of use in a later iteration of the model that solves an entire puzzle rather than single questions)

!
* Remove
* If at the end of a sentence, separate it with a space and use it as a word; otherwise delete it

"
* Remove " from string
* Remove text outside of " " 
* Remove text inside of " " but use it for generating a topic as a new value

\#
* Remove

$
* Replace with " money " or something similar
* Remove

%
* Remove

&
* Remove

'
* Remove
* If followed by an 's' remove that too

( )
* Remove

\*

* Remove
* An asterisk at the start of a clue might be useful for a later iteration of the model that solves an entire puzzle rather than a single question

\+

* Remove
* Separate into multiple questions using + as the separator and create a new clue composed of the solutions of each segment

,

* Remove

\-
* Remove

.	
* Remove

/
* Remove

:
* Remove
* Use text before to generate topic and text after as clue

;
* Remove

<>
* Remove

=
* Remove
* Use text after to generate topic and text before as clue

?
* Remove

@
* Remove

[]
* Remove

\
* Remove

^
* Remove
* Replace with "squared" or something similar

_
* Flag for requiring a blank within sentence
* Remove

`
* Remove
* If double, treat as "

{}
* Remove

|
* Remove

~
* Remove
