In [2]:
import pandas as pd
import numpy as np


In [3]:
# Load the cleaned clue/answer table used throughout the notebook.
df = pd.read_csv(filepath_or_buffer='data/clean_data.csv')

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,answer,clue,noun_required,fill_blank
0,aba,litigator group,False,False
1,actor,thespian,False,False
2,ades,summer coolers,False,False
3,ads,newspaper revenue,False,False
4,afro,head of hair,False,False


# Proportions of unique clues and answers

In [5]:
# Count distinct values once so the ratio below reuses the same numbers.
n_unique_clues = df['clue'].nunique()
n_unique_answers = df['answer'].nunique()

proportions = {
    'clues': n_unique_clues,
    'answers': n_unique_answers,
    'clues per answer': n_unique_clues / n_unique_answers,
    # Count distinct (answer, clue) rows directly. Concatenating the two
    # strings without a separator can merge different pairs into one
    # (e.g. 'a' + 'bc' and 'ab' + 'c'). Rows with a missing answer or clue
    # are dropped, matching how nunique() ignores NaN.
    'pairings': df[['answer', 'clue']].dropna().drop_duplicates().shape[0],
}



In [9]:
# Text summary of the counts gathered in `proportions`.
print(f"""
      Unique Clues: {proportions['clues']} 
      Unique Answers: {proportions['answers']}
      Unique Answer + Clue Pairings: {proportions['pairings']}
      
      Clues per Answer: {proportions['clues per answer']}
      """)

import plotly.graph_objects as go

# Bar label -> key in `proportions`, in the order the bars are drawn.
bar_spec = {
    'Unique Clue + Answer pairings': 'pairings',
    'Unique Clues': 'clues',
    'Unique Answers': 'answers',
}
labels = list(bar_spec)
values = [proportions[key] for key in bar_spec.values()]

# One bar per count, with the count itself printed on the bar.
bar = go.Bar(x=labels, y=values, text=values, textposition='auto')
fig = go.Figure(data=[bar])

fig.show()



      Unique Clues: 2579975 
      Unique Answers: 315339
      Unique Answer + Clue Pairings: 3103324
      
      Clues per Answer: 8.181591874141796
      


In [15]:
# How many times each answer appears, most frequent first.
df.loc[:, 'answer'].value_counts()

one              1622
era              1586
aria             1519
ore              1435
erie             1426
                 ... 
eggsinone           1
electroplater       1
elegits             1
elire               1
zoomrooms           1
Name: answer, Length: 315339, dtype: int64

In [18]:
# Number of answers that occur more than 1000 times (True) vs. not (False).
df['answer'].value_counts().gt(1000).value_counts()

False    315303
True         36
Name: answer, dtype: int64

In [22]:
# Number of answers that occur fewer than 10 times (True) vs. not (False).
df['answer'].value_counts().lt(10).value_counts()

True     267072
False     48267
Name: answer, dtype: int64

In [23]:
# Number of answers that occur exactly once (True) vs. more than once (False).
df['answer'].value_counts().eq(1).value_counts()

True     165635
False    149704
Name: answer, dtype: int64

In [9]:

def sample_strings_containing(containing:list,  data:pd.DataFrame, column:str, n_samples:int=1) -> dict:
    """Return random samples of entries in a column that contain each key word.

    For every key in ``containing``, the rows of ``data`` whose ``column`` value
    contains the key (literal substring match, not a regex) are collected and
    up to ``n_samples`` of them are drawn at random without replacement.

    Args:
        containing (list): Key words to look for. Any iterable of strings works;
            iterating a plain string yields its individual characters.
        data (pd.DataFrame): Dataframe to search.
        column (str): Column to search within the dataframe.
        n_samples (int, optional): Number of samples to return for each key.
            Defaults to 1. If fewer rows match, all matching rows are returned.

    Returns:
        dict: Maps each key that had at least one match to a list of sampled
        column values. Keys with no matches are omitted.

    Note:
        Sampling uses NumPy's global random state; call ``np.random.seed`` first
        for reproducible output.
    """
    samples = {}
    for key in containing:
        # Search the dataframe that was passed in (not a notebook-level global);
        # na=False treats missing values as non-matches so the mask is boolean.
        mask = data[column].str.contains(key, regex=False, na=False)
        matches = data.loc[mask, column]
        if matches.empty:
            continue
        # Cap the request so sampling without replacement cannot fail.
        n_draw = min(n_samples, len(matches))
        positions = np.random.choice(len(matches), size=n_draw, replace=False)
        samples[key] = matches.iloc[positions].tolist()
    return samples

In [10]:
from string import punctuation

# Draw five example clues for every punctuation character that occurs in a clue.
punctuation_samples = sample_strings_containing(
    containing=punctuation,
    data=df,
    column='clue',
    n_samples=5,
)

In [16]:
# One row per punctuation character, one column per sampled clue.
pd.DataFrame(punctuation_samples).transpose()

Unnamed: 0,0,1,2,3,4
!,"Older cousin of ""Cool!""",Heavy metal band whose name is a euphemism for...,Oklahoma! gal,Look ___!,___ only money!
"""","Hathaway of ""Get Smart""","Texter's ""Yikes!""","Hero of kid-lit's ""The Phantom Tollbooth""","Stalin called it ""the language of spies""","What Steely Dan will do to ""The Years"""
#,Royals #5,"#3: central Asia, red and yellow","AFI's #79 (with ""The"") (1978)","___ Ko, youngest #1-ranked golfer ever",Element #99
$,"Potent Potables for $200, __","$1000, slangily","Saying ""&%$#@!!""","He's on a $10,000 bill",1867 purchase for a little over $7 million
%,90% of the game is half mental speaker,Are you getting 100%? cereal,Belief of roughly 25% of the world's population,You could get one if you're over .08%,100%
&,When I Was ___: G. & S.,___ Rock (Simon & Garfunkel hit),Electric & Musical Industries Ltd. for short,B&Bs,___ & Wesson has agreed to make products child...
',Klutz's outcry,World's fair,Coty's capital.,(You're) Having My Baby singer,Sterne's hero.
(,Posed (for),___ Speed Wagon (vintage auto),In ___ quo (as is),___ mecum (manual),League with Bulls and Timberwolves (abbr.)
),___-l'oeil (illusion),Write-___ (some vote-getters),Go (through),Chooses (to),Isn't ___ bit like you and me? (Beatles lyric)
*,*,"Alda of ""M*A*S*H""",*Deep-sea diver's concern,*Brunch options,*Whopper alternative (1969-1986)


In [10]:
# Shape of the subset of rows whose clue contains 'Nobel'.
df.loc[df['clue'].str.contains('Nobel')].shape

(3823, 2)

In [11]:
# Shape of the subset of rows whose clue contains 'actor'.
df.loc[df['clue'].str.contains('actor')].shape

(9693, 2)

In [12]:
# Shape of the subset of rows whose clue contains 'slang'.
df.loc[df['clue'].str.contains('slang')].shape

(7147, 2)

In [14]:
# Shape of the subset of rows whose clue contains '%'.
df.loc[df['clue'].str.contains('%')].shape

(1242, 2)

In [15]:
# Shape of the subset of rows whose clue contains 'Across'.
df.loc[df['clue'].str.contains('Across')].shape

(27728, 2)

In [16]:
# Shape of the subset of rows whose clue contains '-Down'.
df.loc[df['clue'].str.contains('-Down')].shape

(11317, 2)

In [17]:
# Shape of the subset of rows whose clue contains '-Across'.
df.loc[df['clue'].str.contains('-Across')].shape

(19302, 2)

In [18]:
# Shape of the subset of rows whose clue contains a literal '('.
df.loc[df['clue'].str.contains('(', regex=False)].shape

(111995, 2)

In [19]:
# Shape of the subset of rows whose clue contains the literal text 'Abbr.'.
df.loc[df['clue'].str.contains('Abbr.', regex=False)].shape

(49424, 2)

In [20]:
# Shape of the subset of rows whose clue contains a literal ';'.
df.loc[df['clue'].str.contains(';', regex=False)].shape

(1940, 2)

In [21]:
# Shape of the subset of rows whose clue contains a literal '@'.
df.loc[df['clue'].str.contains('@', regex=False)].shape

(174, 2)

In [22]:
# Shape of the subset of rows whose clue contains a literal '['.
df.loc[df['clue'].str.contains('[', regex=False)].shape

(2063, 2)

In [23]:
# Shape of the subset of rows whose clue contains a literal '^'.
df.loc[df['clue'].str.contains('^', regex=False)].shape

(285, 2)

In [24]:
# Shape of the subset of rows whose clue contains a literal '_'.
df.loc[df['clue'].str.contains('_', regex=False)].shape

(382745, 2)

# Cleaning / Handling Strategies
#### Brainstorming ideas for how to handle the variety of different questions and how to clean the strings so they are readable by the algorithm. Bullet points are separate ideas that might be used individually or in conjunction with some/all of the others.



Capital letters not at the start of sentence:
* flag as involving a noun
* flag as requiring a noun
* convert into ____
* use in generating a topic

Numbers:
* remove
* flag as involving number

Single characters:
* remove

Containing "-Across" or "-Down"
* Remove from dataset (only of use in a later iteration of the model that solves an entire puzzle rather than single questions)

!
* Remove
* If at the end of a sentence, separate it with a space and use it as a word; otherwise delete it

"
* Remove " from string
* Remove text outside of " " 
* Remove text inside of " " but use it for generating a topic as a new value

\#
* Remove

$
* Replace with " money " or something similar
* Remove

%
* Remove

&
* Remove

'
* Remove
* If followed by an 's' remove that too

( )
* Remove

\*

* Remove
* An asterisk at the start of a clue might be useful for a later iteration of the model that solves an entire puzzle rather than a single question

\+

* Remove
* Separate into multiple questions using + as the separator and create a new clue composed of the solutions to each segment

,

* Remove

\-
* Remove

.	
* Remove

/
* Remove

:
* Remove
* Use text before to generate topic and text after as clue

;
* Remove

<>
* Remove

=
* Remove
* Use text after to generate topic and text before as clue

?
* Remove

@
* Remove

[]
* Remove

\
* Remove

^
* Remove
* Replace with "squared" or something similar

_
* Flag for requiring a blank within sentence
* Remove

`
* Remove
* If double, treat as "

{}
* Remove

|
* Remove

~
* Remove
