In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go


In [4]:
df = pd.read_csv('data/clean_data.csv',keep_default_na=False)

In [3]:
df.head(5)

Unnamed: 0,answer,clue,noun_required,fill_blank
0,aba,litigator group,False,False
1,actor,thespian,False,False
2,ades,summer coolers,False,False
3,ads,newspaper revenue,False,False
4,afro,head of hair,False,False


# Proportions of unique clues and answers

In [4]:
proportions = { 'clues':df['clue'].nunique(), 
                'answers': df['answer'].nunique(),
                'clues per answer':df['clue'].nunique() / df['answer'].nunique(),
                'pairings':(df['answer'] + df['clue']).nunique()
              }

print(f"""
      Unique Clues:                       {proportions['clues']} 
      Unique Answers:                     {proportions['answers']}
      Unique Answer + Clue Pairings:      {proportions['pairings']}
      
      Clues per Answer: {proportions['clues per answer']}
      """)





      Unique Clues:                       2579749 
      Unique Answers:                     315116
      Unique Answer + Clue Pairings:      3103325
      
      Clues per Answer: 8.186664593356097
      


In [5]:
labels = [  'Unique Clue + Answer pairings',
            'Unique Clues',
            'Unique Answers'
            ]
values = [  proportions['pairings'],
            proportions['clues'],
            proportions['answers']
            ]

fig = go.Figure(data=[go.Bar(
            x=labels, 
            y=values,
            text=values,
            textposition='auto',
        )])

fig.show()

# Frequency of repeated answers

In [6]:
frequencies = { '100+':    (df['answer'].value_counts() > 100).sum(), 
                '11-100':  ((df['answer'].value_counts() < 100) & (df['answer'].value_counts() > 10)).sum(),
                '2-10':     ((df['answer'].value_counts() <= 10) & (df['answer'].value_counts() > 1)).sum(),
                '1':        (df['answer'].value_counts() == 1).sum()
              }
print(f"""
      Answers with more than 100 occurances:        {frequencies['100+']}
      Answers with between 11 and 99 occurances:    {frequencies['11-100']}
      Answers with between 2 and 10 occurances:     {frequencies['2-10']}
      Answers with only one occurance:              {frequencies['1']}
      """)




      Answers with more than 100 occurances:        5991
      Answers with between 11 and 99 occurances:    38857
      Answers with between 2 and 10 occurances:     104758
      Answers with only one occurance:              165444
      


In [7]:
labels = list(frequencies.keys())
values = list(frequencies.values())

fig = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='label+percent',
                             insidetextorientation='radial'
                            )])
fig.show()

# Samples of punctuation occuring within clues (Using uncleaned data)

In [8]:

def sample_strings_containing(containing:list,  data:pd.DataFrame, column:str, n_samples:int=1) -> dict:
    """Searches a column within a dataframe and returns random smaples of entries containing the key words
    in the given list.

    Args:
        containing (list): List of key words to look for
        data (pd.DataFrame): Dataframe to search
        column (str): column to serch within datdrame
        n_samples (int, optional): number of samples to return for each key. Defaults to 1.

    """
    samples = {}
    for x in containing:
        matches = data[data[column].str.contains(x, regex=False)]
        if matches.shape[0] > 0:  
            samples[x] = []
            for r in np.random.choice(range(matches.shape[0]),n_samples, replace=False):         
                r = np.random.randint(0, matches.shape[0])
                samples[x].append(matches.iloc[r][column])
    return samples

In [9]:
from string import punctuation
original_df = pd.read_table('data/clues.tsv')
original_df = original_df[['clue','answer']]
original_df.dropna(inplace=True)

punctuation_samples = sample_strings_containing(list(punctuation),original_df,'clue',n_samples=5)
pd.DataFrame(punctuation_samples).T


Unnamed: 0,0,1,2,3,4
!,I __ my wit's end!,"Cartoon character who says ""Come over here, yo...",I completed the crossword!,More!,Don't worry about me!
"""","Alt-rock band that released ""Hysteria (I Want...",Reporter who made a popular 2006 documentary o...,"Sairey of ""Martin Chuzzlewit.""","Hardy's ""Pure Woman""","Linkin Park ""In the ___"""
#,#5: Manicurist?,#1 song,Radiohead album that was #428 on Rolling Stone...,Physicist Enrico after whom element #100 is named,"Classic ""shadow government"" #2"
$,Place that serves $28 pints of ale?,Chance card in Monopoly with a $15 fee,"Portrait on a $10,000 bill",Clear $,"Old Italian coin, the sequin ($2.25)."
%,Less than 1%,25% of Off! Deep Woods,"%$#* and ""@#&!""",80% of Earth's atmosphere,It's usually 15%
&,___ & Chandon champagne,R&B singer Janelle,Kenan & ___ (Nickelodeon show),The N of B&N,Henry & June role
',Goneril's father.,"Shelley's ""To a Skylark,"" e.g.",What the sky might do in an inebriate's dream?,Alpha's opposite,Reveals one's feelings
(,Thank-you-___ (bump),Banana ___ (desert),It may be tipped (h),The ___ Foxes (Lillian Hellman play),Ki ___ (Korea's legendary founder)
),Magical being (Var.),"ST: ___ (1990s sci-fi series, to fans)",___ She Lovely (Stevie Wonder tune),In ___ (working in harmony),"___ Diemen's Land (Tasmania, once)"
*,M*A*S*H extra,*'Vette option,M*A*S*H director,*Reason for rhinoplasty,"*Lateral epicondylitis, to a layperson"


# Frequency of Punctuation

In [17]:
def frequency_of_characters(characters:list,data:pd.DataFrame)->dict:
    frequencies = {}
    for character in characters:
        frequencies[character] = round((data[data['clue'].str.contains(character,regex=False)].shape[0] / data.shape[0] * 100 ),2 )
    return frequencies

In [18]:
punc_freq = frequency_of_characters(punctuation,original_df)

In [25]:
labels = list(punc_freq.keys())
values = list(punc_freq.values())
fig = go.Figure(data=[go.Bar(
                    x=labels, 
                    y=values,
                    text=values,
                    textposition='auto',
                            )
                     ]
               )
fig.update_layout(title_text='Percentages of samples containing each punctuation character')
fig.show()

In [2]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [61]:
glove_twitter_25_model = gensim.downloader.load('glove-twitter-25')
glove_twitter_25_vocab = glove_twitter_25_model.key_to_index.keys()
df['answer'].isin(glove_twitter_25_vocab).value_counts(normalize=True)

True     0.783903
False    0.216097
Name: answer, dtype: float64

In [3]:
conceptnet_model = gensim.downloader.load('conceptnet-numberbatch-17-06-300')
conceptnet_vocab = conceptnet_model.key_to_index.keys()
df['answer'].isin(conceptnet_vocab).value_counts(normalize=True)

NameError: name 'df' is not defined

In [None]:
google_news_model = gensim.downloader.load('word2vec-google-news-300')
google_news_vocab = google_news_model.key_to_index.keys()
df['answer'].isin(google_news_vocab).value_counts(normalize=True)

# Cleaning / Handling Strategies
#### Brainstorming ideas for how to handle the variety of different questions and also clean strings to be readable by algorithm. Bullet points are seperate ideas that might be used individually or in conjunction with some/all of the others



Capital letters not at the start of sentence:
* flag as involving a noun
* flag as requiring a noun
* convert into ____
* use in generating a topic

Numbers:
* remove
* flag as involving number

Single characters:
* remove

Containing "-Across" or "-Down"
* remove from dataset (only of use in a later iteration of model that solve entire puzzle rather than single questions)

!
* Remove
* If at end of sentance, seperate with space and use it as a word otherwise delete

"
* Remove " from string
* Remove text outside of " " 
* Remove text inside of " " but use it for generating a topic as a new value

\#
* Remove

$
* Replace with " money " or something similar
* Remove

%
* Remove

&
* Remove

'
* Remove
* If followed by an 's' remove that too

( )
* Remove

\*

* Remove
* an asterisk at the start of clue might be useful for a later itertion of model that solves entire puzzle rather than a single question

\+

* Remove
* Seperate into multiple questions using + as seperator and create a new clue composed of the solutions of each segment

,

* Remove

\-
* Remove

.	
* Remove

/
* Remove

:
* Remove
* Use text before to generate topic and text after as clue

;
* Remove

<>
* Remove

=
* Remove
* Use text after to generate topic and text before as clue

?
* Remove

@
* Remove

[]
* Remove

\
* Remove

^
* Remove
* Replace with "squared" or something similar

_
* Flag for requiring a blank within sentence
* Remove

`
* Remove
* If double, treat as "

{}
* Remove

|
* Remove

~
* Remove
