In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objects as go

GREY = '#696969'
GREEN = '#8CD186'

df = pd.read_csv('data/clean_data.csv',keep_default_na=False)

In [125]:
df.head(5)

Unnamed: 0,answer,clue,noun_required,fill_blank
0,aba,litigator group,False,False
1,actor,thespian,False,False
2,ades,summer coolers,False,False
3,ads,newspaper revenue,False,False
4,afro,head of hair,False,False


# Number of unique clues and answers

In [5]:
proportions = { 'clues':df['clue'].nunique(), 
                'answers': df['answer'].nunique(),
                'clues per answer':df['clue'].nunique() / df['answer'].nunique(),
                'pairings':(df['answer'] + df['clue']).nunique()
              }

print(f"""
      Unique Clues:                       {proportions['clues']} 
      Unique Answers:                     {proportions['answers']}
      Unique Answer + Clue Pairings:      {proportions['pairings']}
      
      Clues per Answer: {proportions['clues per answer']}
      """)





      Unique Clues:                       2579749 
      Unique Answers:                     315116
      Unique Answer + Clue Pairings:      3103325
      
      Clues per Answer: 8.186664593356097
      


In [126]:
labels = [  'Unique Pairings',
            'Unique Clues',
            'Unique Answers'
            ]
values = [  proportions['pairings'],
            proportions['clues'],
            proportions['answers']
            ]

fig = go.Figure()
trace = go.Bar( x=labels, 
                y=values,
                text=values,
                textposition='auto',
                texttemplate='%{text:,}', 
                marker_color=GREEN, 
                )

fig.add_trace(trace)

fig.update_layout(width=600, 
                  height=600,
                  plot_bgcolor='white', 
                  title = 'Number of unique clues and answers in cleaned dataset', 
                  font_color = GREY
                  )

fig.update_yaxes(showline=True, linewidth=2, linecolor=GREY, gridcolor=GREY)
fig.update_xaxes(showline=True, linewidth=2, linecolor=GREY, showgrid=False)
fig.show()

# Frequency of repeated answers

In [168]:
frequencies = { '100+ occurances':    (df['answer'].value_counts() > 100).sum(), 
                '11-100 occurances':  ((df['answer'].value_counts() < 100) & (df['answer'].value_counts() > 10)).sum(),
                '2-10 occurances':     ((df['answer'].value_counts() <= 10) & (df['answer'].value_counts() > 1)).sum(),
                '1 occurance':        (df['answer'].value_counts() == 1).sum()
              }
print(f"""
      Answers with more than 100 occurances:        {frequencies['100+ occurances']}
      Answers with between 11 and 99 occurances:    {frequencies['11-100 occurances']}
      Answers with between 2 and 10 occurances:     {frequencies['2-10 occurances']}
      Answers with only one occurance:              {frequencies['1 occurance']}
      """)




      Answers with more than 100 occurances:        5991
      Answers with between 11 and 99 occurances:    38857
      Answers with between 2 and 10 occurances:     104758
      Answers with only one occurance:              165444
      


In [273]:
labels = list(frequencies.keys())
values = list(frequencies.values())
primary_color = np.array(plotly.colors.hex_to_rgb(GREEN))
colors = ['rgb'+str(tuple(primary_color*1.2)),
          'rgb'+str(tuple(primary_color*1.1)),
          'rgb'+str(tuple(primary_color*1.0)),
          'rgb'+str(tuple(primary_color*0.9))]

fig = go.Figure()
trace = go.Pie( labels=labels, 
                values=values,
                text=values,
                textinfo='label+percent',
                textposition='outside',
                marker_colors=colors,
                hole=0.6 
                )

fig.add_trace(trace)

fig.update_layout(width=600, 
                  height=600,
                  plot_bgcolor='white', 
                  title = 'Frequency of repeated answers', 
                  font_color = GREY
                  )

fig.update_yaxes(showline=True, linewidth=2, linecolor=GREY, gridcolor=GREY)
fig.update_xaxes(showline=True, linewidth=2, linecolor=GREY, showgrid=False)
fig.show()


In [147]:
np.array(plotly.colors.hex_to_rgb(GREEN))


140

# Samples of punctuation occuring within clues (Using uncleaned data)

In [179]:

def sample_strings_containing(containing:list,  data:pd.DataFrame, column:str, n_samples:int=1) -> dict:
    """Searches a column within a dataframe and returns random smaples of entries containing the key words
    in the given list.

    Args:
        containing (list): List of key words to look for
        data (pd.DataFrame): Dataframe to search
        column (str): column to serch within datdrame
        n_samples (int, optional): number of samples to return for each key. Defaults to 1.

    """
    samples = {}
    for x in containing:
        matches = data[data[column].str.contains(x, regex=False)]
        if matches.shape[0] > 0:  
            samples[x] = []
            for r in np.random.choice(range(matches.shape[0]),n_samples, replace=False):         
                r = np.random.randint(0, matches.shape[0])
                samples[x].append(matches.iloc[r][column])
    return samples

In [180]:
from string import punctuation
original_df = pd.read_table('data/clues.tsv')
original_df = original_df[['clue','answer']]
original_df.dropna(inplace=True)



In [181]:
punctuation_samples = sample_strings_containing(list(punctuation),original_df,'clue',n_samples=5)
pd.DataFrame(punctuation_samples).T

Unnamed: 0,0,1,2,3,4
!,Whoa!,Oklahoma! aunt,Can't help ya!,That's so cool!,___ Campesinos!
"""","Politico who wrote ""The Truth (With Jokes)""","Marvin of ""Cat Ballou""","Former ""Entertainment Tonight"" cohost","San Francisco's public transit system, with ""the""","Beethoven's ""Für __"""
#,#1 hit by a 30-Across member,His #2 was retired by the Dodgers,Agcy. that's got your #,"Singer of the 2016 #1 hit ""Cheap Thrills""",World's #1 food company
$,"I'll take Presidents for $200, __","@#$%!, in comics",Vega$ gumshoe,Arli$$ network,$200 Monopoly properties: Abbr.
%,10%ers,10% of MMDX,20% of diez,Land that's more than 90% desert,100%
&,S&L protector,Org. with a Work & Retirement section on its w...,S&L offering,"Hatcher of ""Lois & Clark""","Hatcher of ""Lois & Clark"""
',Exercised one's discretion,Female fowl that's less showy than the male,"Reply to 'No, that's not!'",Something's fishy,"With 62-Across, Whoopi's ""Ghost"" role"
(,__ perpetua (Idaho's motto),Duchess of ___ (Goya subject),Say It ___ So (1983 Hall & Oates hit),___ to go (very eager),Play (with)
),"Not far (with ""a"")",Al ____ (airline),"Step 2: With 43- and 55-Across, do this in the...",___ cake (Chinese New Year delicacy),Hit ___ spot (rankle)
*,M*A*S*H regular,*Auto accessory,"*Tries to start a scandal, say",*Stay in power,*Unwelcome sci-fi visitor


# Frequency of Punctuation

In [225]:
def frequency_of_characters(characters:list,data:pd.DataFrame)->dict:
    frequencies = {}
    for character in characters:
        frequencies[character] = data[data['clue'].str.contains(character,regex=False)].shape[0] / data.shape[0]
    return frequencies

In [226]:
punc_freq = frequency_of_characters(punctuation,original_df)

In [271]:
labels = list(punc_freq.keys())
values = list(punc_freq.values())
# fig = go.Figure(data=[go.Bar(
#                     x=labels, 
#                     y=values,
#                     text=values,
#                     textposition='auto',
#                             )
#                      ]
#                )y
# fig.update_layout(title_text='Percentages of samples containing each punctuation character')
# fig.show()

fig = go.Figure()
trace = go.Bar( x=labels, 
                y=values,
                text=values,
                textposition='auto',
                texttemplate='%{text:1.2%}', 
                marker_color=GREEN,
                textfont_size=10,
                textangle=0
                )

fig.add_trace(trace)

fig.update_layout(width=1600, 
                  height=600,
                  bargap=0.1,
                  plot_bgcolor='white', 
                  title = 'Percentage of clues that contain each form of punctuation (before cleaning)', 
                  font_color = GREY,
                  # margin=dict(l=10, r=10, t=10, b=10)
                  )

fig.update_yaxes(showticklabels=False,showline=False, linewidth=2, linecolor=GREY, gridcolor=GREY,dtick = 0.01)
fig.update_xaxes(showline=True, linewidth=2, linecolor=GREY, showgrid=False)
fig.show()

In [3]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [61]:
glove_twitter_25_model = gensim.downloader.load('glove-twitter-25')
glove_twitter_25_vocab = glove_twitter_25_model.key_to_index.keys()
df['answer'].isin(glove_twitter_25_vocab).value_counts(normalize=True)

True     0.783903
False    0.216097
Name: answer, dtype: float64

In [28]:
conceptnet_model = gensim.downloader.load('conceptnet-numberbatch-17-06-300')
conceptnet_vocab = [x[6:] for x in conceptnet_model.key_to_index.keys()]
df['answer'].isin(conceptnet_vocab).value_counts(normalize=True)

True     0.79271
False    0.20729
Name: answer, dtype: float64

In [4]:
google_news_model = gensim.downloader.load('word2vec-google-news-300')
google_news_vocab = google_news_model.key_to_index.keys()
df['answer'].isin(google_news_vocab).value_counts(normalize=True)



True     0.709434
False    0.290566
Name: answer, dtype: float64

In [4]:
glove_twitter_25_model = gensim.downloader.load('glove-wiki-gigaword-300')
glove_twitter_25_vocab = glove_twitter_25_model.key_to_index.keys()
df['answer'].isin(glove_twitter_25_vocab).value_counts(normalize=True)



True     0.791816
False    0.208184
Name: answer, dtype: float64

In [11]:
labels = [  'conceptnet-numberbatch-17-06-300',
            'glove-wiki-gigaword-300',
            'glove-twitter-25',
            'word2vec-google-news-300', 
            ]
values = [  0.79271, 
            0.791816,
            0.783903, 
            0.709434
            ]

fig = go.Figure()
trace = go.Bar( x=labels, 
                y=values,
                text=values,
                textposition='auto',
                texttemplate='%{text:0.2%}', 
                marker_color=GREEN, 
                )

fig.add_trace(trace)

fig.update_layout(width=900, 
                  height=600,
                  plot_bgcolor='white', 
                  title = 'Percentage of answers contained in vocabularies of pre-trained word2vec models', 
                  font_color = GREY
                  )

fig.update_yaxes(showline=True, linewidth=2, linecolor=GREY, gridcolor=GREY)
fig.update_xaxes(showline=True, linewidth=2, linecolor=GREY, showgrid=False)
fig.show()

# Cleaning / Handling Strategies
#### Brainstorming ideas for how to handle the variety of different questions and also clean strings to be readable by algorithm. Bullet points are seperate ideas that might be used individually or in conjunction with some/all of the others



Capital letters not at the start of sentence:
* flag as involving a noun
* flag as requiring a noun
* convert into ____
* use in generating a topic

Numbers:
* remove
* flag as involving number

Single characters:
* remove

Containing "-Across" or "-Down"
* remove from dataset (only of use in a later iteration of model that solve entire puzzle rather than single questions)

!
* Remove
* If at end of sentance, seperate with space and use it as a word otherwise delete

"
* Remove " from string
* Remove text outside of " " 
* Remove text inside of " " but use it for generating a topic as a new value

\#
* Remove

$
* Replace with " money " or something similar
* Remove

%
* Remove

&
* Remove

'
* Remove
* If followed by an 's' remove that too

( )
* Remove

\*

* Remove
* an asterisk at the start of clue might be useful for a later itertion of model that solves entire puzzle rather than a single question

\+

* Remove
* Seperate into multiple questions using + as seperator and create a new clue composed of the solutions of each segment

,

* Remove

\-
* Remove

.	
* Remove

/
* Remove

:
* Remove
* Use text before to generate topic and text after as clue

;
* Remove

<>
* Remove

=
* Remove
* Use text after to generate topic and text before as clue

?
* Remove

@
* Remove

[]
* Remove

\
* Remove

^
* Remove
* Replace with "squared" or something similar

_
* Flag for requiring a blank within sentence
* Remove

`
* Remove
* If double, treat as "

{}
* Remove

|
* Remove

~
* Remove
