In [6]:
import pandas as pd
import numpy as np
import gensim.downloader
import custom_transformers
import gensim.downloader
import os.path
import swifter


from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import resample

DATA_FOLDER = 'data/'
ORIGIN = 'origin_data.tsv'
CLEAN = 'clean_data.csv'
RESAMPLED ='resampled_data.csv'

SAMPLE_SIZE = 100000
RANDOM_STATE = 357394

### Clean Original Data 
 - drop nulls
 - clean strings
 - remove stop words where possible
 - eliminate duplicate clue and answer pairings
 - create new column of answer length / characters as would be available in the context of a crossword puzzle

In [41]:
if os.path.isfile(DATA_FOLDER+CLEAN):
    df = pd.read_csv(DATA_FOLDER+CLEAN)
else:
    #Select only relevant columns.
    df = pd.read_table(DATA_FOLDER+ORIGIN)[['answer','clue']]

    #Clean Strings
    df = custom_transformers.CleanStrings(df)
    df['answer'] = df['answer'].str.lower().str.strip()

    #Drop any rows with null values.
    df.dropna(inplace=True)

    # Drop repetitions of answer / clue pairings
    df.drop_duplicates(['answer','clue'],keep='first', inplace=True)
    df.reset_index(drop=True, inplace=True) 

    #Create length of answer, as would be available in the context of a crossword puzzle.
    df['answer_characters'] = '_'
    df['answer_characters'] = df['answer_characters'].str.repeat(df['answer'].str.len().astype(int))
    
    #Save to drive
    df.to_csv(DATA_FOLDER+CLEAN, index=False)

# Create smaller sam    
if os.path.isfile(DATA_FOLDER+RESAMPLED):
    sampled_df = pd.read_csv(DATA_FOLDER+RESAMPLED)
else:
    sampled_df = resample(df,n_samples=SAMPLE_SIZE, random_state=RANDOM_STATE, replace=False)
    sampled_df.to_csv(DATA_FOLDER+RESAMPLED, index=False)
    


### Generate Cosign similarities between clue and answer (to be used as target in machine learning)

In [45]:
def generate_cosign_similarity(data, model_dict):
    data = data.copy()
    for model_name, model in model_dict.items():
        # Vocabulary List of words in model
        vocab = model.index_to_key
        # Value to indicate if answer is contained in model vocabulary
        data[model_name] = False
        data.loc[data['answer'].isin(vocab), model_name] = True 
        
        # Series of string lists generated from clues, filtering out words not in vocabulary
        clue_words = data['clue'].astype(str).swifter.apply(lambda clue: [x for x in clue.split() if x in vocab])
        
        # Filter out entries not captured by vocabulary
        filter = (data[model_name] == True) & (clue_words.str.len() > 0 )
        clue_words = clue_words[filter]
        answer_vectors = model[data.loc[filter, 'answer']]
        
        # Assign cosgin similarity
        clue_vectors = [np.mean(model[x],axis=0) for x in clue_words]
        cos_sim = np.diagonal(cosine_similarity(answer_vectors, clue_vectors))
        data.loc[filter, model_name+'_cosine_similarity'] = cos_sim
    
    return data


In [None]:
# Load Gensim Models
twitter = gensim.downloader.load('glove-twitter-200')
google = gensim.downloader.load('word2vec-google-news-300')
wiki = gensim.downloader.load('glove-wiki-gigaword-300')
model_dict = {'twitter':twitter,
              'google':google,
              'wiki':wiki}

In [47]:
if not set(model_dict.keys()).issubset(set(sampled_df.columns)):
    sampled_df = generate_cosign_similarity(sampled_df,model_dict=model_dict)
    sampled_df.to_csv(DATA_FOLDER+RESAMPLED, index=False)

Pandas Apply: 100%|██████████| 100000/100000 [07:59<00:00, 208.42it/s]


Error: Canceled future for execute_request message before replies were done

### Assign model vocabulary booleans and save

In [29]:
def is_in_pretrained_word2vec_model(data:pd.DataFrame, model_name:str)->pd.DataFrame:
    trained_model = gensim.downloader.load(model_name)
    vocab = [x.split('/')[-1] for x in trained_model.index_to_key]
    data[model_name] = data['answer'].isin(vocab)
    return data


In [22]:
#glove-twitter-200
df = pd.read_csv(DATA_FOLDER+CLEAN)
df.to_csv(DATA_FOLDER+CLEAN, index=False)

### Split data into train and test sets

In [None]:
train, test = train_test_split(df, test_size=0.3, random_state=42)

df.to_csv(DATA_FOLDER+CLEAN, index=False)
train.to_csv(DATA_FOLDER+TRAIN,index=False)
test.to_csv(DATA_FOLDER+TEST,index=False)