In [2]:
import pandas as pd
import numpy as np
import gensim.downloader
from custom_transformers import StringFeatures
import os.path
# import swifter


from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import resample

# File locations. Paths are built by plain string concatenation
# (DATA_FOLDER+NAME), so DATA_FOLDER must keep its trailing slash.
DATA_FOLDER = 'data/'
# Raw tab-separated input, read with pd.read_table.
ORIGIN = 'origin_data.tsv'
# Cache of the cleaned, de-duplicated clue/answer table.
CLEAN = 'clean_data.csv'
# Cache of the down-sampled table (later rewritten with the similarity columns).
RESAMPLED ='resampled_data.csv'
# Outputs of the train/test split.
TRAIN = 'train_data.csv'
TEST = 'test_data.csv'

# Number of rows drawn (without replacement) from the cleaned data.
SAMPLE_SIZE = 50000
# Seed passed to sklearn.utils.resample when drawing the sample.
RANDOM_STATE = 357394

### Clean Original Data 
 - drop nulls
 - clean strings
 - remove stop words where possible
 - eliminate duplicate clue and answer pairings
 - create new columns for the answer length and the answer's known characters, as would be available in the context of a crossword puzzle

In [5]:
# Load the cleaned table from its cache, building (and caching) it on first run.
clean_path = DATA_FOLDER+CLEAN
if not os.path.isfile(clean_path):
    string_features = StringFeatures(min_characters_for_wordcount=1, percent_of_known_characters=0)

    # Keep only the answer/clue columns and discard rows with missing values
    raw_pairs = pd.read_table(DATA_FOLDER+ORIGIN)[['answer','clue']].dropna()

    # Clean strings and build features (logic lives in custom_transformers.StringFeatures)
    feature_frames = string_features.transform(raw_pairs[['clue']], raw_pairs['answer'])
    df = pd.concat(feature_frames, axis=1)
    df['answer'] = df['answer'].str.lower().str.strip()

    # Keep the first occurrence of each answer / clue pairing and renumber the rows
    df = df.drop_duplicates(['answer','clue'], keep='first').reset_index(drop=True)

    # Persist so later runs can skip the cleaning step
    df.to_csv(clean_path, index=False)
else:
    # NOTE(review): read_csv's default NA parsing turns literal strings such as
    # "nan" or "null" into NaN -- confirm this is acceptable for clues/answers.
    df = pd.read_csv(clean_path)

# Work on a smaller sample: reuse the cached one if present, otherwise draw
# SAMPLE_SIZE rows without replacement from the cleaned data and cache them.
resampled_path = DATA_FOLDER+RESAMPLED
if not os.path.isfile(resampled_path):
    sampled_df = resample(df, replace=False, n_samples=SAMPLE_SIZE, random_state=RANDOM_STATE)
    sampled_df.to_csv(resampled_path, index=False)
else:
    sampled_df = pd.read_csv(resampled_path)
    


### Generate cosine similarities between clue and answer (to be used as the target in machine learning)

In [8]:
def generate_cosign_similarity(data, model_dict):
    """Add vocabulary-coverage flags and clue/answer cosine similarities.

    For every ``model_name -> model`` entry in ``model_dict`` two columns are
    written to a copy of ``data``:

    * ``<model_name>``: True where the row's ``answer`` is in the model's
      vocabulary, False otherwise.
    * ``<model_name>_cosine_similarity``: cosine similarity between the answer
      vector and the mean vector of the clue words known to the model. NaN
      where the answer is out of vocabulary or no clue word is known.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'answer'`` and ``'clue'`` columns. It is not modified.
    model_dict : dict
        Maps a column-name prefix to an embedding model exposing
        ``index_to_key`` and supporting ``model[list_of_keys]`` lookups.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the two columns per model added.
    """
    data = data.copy()
    for model_name, model in model_dict.items():
        # A set gives O(1) membership tests; testing every clue word against
        # the raw key sequence is a linear scan per word.
        vocab = set(model.index_to_key)

        # Flag whether the answer is contained in the model vocabulary
        data[model_name] = data['answer'].isin(vocab)

        # Per-row list of clue words, keeping only words the model knows
        clue_words = data['clue'].astype(str).apply(
            lambda clue: [word for word in clue.split() if word in vocab]
        )

        # Rows that can be scored: known answer and at least one known clue word
        usable = data[model_name] & (clue_words.str.len() > 0)

        similarity_column = model_name + '_cosine_similarity'
        data[similarity_column] = np.nan
        if not usable.any():
            # Nothing to score for this model; stacking zero vectors would fail.
            continue

        answer_vectors = np.asarray(model[data.loc[usable, 'answer'].tolist()], dtype=float)
        clue_vectors = np.asarray(
            [np.mean(model[words], axis=0) for words in clue_words[usable]],
            dtype=float,
        )

        # Row-wise cosine similarity. Only matching answer/clue pairs are
        # needed, so the full N x N pairwise matrix is never built.
        dot_products = np.einsum('ij,ij->i', answer_vectors, clue_vectors)
        norm_products = (
            np.linalg.norm(answer_vectors, axis=1) * np.linalg.norm(clue_vectors, axis=1)
        )
        # A zero vector has a zero dot product; dividing by 1 yields similarity 0.
        norm_products[norm_products == 0] = 1.0
        data.loc[usable, similarity_column] = dot_products / norm_products

    return data


In [9]:
# Load Gensim Models
# Column-name prefix -> name passed to gensim.downloader.load, loaded in this order.
pretrained_names = {
    'twitter': 'glove-twitter-25',
    'google': 'word2vec-google-news-300',
    'wiki': 'glove-wiki-gigaword-100',
}
model_dict = {
    label: gensim.downloader.load(pretrained)
    for label, pretrained in pretrained_names.items()
}
# Keep the individual handles available under their original names.
twitter, google, wiki = model_dict['twitter'], model_dict['google'], model_dict['wiki']

In [10]:
# Reload the cached sample and compute the similarity columns only when at
# least one model's flag column is absent; the enriched table overwrites the cache.
resampled_path = DATA_FOLDER+RESAMPLED
sampled_df = pd.read_csv(resampled_path)
missing_models = [name for name in model_dict if name not in sampled_df.columns]
if missing_models:
    sampled_df = generate_cosign_similarity(sampled_df, model_dict=model_dict)
    sampled_df.to_csv(resampled_path, index=False)

### Split data into train and test sets

In [12]:
# 70/30 split with a fixed seed, then persist both parts without the index.
train, test = train_test_split(sampled_df, random_state=42, test_size=0.3)
for split_frame, split_filename in ((train, TRAIN), (test, TEST)):
    split_frame.to_csv(DATA_FOLDER+split_filename, index=False)

In [13]:
# Display the training split as the cell output.
train

Unnamed: 0,clue,noun_involved,fill_blank,word_count,answer_length,answer_characters,answer,twitter,twitter_cosine_similarity,google,google_cosine_similarity,wiki,wiki_cosine_similarity
2043275,mewing passerines,0,0,2,8,________,catbirds,False,,True,0.631570,True,0.158731
536410,spry,0,0,1,15,_______________,aslooseasagoose,False,,False,,False,
1085273,cap pistol instance,0,0,3,6,______,toygun,False,,False,,False,
2733344,air force fighting falcon example,1,0,5,3,___,jet,True,0.852220,True,0.356891,True,0.631171
2636287,northern border dixie,1,0,3,12,____________,potomacriver,False,,False,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1222670,piercer,0,0,1,5,_____,gorer,False,,False,,False,
1087384,marines training site,0,0,3,8,________,bootcamp,True,0.692117,True,0.369578,True,0.017907
2522513,comment,0,0,1,11,___________,wholeavesno,False,,False,,False,
792213,current news,0,0,2,6,______,latest,True,0.889236,True,0.455128,True,0.750022
