In [1]:
import pandas as pd
import numpy as np
import gensim.downloader
from custom_transformers import StringFeatures
import os.path
# import swifter


from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import resample

# Folder prefix for every data file; the trailing slash matters because paths
# are built by plain string concatenation (DATA_FOLDER + <file name>).
DATA_FOLDER = "data/"

# File names used by the pipeline stages below.
ORIGIN = "origin_data.tsv"        # raw tab-separated input
CLEAN = "clean_data.csv"          # cache of the cleaned data
RESAMPLED = "resampled_data.csv"  # cache of the down-sampled subset
TRAIN = "train_data.csv"          # training split output
TEST = "test_data.csv"            # test split output

# Down-sampling settings: number of rows to draw and the seed used to draw them.
SAMPLE_SIZE = 50000
RANDOM_STATE = 357394

### Clean Original Data 
 - drop nulls
 - clean strings
 - remove stop words where possible
 - eliminate duplicate clue and answer pairings
 - add columns for the answer length and for partially revealed answer characters, as would be available when solving a crossword puzzle

In [2]:
# Options for re-reading CSV files written by this notebook.
# pandas' default NA parsing turns literal strings such as "nan", "null" or "na"
# into missing values, so a cleaned (lower-cased) answer or clue spelled that way
# would silently become NaN on the cached path while staying a string on the
# fresh path. DataFrame.to_csv writes real missing values as empty fields, so
# only the empty string is treated as missing when reading the cache back.
cache_read_options = {'keep_default_na': False, 'na_values': ['']}

clean_path = DATA_FOLDER + CLEAN
if os.path.isfile(clean_path):
    # Reuse the cleaned data from a previous run.
    df = pd.read_csv(clean_path, **cache_read_options)
else:
    string_features = StringFeatures()

    # Select only relevant columns, and remove any rows with null values.
    # NOTE(review): this read still uses pandas' default NA parsing, so raw
    # answers/clues spelled like "NA" or "NULL" are dropped here -- confirm
    # that is intended for the original data.
    df = pd.read_table(DATA_FOLDER + ORIGIN)[['answer', 'clue']].dropna()

    # Clean strings and build features
    df = pd.concat(string_features.transform(df[['clue']], df['answer']), axis=1)
    df['answer'] = df['answer'].str.lower().str.strip()

    # Drop repetitions of answer / clue pairings and renumber the rows
    df = df.drop_duplicates(['answer', 'clue'], keep='first').reset_index(drop=True)

    # Save to drive so later runs can skip the cleaning step
    df.to_csv(clean_path, index=False)

# Create smaller sample
resampled_path = DATA_FOLDER + RESAMPLED
if os.path.isfile(resampled_path):
    sampled_df = pd.read_csv(resampled_path, **cache_read_options)
else:
    # Sampling without replacement cannot draw more rows than exist, so cap the
    # request at the size of the cleaned data instead of raising.
    n_samples = min(SAMPLE_SIZE, len(df))
    sampled_df = resample(df, n_samples=n_samples, random_state=RANDOM_STATE, replace=False)
    sampled_df.to_csv(resampled_path, index=False)
    


### Generate cosine similarities between clue and answer (to be used as target in machine learning)

In [3]:
def _rowwise_cosine(left, right):
    """Return the cosine similarity between matching rows of two 2-D arrays."""
    left = np.asarray(left, dtype=float)
    right = np.asarray(right, dtype=float)
    dots = np.einsum('ij,ij->i', left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # A zero vector has no direction; its dot product is 0, so dividing by 1
    # reports a similarity of 0 instead of dividing by zero.
    norms[norms == 0] = 1.0
    return dots / norms


def generate_cosign_similarity(data, model_dict):
    """Add per-model vocabulary flags and clue/answer cosine similarities.

    For every ``name -> model`` entry in ``model_dict`` two columns are written
    to a copy of ``data``:

    * ``name`` -- boolean, True when the row's ``answer`` is in the model's
      vocabulary (``model.index_to_key``).
    * ``name + '_cosine_similarity'`` -- cosine similarity between the answer
      vector and the mean vector of the clue words found in the vocabulary.
      It is NaN for rows whose answer is out of vocabulary or whose clue has no
      in-vocabulary words.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'answer'`` and ``'clue'`` columns. It is not modified.
    model_dict : dict
        Maps a column-name prefix to a model exposing ``index_to_key`` and
        supporting lookup of a list of words via ``model[words]``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the columns described above added
        (or overwritten if they already existed).
    """
    data = data.copy()
    for model_name, model in model_dict.items():
        # index_to_key is a list; a set makes each per-word membership test
        # constant time instead of a scan over the whole vocabulary.
        vocab = set(model.index_to_key)

        # Flag indicating whether the answer is contained in the model vocabulary
        answer_in_vocab = data['answer'].isin(vocab)
        data[model_name] = answer_in_vocab

        # Series of word lists generated from clues, keeping only in-vocabulary words
        clue_words = data['clue'].astype(str).apply(
            lambda clue: [word for word in clue.split() if word in vocab]
        )

        # Rows where both the answer and at least one clue word have vectors
        usable = answer_in_vocab & (clue_words.str.len() > 0)

        similarity_column = model_name + '_cosine_similarity'
        data[similarity_column] = np.nan
        if not usable.any():
            # Nothing to embed for this model; stacking zero vectors would fail.
            continue

        answer_vectors = model[data.loc[usable, 'answer'].tolist()]
        clue_vectors = np.vstack(
            [np.mean(model[words], axis=0) for words in clue_words[usable]]
        )

        # Compare each answer only with its own clue. Building the full
        # pairwise matrix just to read its diagonal needs memory quadratic in
        # the number of rows.
        data.loc[usable, similarity_column] = _rowwise_cosine(answer_vectors, clue_vectors)

    return data


In [4]:
# Load the pretrained gensim embeddings through the gensim downloader API.
twitter = gensim.downloader.load('glove-twitter-25')
google = gensim.downloader.load('word2vec-google-news-300')
wiki = gensim.downloader.load('glove-wiki-gigaword-100')

# Map a short name to each loaded model; the keys are what
# generate_cosign_similarity uses as column-name prefixes.
model_dict = dict(twitter=twitter, google=google, wiki=wiki)

In [5]:
# Re-read the cached sample. Only empty fields are treated as missing: pandas'
# default NA parsing would also turn literal answers/clues such as "nan",
# "null" or "na" into NaN, whereas to_csv writes real missing values (e.g. the
# similarity of an out-of-vocabulary row) as empty fields.
sampled_df = pd.read_csv(DATA_FOLDER+RESAMPLED, keep_default_na=False, na_values=[''])

# Compute the similarity features only when at least one model's flag column
# is missing from the cached file, then persist them for later runs.
if not set(model_dict).issubset(sampled_df.columns):
    sampled_df = generate_cosign_similarity(sampled_df, model_dict=model_dict)
    sampled_df.to_csv(DATA_FOLDER+RESAMPLED, index=False)

### Split data into train and test sets

In [6]:
# Hold out 30% of the sampled rows as the test set.
# NOTE(review): the split uses its own fixed seed (42) rather than RANDOM_STATE.
train, test = train_test_split(sampled_df, test_size=0.3, random_state=42)

# Write each split next to the other data files, without the row index.
for split_frame, file_name in ((train, TRAIN), (test, TEST)):
    split_frame.to_csv(DATA_FOLDER + file_name, index=False)

In [7]:
# Display the training split; the notebook renders it as a truncated table.
train

Unnamed: 0,clue,noun_involved,fill_blank,word_count,answer_length,0.0%_known_characters,10.0%_known_characters,20.0%_known_characters,30.0%_known_characters,40.0%_known_characters,50.0%_known_characters,60.0%_known_characters,answer,twitter,twitter_cosine_similarity,google,google_cosine_similarity,wiki,wiki_cosine_similarity
38094,mewing passerines,0,0,2,8,________,___b____,c____r__,______ds,c___i__s,_a_b__ds,ca_b_r_s,catbirds,False,,True,0.631570,True,0.158731
40624,spry,0,0,1,15,_______________,a_____________e,_____se______s_,____o_e_s_g____,__l___e_s_g_o_e,__lo____sag_ose,asl_os_a__goo__,aslooseasagoose,False,,False,,False,
49425,cap pistol instance,0,0,3,6,______,_o____,t_____,__y__n,to____,__yg_n,to_gu_,toygun,False,,False,,False,
35734,air force fighting falcon example,1,0,5,3,___,___,_e_,_e_,j__,j_t,j_t,jet,True,0.852220,True,0.356891,True,0.631171
41708,northern border dixie,1,0,3,12,____________,p___________,p__________r,_____a___ver,po_o____i__r,p_t____rive_,po_om_c___er,potomacriver,False,,False,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,piercer,0,0,1,5,_____,_____,g____,___er,__r_r,g_r__,g__er,gorer,False,,False,,False,
44732,marines training site,0,0,3,8,________,___t____,__o_c___,___t__m_,b_o___m_,_o_tca__,b_o_cam_,bootcamp,True,0.692117,True,0.369578,True,0.017907
38158,comment,0,0,1,11,___________,_______e___,w_______s__,_hol_______,w___e_v_s__,wh_le___sn_,wh_lea__s_o,wholeavesno,False,,False,,False,
860,current news,0,0,2,6,______,_a____,l_____,la____,___e_t,lat___,la_e_t,latest,True,0.889236,True,0.455128,True,0.750022
