In [4]:
import pandas as pd
import os
import gensim.downloader
import plotly.graph_objects as go
from custom_transformers import PCAFeatures, SimilarityPrediction, SelectTopNWords, SelectRandomWords
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, median_absolute_error

from constants import *

### Load Gensim Models

In [6]:
# Fetch the three pretrained embedding models through gensim's downloader,
# one per corpus identifier.
twitter_gensim = gensim.downloader.load('glove-twitter-25')
google_gensim = gensim.downloader.load('word2vec-google-news-300')
wiki_gensim = gensim.downloader.load('glove-wiki-gigaword-100')

# Short name -> loaded model. The keys are reused below as column-name
# prefixes (e.g. '<name>_cosine_similarity').
gensim_model_dict = dict(
    twitter=twitter_gensim,
    google=google_gensim,
    wiki=wiki_gensim,
)

### Convert clues into features using word vectorization and PCA

In [9]:
# Reuse the cached PCA feature tables when both exist; otherwise build them
# from the raw train/test CSVs and cache the result for the next run.
pca_train_path = DATA_FOLDER + PCA_TRAIN
pca_test_path = DATA_FOLDER + PCA_TEST

if os.path.isfile(pca_train_path) and os.path.isfile(pca_test_path):
    train = pd.read_csv(pca_train_path)
    test = pd.read_csv(pca_test_path)
else:
    train = pd.read_csv(DATA_FOLDER + TRAIN)
    test = pd.read_csv(DATA_FOLDER + TEST)

    # Blank out missing clues in BOTH splits. Previously only `train` was
    # cleaned, so `transform(test)` could still be handed NaN clues.
    train['clue'] = train['clue'].fillna('')
    test['clue'] = test['clue'].fillna('')

    # Fit on the training split only, then apply the same fitted transform
    # to both splits.
    pca_features = PCAFeatures(gensim_model_dict)
    pca_features.fit(train)
    train = pca_features.transform(train)
    test = pca_features.transform(test)

    train.to_csv(pca_train_path, index=False)
    test.to_csv(pca_test_path, index=False)
    
    

In [10]:
# Separate the target column ('answer') from the remaining feature columns
# for each split.
y_train = train['answer']
X_train = train.drop(columns='answer')

y_test = test['answer']
X_test = test.drop(columns='answer')

### Predict Cosine Similarity

In [11]:
# X_test is replaced by a version carrying predicted similarities: computed
# and cached when no cache file exists yet, loaded from disk otherwise.
similarities_path = DATA_FOLDER + PREDICTED_SIMILARITIES

if not os.path.isfile(similarities_path):
    # One independent regressor per embedding model.
    # NOTE(review): the forests are created without a random_state, so an
    # uncached run is not bit-for-bit reproducible.
    random_forest_dict = {
        model_key: RandomForestRegressor()
        for model_key in ('twitter', 'google', 'wiki')
    }
    similarity_predictor = SimilarityPrediction(
        gensim_model_dict=gensim_model_dict,
        predictor_dict=random_forest_dict,
    )
    similarity_predictor.fit(X_train)
    X_test = similarity_predictor.transform(X_test)
    X_test.to_csv(similarities_path, index=False)
else:
    X_test = pd.read_csv(similarities_path)

In [12]:
# Report mean and median absolute error of the predicted similarity per
# embedding model, using only rows where the true similarity is present.
for model_name in gensim_model_dict:
    has_truth = X_test[f'{model_name}_cosine_similarity'].notna()
    true = X_test.loc[has_truth, f'{model_name}_cosine_similarity']
    predict = X_test.loc[has_truth, f'{model_name}_predicted_similarity']

    mean_error = mean_absolute_error(true, predict)
    median_error = median_absolute_error(true, predict)
    print(f'{model_name}:\nMean Absolute Error: {mean_error}\nMedian Absolute Error: {median_error}')

twitter:
Mean Absolute Error: 0.19433275376997242
Median Absolute Error: 0.17121899019999992
google:
Mean Absolute Error: 0.10628713424573702
Median Absolute Error: 0.08929215441999988
wiki:
Mean Absolute Error: 0.17708801120947698
Median Absolute Error: 0.1523335474065


###  Predict answers for test set
Because predictions take a long time, they are computed in small batches and the results are saved after every batch, so the script does not have to run continuously and no data is lost if an outlier causes it to fail.

In [13]:

N_SAMPLES = 20

word_selection = SelectTopNWords(5)


def _predict_top_words(rows, known_characters):
    """Predict words and scores for the `rows` slice of X_test.

    `rows` is a slice object applied to X_test; `known_characters` is the
    name of the X_test column passed to the selector as the known-character
    input. Column labels of both returned frames are converted to strings.
    """
    batch = X_test[rows]
    batch_words, batch_scores = word_selection.predict(
        X=batch,
        known_characters=batch[known_characters],
        gensim_models=gensim_model_dict,
    )
    batch_words.columns = batch_words.columns.astype(str)
    batch_scores.columns = batch_scores.columns.astype(str)
    return batch_words, batch_scores


for known_characters in KNOWN_CHARACTER_SETTINGS:
    words_path = DATA_FOLDER + known_characters + "_words.csv"
    scores_path = DATA_FOLDER + known_characters + "_scores.csv"

    if os.path.isfile(words_path) and os.path.isfile(scores_path):
        # Resume from whatever an earlier run already saved.
        words = pd.read_csv(words_path, index_col=0)
        scores = pd.read_csv(scores_path, index_col=0)
    else:
        # No checkpoint yet: start with the first N_SAMPLES rows.
        words, scores = _predict_top_words(slice(None, N_SAMPLES), known_characters)
        words.to_csv(words_path)
        scores.to_csv(scores_path)

    # Continue batch by batch until the last saved index reaches the end of
    # X_test; both files are rewritten after every batch as a checkpoint.
    while words.index.max() < X_test.index.max():
        start_index = words.index.max() + 1
        end_index = start_index + N_SAMPLES
        new_words, new_scores = _predict_top_words(
            slice(start_index, end_index), known_characters
        )
        words = pd.concat([words, new_words])
        scores = pd.concat([scores, new_scores])
        words.to_csv(words_path)
        scores.to_csv(scores_path)
        

### Random Selection
Randomly select words from the same vocabulary, without using cosine similarity, to act as a baseline.

In [14]:
N_SAMPLES = 50

random_Selection = SelectRandomWords()


def _predict_random_words(rows, known_characters):
    """Pick random words for the `rows` slice of one X_test column.

    `rows` is a slice object applied to the `known_characters` column of
    X_test. Column labels of the returned frame are converted to strings.
    """
    batch_words = random_Selection.predict(
        X_test[known_characters][rows],
        gensim_models=gensim_model_dict,
    )
    batch_words.columns = batch_words.columns.astype(str)
    return batch_words


for known_characters in KNOWN_CHARACTER_SETTINGS:
    random_words_path = DATA_FOLDER + known_characters + "_random_words.csv"

    if os.path.isfile(random_words_path):
        # Resume from whatever an earlier run already saved.
        words = pd.read_csv(random_words_path, index_col=0)
    else:
        # No checkpoint yet: start with the first N_SAMPLES rows.
        words = _predict_random_words(slice(None, N_SAMPLES), known_characters)
        words.to_csv(random_words_path)

    # Continue batch by batch until the last saved index reaches the end of
    # X_test; the file is rewritten after every batch as a checkpoint.
    while words.index.max() < X_test.index.max():
        start_index = words.index.max() + 1
        end_index = start_index + N_SAMPLES
        new_words = _predict_random_words(
            slice(start_index, end_index), known_characters
        )
        words = pd.concat([words, new_words])
        words.to_csv(random_words_path)

### Visualize Results

In [186]:
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

def chart_results():
    """Plot prediction accuracy per known-character setting.

    For every entry of ``KNOWN_CHARACTER_SETTINGS`` this reads
    ``<setting>_words.csv`` and ``<setting>_random_words.csv`` from
    ``DATA_FOLDER``, joins each with the notebook-level ``y_test`` answers
    and computes three rates:

    * share of rows whose answer equals column ``'0'`` (first prediction),
    * share of rows whose answer appears in columns ``'0'``..``'4'``,
    * share of rows whose answer equals column ``'0'`` of the random words.

    The rates are drawn as two grouped bar series plus a short horizontal
    line (with a text label) per setting for the random baseline, and a
    horizontal line at ``VOCAB_LIMIT``. The figure is shown; nothing is
    returned.
    """
    LABELS = [x.replace('_', ' ').title() for x in KNOWN_CHARACTER_SETTINGS]
    PRIMARY_WIDTH = 0.3
    SECONDARY_WIDTH = 0.2
    BARGAP = 0.5
    # NOTE(review): presumably the best accuracy reachable with the available
    # vocabulary -- confirm where 0.9002 comes from.
    VOCAB_LIMIT = 0.9002
    TOP_COLUMNS = [str(rank) for rank in range(5)]

    predictions = []
    top5 = []
    random_accuracies = []

    for known_chars in KNOWN_CHARACTER_SETTINGS:
        predicted_words = pd.read_csv(DATA_FOLDER + f"{known_chars}_words.csv", index_col=0)
        results = pd.concat([y_test, predicted_words], axis=1)

        # The mean of a boolean mask is the hit rate. Unlike
        # value_counts(normalize=True).loc[True] it yields 0.0 instead of
        # raising KeyError when there is not a single match.
        prediction_accuracy = (results['answer'] == results['0']).mean()
        top5_accuracy = (
            results[TOP_COLUMNS].eq(results['answer'], axis=0).any(axis=1).mean()
        )

        random_words = pd.read_csv(DATA_FOLDER + f"{known_chars}_random_words.csv", index_col=0)
        random_results = pd.concat([y_test, random_words], axis=1)
        random_accuracy = (random_results['answer'] == random_results['0']).mean()

        predictions.append(prediction_accuracy)
        top5.append(top5_accuracy)
        random_accuracies.append(random_accuracy)

    # Each setting gets one short horizontal baseline segment on the numeric
    # overlay axis; the trailing None breaks the line between segments.
    random_x = []
    random_y = []
    random_text_x = []
    for i, random_accuracy in enumerate(random_accuracies):
        pos1 = i + 0.2
        pos2 = i + 0.75
        random_x += [pos1, pos2, None]
        random_y += [random_accuracy, random_accuracy, None]
        # The label sits at the right end of its segment. Building the
        # positions here keeps them correct for any number of settings
        # (the old `i % len(LABELS) == 1` filter only worked for three).
        random_text_x.append(pos2)

    fig = go.Figure()

    vocab_limit = go.Scatter(
        name='Vocabulary Limit',
        x=[0, len(LABELS)],
        y=[VOCAB_LIMIT, VOCAB_LIMIT],
        mode='lines',
        xaxis='x2',
        marker_color=COLOR_4,
    )

    trace_predictions = go.Bar(
        name='Prediction',
        x=LABELS,
        y=predictions,
        text=predictions,
        textposition='outside',
        texttemplate='%{text:0.2%}',
        textfont=dict(color=COLOR_1),
        marker_color=COLOR_1,
        width=PRIMARY_WIDTH,
    )

    trace_top5 = go.Bar(
        name='Contained in top 5',
        x=LABELS,
        y=top5,
        text=top5,
        textposition='outside',
        texttemplate='%{text:0.2%}',
        textfont=dict(color=COLOR_2),
        marker_color=COLOR_2,
        width=SECONDARY_WIDTH,
    )

    r_selection = go.Scatter(
        xaxis='x2',
        mode='lines',
        x=random_x,
        y=random_y,
        name='Random Selection',
        marker_color=COLOR_5,
    )

    r_text = go.Scatter(
        xaxis='x2',
        mode='text',
        showlegend=False,
        x=random_text_x,
        y=random_accuracies,
        texttemplate='%{text:0.3%}',
        text=random_accuracies,
        textposition="bottom right",
        textfont=dict(color=COLOR_5),
    )

    fig.add_traces([vocab_limit, trace_predictions, trace_top5, r_selection, r_text])

    fig.update_layout(
        width=1200,
        height=600,
        plot_bgcolor='white',
        title='Crossword Solver Accuracy Overview',
        title_x=0.5,
        barmode='group',
        bargap=BARGAP,
        legend_x=1,
        legend_y=0.5,
        font=dict(size=16),
    )

    fig.update_yaxes(
        tickformat='.0%',
        tickmode='array',
        tickvals=[VOCAB_LIMIT, 1],
        range=[-0.1, 1],
    )

    # Numeric x axis laid over the categorical one so the line/text traces
    # can be positioned at fractional offsets within each category.
    fig.layout.xaxis2 = go.layout.XAxis(
        overlaying='x',
        range=[0, len(LABELS)],
        showticklabels=False,
    )

    fig.add_hline(y=1)
    fig.show()

chart_results()
    