In [40]:
import pandas as pd
import numpy as np
import re
import gensim.downloader
from custom_transformers import PCAFeatures, SimilarityPrediction
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [4]:
# Work on small slices while prototyping. .copy() detaches each slice from the
# frame returned by read_csv so the assignments below act on an independent
# frame instead of triggering pandas' SettingWithCopyWarning.
train = pd.read_csv('data/train_data.csv').iloc[450:470].copy()
test = pd.read_csv('data/test_data.csv').head().copy()
# Missing clues become empty strings in BOTH splits: vectorize_sentences casts
# clues with astype(str), which would otherwise turn NaN into the token 'nan'.
train['clue'] = train['clue'].fillna('')
test['clue'] = test['clue'].fillna('')

In [5]:
# Preview the first three rows of the training slice (displayed as the cell output)
train.head(3)

Unnamed: 0,clue,noun_involved,fill_blank,word_count,answer_length,0.0%_known_characters,10.0%_known_characters,20.0%_known_characters,30.0%_known_characters,40.0%_known_characters,50.0%_known_characters,60.0%_known_characters,answer,twitter,twitter_cosine_similarity,google,google_cosine_similarity,wiki,wiki_cosine_similarity
450,water lily,0,0,2,5,_____,_____,l____,_o__s,lo___,l___s,l_to_,lotos,True,-0.031981,False,,True,-0.255879
451,personal aspects,0,0,2,5,_____,_____,____s,_i_n_,_ie__,__en_,mi__s,miens,True,0.066686,False,,False,
452,playing round,0,0,2,6,______,_____r,___f__,_ol___,g__f__,g___er,g__fer,golfer,True,0.441435,True,0.316615,True,0.443692


In [6]:
# Separate the feature columns from the 'answer' column for each split
X_train, y_train = train.drop(columns='answer'), train['answer']
X_test, y_test = test.drop(columns='answer'), test['answer']

In [8]:
# Load Gensim Models
# Each pretrained embedding is loaded once, keyed by the short name that is
# also used as a column prefix in the data (e.g. 'twitter_cosine_similarity').
gensim_model_dict = {
    'twitter': gensim.downloader.load('glove-twitter-25'),
    'google': gensim.downloader.load('word2vec-google-news-300'),
    'wiki': gensim.downloader.load('glove-wiki-gigaword-100'),
}
# Keep the individual handles available under their original names
twitter_gensim = gensim_model_dict['twitter']
google_gensim = gensim_model_dict['google']
wiki_gensim = gensim_model_dict['wiki']

In [9]:
# Fit the PCAFeatures transformer on the training split only, then transform
# both splits with that same fitted transformer.
pca_features = PCAFeatures(gensim_model_dict)
pca_features.fit(X_train, y_train)
X_train = pca_features.transform(X_train)
X_test = pca_features.transform(X_test)

In [10]:
# Load Regression Models
# One independent estimator per embedding, keyed by the embedding's short name.
MODEL_NAMES = ('twitter', 'google', 'wiki')
# Fixed seed so the random forests give the same result on every run
RANDOM_STATE = 42

linear_regression_dict = {name: LinearRegression() for name in MODEL_NAMES}

random_forest_dict = {name: RandomForestRegressor(random_state=RANDOM_STATE)
                      for name in MODEL_NAMES}

In [11]:
# Fit the similarity predictor on the training features, then transform the
# test features with it (the cell output reports the error per embedding).
similarity_predictor = SimilarityPrediction(
    gensim_model_dict=gensim_model_dict,
    predictor_dict=linear_regression_dict,
)
similarity_predictor.fit(X_train)
X_test = similarity_predictor.transform(X_test)

twitter, LinearRegression()
Mean Absolute Error: 0.34016632534186153
Median Absolute Error: 0.35822516096548995
google, LinearRegression()
Mean Absolute Error: 0.10567658558756352
Median Absolute Error: 0.043020276509598526
wiki, LinearRegression()
Mean Absolute Error: 0.26165524985032396
Median Absolute Error: 0.28021248577326524


In [12]:
# Show actual vs. predicted cosine similarity side by side for each embedding
similarity_columns = [
    'google_cosine_similarity', 'google_predicted_similarity',
    'twitter_cosine_similarity', 'twitter_predicted_similarity',
    'wiki_cosine_similarity', 'wiki_predicted_similarity',
]
X_test[similarity_columns]

Unnamed: 0,google_cosine_similarity,google_predicted_similarity,twitter_cosine_similarity,twitter_predicted_similarity,wiki_cosine_similarity,wiki_predicted_similarity
0,0.217814,0.159361,0.408657,0.471695,0.293712,-0.029522
1,0.152758,0.109738,-0.178549,0.402629,0.26116,0.541372
2,0.530712,0.152117,0.090369,0.33992,0.272002,0.193313
3,0.192443,0.199916,,0.220862,-0.20476,-0.283994
4,0.205076,0.164235,0.753809,0.28691,0.648196,0.101288


In [29]:
def vectorize_sentences(strings:pd.Series, model):
    """Average the word vectors of each clue's in-vocabulary tokens.

    Parameters
    ----------
    strings : pd.Series
        Clues; every value is cast to ``str`` and split on whitespace.
    model
        Embedding model exposing ``key_to_index`` and supporting
        ``model[list_of_words]`` (gensim ``KeyedVectors``-style).

    Returns
    -------
    clue_vectors : pd.Series
        One mean vector per clue that has at least one in-vocabulary token,
        carrying the index labels of those clues.
    df_filter : pd.Series
        Boolean mask aligned with ``strings``; True where the clue has at
        least one in-vocabulary token.
    """
    # key_to_index is a dict, so each membership test is O(1); testing against
    # the index_to_key list scanned the whole vocabulary for every token.
    vocab = model.key_to_index
    clues = strings.astype(str).apply(lambda clue: [x for x in clue.split() if x in vocab])
    df_filter = clues.str.len() > 0
    clues = clues[df_filter]
    # dtype=object keeps one array per row and gives the empty result a
    # well-defined dtype when no clue has a known token.
    clue_vectors = pd.Series([np.mean(model[x], axis=0) for x in clues],
                             index=clues.index, dtype=object)
    return clue_vectors, df_filter

In [63]:
TOPN = 5

def predict(X:pd.DataFrame, known_characters:pd.Series, gensim_models:dict, topn:int=TOPN):
    """Propose candidate answers per embedding model for every clue.

    For each model, each clue that has at least one in-vocabulary token is
    averaged into a vector. Vocabulary words matching the row's known-character
    mask are then ranked by how close their cosine similarity to that vector
    is to the row's ``'<model_name>_predicted_similarity'`` value.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain a ``'clue'`` column and one
        ``'<model_name>_predicted_similarity'`` column per model.
    known_characters : pd.Series
        Answer masks indexed like ``X``; ``'_'`` marks an unknown letter and
        any other character must match literally.
    gensim_models : dict
        Maps model name to an embedding model exposing ``index_to_key`` and
        ``similar_by_vector``.
    topn : int, optional
        Maximum number of candidates kept per row (defaults to ``TOPN``).

    Returns
    -------
    dict
        ``{model_name: {row_label: {word: score}}}`` where
        ``score = 1 - |target - similarity|``. Rows whose clue has no
        in-vocabulary token are absent from that model's mapping.
    """
    all_predictions = {}
    for model_name, model in gensim_models.items():
        model_predictions = {}
        word_vectors, _ = vectorize_sentences(X['clue'], model)
        predicted_similarity = X[f'{model_name}_predicted_similarity']
        vocab_size = len(model.index_to_key)
        # Series.items() replaces iteritems(), which was removed in pandas 2.0
        for index, vector in word_vectors.items():
            # `index` is an index *label* (vectorize_sentences keeps the clue
            # labels), so both lookups must be label-based, not positional.
            target = predicted_similarity.loc[index]
            mask = known_characters.loc[index]
            # Unknown positions accept any lowercase letter; known characters
            # are escaped so regex metacharacters are matched literally.
            regex_pattern = re.compile(
                '^' + ''.join('[a-z]' if x == '_' else re.escape(x) for x in mask) + '$'
            )
            similarity_index = model.similar_by_vector(vector, topn=vocab_size)
            # Single regex pass over the vocabulary, keeping word and score together
            matches = [(word, score) for word, score in similarity_index
                       if regex_pattern.match(word)]
            available_words = [word for word, _ in matches]
            similarity_scores = np.asarray([score for _, score in matches], dtype=float)
            # Candidates whose similarity is closest to the predicted target
            chosen_indices = np.abs(similarity_scores - target).argsort()[:topn]
            model_predictions[index] = {
                available_words[i]: 1 - abs(target - similarity_scores[i])
                for i in chosen_indices
            }
        all_predictions[model_name] = model_predictions
    return all_predictions
        
    
    

In [65]:
# Generate candidate answers from the '60.0%_known_characters' masks
predictions = predict(
    X=X_test,
    known_characters=X_test['60.0%_known_characters'],
    gensim_models=gensim_model_dict,
)

In [71]:
# Tabulate the nested predictions: one column per embedding model, one row per
# clue index; each cell holds that row's {candidate word: score} mapping.
pd.DataFrame(predictions)

Unnamed: 0,twitter,google,wiki
0,"{'slags': 0.9977232081656362, 'slows': 0.99392...","{'slugs': 0.9879462413317008, 'sloes': 0.98650...","{'slats': 0.9818039634203439, 'sleds': 0.97951..."
1,"{'said': 0.99622441681669, 'paid': 0.993832422...","{'bail': 0.9979870852387321, 'saif': 0.9978023...","{'rain': 0.900533649370854, 'laid': 0.88004604..."
2,"{'bios': 0.999031542001019, 'clos': 0.99881139...","{'flos': 0.9971356863677489, 'hoos': 0.9961669...","{'ojos': 0.996674111843047, 'egos': 0.99336177..."
3,"{'dedektif': 0.7421012449863104, 'detektif': 0...",{'digestif': 0.9925262063073907},{'digestif': 0.9207659319036992}
4,"{'placid': 0.8452333233266827, 'placed': 0.533...","{'placed': 0.9591589799641069, 'placid': 0.912...","{'placid': 0.9969378940130648, 'placed': 0.453..."


In [259]:
def topn_similar_words(word_vector:np.ndarray, n_similar:int,  known_characters:str, target_cosine_similarity:float, model) -> list:
    """Return the words whose similarity to ``word_vector`` is closest to a target.

    Parameters
    ----------
    word_vector : np.ndarray
        Query vector passed to ``model.similar_by_vector``.
    n_similar : int
        Maximum number of words to return.
    known_characters : str
        Answer mask handed to ``generate_regex_pattern`` (that helper is not
        defined in this section of the notebook; it is expected to return a
        regex pattern string).
    target_cosine_similarity : float
        Similarity value the returned words should be closest to.
    model
        Embedding model exposing ``index_to_key`` and ``similar_by_vector``.

    Returns
    -------
    list
        Up to ``n_similar`` words matching the pattern, in the order in which
        ``similar_by_vector`` returned them. Empty if nothing matches.
    """
    # Compile the regex that encodes the known characters
    pattern = re.compile(generate_regex_pattern(known_characters))
    # Every vocabulary word paired with its similarity to the given vector
    similarity_index = model.similar_by_vector(word_vector, topn=len(model.index_to_key))
    # Keep only pattern-compatible words, in a single regex pass
    candidates = [(word, score) for word, score in similarity_index if pattern.match(word)]
    words = [word for word, _ in candidates]
    scores = np.asarray([score for _, score in candidates], dtype=float)
    # Positions of the n_similar scores closest to the target similarity
    chosen = np.abs(scores - target_cosine_similarity).argsort()[:n_similar]
    # Sorting the positions preserves the order of `words`. The removed debug
    # expression indexed a list with an array and used an undefined name.
    return [words[i] for i in sorted(chosen)]