In [1]:
import pandas as pd
import numpy as np
import re
import gensim.downloader
from custom_transformers import PCAFeatures, SimilarityPrediction
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
train = pd.read_csv('data/train_data.csv').iloc[450:470]
test = pd.read_csv('data/test_data.csv').head()
train.loc[train['clue'].isna(), 'clue'] = ''

In [3]:
train.head(3)

Unnamed: 0,clue,noun_involved,fill_blank,word_count,answer_length,0.0%_known_characters,10.0%_known_characters,20.0%_known_characters,30.0%_known_characters,40.0%_known_characters,50.0%_known_characters,60.0%_known_characters,answer,twitter,twitter_cosine_similarity,google,google_cosine_similarity,wiki,wiki_cosine_similarity
450,water lily,0,0,2,5,_____,_____,l____,_o__s,lo___,l___s,l_to_,lotos,True,-0.031981,False,,True,-0.255879
451,personal aspects,0,0,2,5,_____,_____,____s,_i_n_,_ie__,__en_,mi__s,miens,True,0.066686,False,,False,
452,playing round,0,0,2,6,______,_____r,___f__,_ol___,g__f__,g___er,g__fer,golfer,True,0.441435,True,0.316615,True,0.443692


In [4]:
X_train = train.drop('answer',axis=1)
y_train = train['answer']
X_test = test.drop('answer', axis=1)
y_test = test['answer']

In [5]:
# Load Gensim Models
twitter_gensim = gensim.downloader.load('glove-twitter-25')
google_gensim = gensim.downloader.load('word2vec-google-news-300')
wiki_gensim = gensim.downloader.load('glove-wiki-gigaword-100')
gensim_model_dict = {'twitter':twitter_gensim,
              'google':google_gensim,
              'wiki':wiki_gensim}

In [6]:
# Train and fit PCA features
pca_features = PCAFeatures(gensim_model_dict)
pca_features.fit(X_train, y_train)
X_train = pca_features.transform(X_train)
X_test =  pca_features.transform(X_test)

In [7]:
# Load Regression Models
linear_regression_dict = {'twitter': LinearRegression(),
                          'google':LinearRegression(),
                          'wiki':LinearRegression()}

random_forest_dict = {'twitter': RandomForestRegressor(),
                          'google':RandomForestRegressor(),
                          'wiki':RandomForestRegressor()}

In [10]:
similarity_predictor = SimilarityPrediction(gensim_model_dict=gensim_model_dict,predictor_dict=linear_regression_dict)
similarity_predictor.fit(X_train)
X_test = similarity_predictor.transform(X_test)

twitter, LinearRegression()
Mean Absolute Error: 0.34016632534186153
Median Absolute Error: 0.35822516096548995
google, LinearRegression()
Mean Absolute Error: 0.10567658558756352
Median Absolute Error: 0.043020276509598526
wiki, LinearRegression()
Mean Absolute Error: 0.26165524985032396
Median Absolute Error: 0.28021248577326524


In [11]:
X_test[['google_cosine_similarity', 'google_predicted_similarity','twitter_cosine_similarity', 'twitter_predicted_similarity','wiki_cosine_similarity', 'wiki_predicted_similarity']]

Unnamed: 0,google_cosine_similarity,google_predicted_similarity,twitter_cosine_similarity,twitter_predicted_similarity,wiki_cosine_similarity,wiki_predicted_similarity
0,0.217814,0.159361,0.408657,0.471695,0.293712,-0.029522
1,0.152758,0.109738,-0.178549,0.402629,0.26116,0.541372
2,0.530712,0.152117,0.090369,0.33992,0.272002,0.193313
3,0.192443,0.199916,,0.220862,-0.20476,-0.283994
4,0.205076,0.164235,0.753809,0.28691,0.648196,0.101288


In [12]:
def vectorize_sentences(strings:pd.Series, model):
    vocab = model.index_to_key
    clues = strings.astype(str).apply(lambda clue: [x for x in clue.split() if x in vocab])
    df_filter = clues.str.len() > 0
    clues = clues[df_filter]
    clue_vectors = pd.Series([np.mean(model[x],axis=0) for x in clues])
    clue_vectors.index = clues.index
    return clue_vectors, df_filter

In [129]:
TOPN = 5

def predict(X:pd.DataFrame, known_characters:pd.Series, gensim_models:dict):
    all_predictions = {}
    for model_name, model in gensim_models.items():
        model_predictions = {}
        word_vectors, row_filter = vectorize_sentences(X['clue'], model)
        for index, vector in word_vectors.iteritems():
            target = X[f'{model_name}_predicted_similarity'].iloc[index]
            regex_pattern = re.compile('^'+''.join([x if not x == '_' else '[a-z]' for x in known_characters[index]])+'$')
            similarity_index = model.similar_by_vector(vector, topn=len(model.index_to_key))
            available_words = [x[0] for x in similarity_index if regex_pattern.match(x[0]) ]
            similarity_scores = np.asarray([x[1] for x in similarity_index if regex_pattern.match(x[0]) ])
            chosen_indices = np.abs(similarity_scores - target ).argsort()[:TOPN*2]
            word_matches = {}
            for i in chosen_indices:
                word_matches[available_words[i]] = 1 - abs(target - similarity_scores[i])
            model_predictions[index] = word_matches
        all_predictions[model_name] = model_predictions
    return all_predictions    
        
    
    

In [130]:
predictions = predict(X_test, X_test['40.0%_known_characters'], gensim_model_dict)

In [131]:
len(predictions)

3

In [132]:

final_words = {}
final_scores = {}
for i, row in pd.DataFrame(predictions).iterrows():
    votes = {}
    for chosen_words in row:
        for word, score in chosen_words.items():
            if word in votes:
                votes[word]+=score
            else:
                votes[word]= score
    votes = sorted(votes.items(), key= lambda kv: kv[1], reverse=True)[:TOPN]
    final_words[i] = [vote[0] for vote in votes]
    final_scores[i] = [vote[1]/len(predictions) for vote in votes]


In [125]:
pd.DataFrame(compiled).T

Unnamed: 0,0,1,2,3,4
0,"(spurs, 1.9865374883650404)","(jours, 0.9997360564941125)","(roars, 0.9974066324704842)","(cuers, 0.9966586835855202)","(pours, 0.9957202809090708)"
1,"(laid, 1.7976235542596855)","(lain, 1.6767672095598258)","(loin, 1.5595243838443578)","(leib, 0.9762363299484467)","(luid, 0.9679294362907196)"
2,"(erom, 0.9990965408661465)","(cron, 0.9975837344825255)","(wron, 0.9962950629570584)","(bron, 0.9940086620471378)","(dros, 0.9922732735574232)"
3,"(fegelein, 0.9943979427224605)","(digestif, 0.9925262063073907)","(dagenais, 0.982965801656863)","(tigerair, 0.9574791893356831)","(gugelmin, 0.9292155608528645)"
4,"(smacks, 0.999314048004001)","(fracas, 0.9969996146227297)","(placid, 0.9969378940130648)","(spacex, 0.9945491573720928)","(stache, 0.994485405930901)"


In [133]:
pd.DataFrame(final_words).T

Unnamed: 0,0,1,2,3,4
0,doers,spurs,jours,roars,cuers
1,lair,lyin,laid,lein,laic
2,prof,iron,croc,erom,cron
3,digestif,tigerair,fegelein,dagenais,gugelmin
4,fracas,viacom,traced,braced,placer


In [134]:
pd.DataFrame(final_scores).T

Unnamed: 0,0,1,2,3,4
0,0.963883,0.662179,0.333245,0.332469,0.33222
1,0.778639,0.612023,0.599208,0.592155,0.570575
2,0.652581,0.649097,0.620954,0.333032,0.332528
3,0.637764,0.574151,0.331466,0.327655,0.309739
4,0.650904,0.64747,0.647454,0.645939,0.643852


In [128]:
y_test

0       slurs
1        lain
2        eros
3    digestif
4      placed
Name: answer, dtype: object

In [69]:
pd.DataFrame(predictions)

Unnamed: 0,twitter,google,wiki
0,"{'pours': 0.9957202809090708, 'spurs': 0.98673...","{'spurs': 0.9997995354407982, 'jours': 0.99973...","{'moors': 0.98623662954812, 'czars': 0.9861912..."
1,"{'leib': 0.9762363299484467, 'luid': 0.9679294...","{'loin': 0.9646871175921333, 'lain': 0.9569796...","{'laid': 0.8800460428454329, 'lain': 0.7197875..."
2,"{'erom': 0.9990965408661465, 'wron': 0.9962950...","{'cron': 0.9975837344825255, 'dros': 0.9922732...","{'drop': 0.9899819132090236, 'troy': 0.9884307..."
3,"{'opgeleid': 0.9184362407627252, 'begeleid': 0...",{'digestif': 0.9925262063073907},"{'fegelein': 0.9943979427224605, 'dagenais': 0..."
4,"{'spacex': 0.9945491573720928, 'stache': 0.994...","{'smacks': 0.999314048004001, 'reachs': 0.9934...","{'fracas': 0.9969996146227297, 'placid': 0.996..."


In [27]:
def test(X):
    for i, x in X[0].items():
        return(i, x)

In [28]:
pd.DataFrame(predictions).apply(test)

Unnamed: 0,twitter,google,wiki
0,slags,slugs,slats
1,0.997723,0.987946,0.981804
