In [1]:
import pandas as pd
import os
import gensim.downloader
from custom_transformers import PCAFeatures, SimilarityPrediction, SelectTopNWords
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, median_absolute_error

from constants import *

### Load Gensim Models

In [27]:
# Pretrained embedding sets to fetch through gensim's downloader, keyed by the
# short name the rest of the notebook uses for each of them.
gensim_model_ids = {'twitter': 'glove-twitter-25',
                    'google': 'word2vec-google-news-300',
                    'wiki': 'glove-wiki-gigaword-100'}

# Dict order is preserved, so the models load as twitter, google, wiki.
gensim_model_dict = {short_name: gensim.downloader.load(model_id)
                     for short_name, model_id in gensim_model_ids.items()}

# Individual handles for each loaded model.
twitter_gensim = gensim_model_dict['twitter']
google_gensim = gensim_model_dict['google']
wiki_gensim = gensim_model_dict['wiki']

### Convert clues into features using word vectorization and PCA

In [2]:
# Build (or reload) the PCA feature tables for the train and test splits.
# The transformed frames are cached as CSV so the PCAFeatures fit/transform
# step only runs when either cache file is missing.
pca_train_path = DATA_FOLDER+PCA_TRAIN
pca_test_path = DATA_FOLDER+PCA_TEST

if os.path.isfile(pca_train_path) and os.path.isfile(pca_test_path):
    train = pd.read_csv(pca_train_path)
    test = pd.read_csv(pca_test_path)
else:
    train = pd.read_csv(DATA_FOLDER+TRAIN)
    test = pd.read_csv(DATA_FOLDER+TEST)

    # Replace missing clues with an empty string in BOTH splits: the same
    # transformer is applied to train and test, so they need the same cleaning.
    # NOTE(review): assumes the test file has the same 'clue' column as train.
    train = train.fillna({'clue': ''})
    test = test.fillna({'clue': ''})

    # Fit on the training split only, then apply the fitted transform to both.
    pca_features = PCAFeatures(gensim_model_dict)
    pca_features.fit(train)
    train = pca_features.transform(train)
    test = pca_features.transform(test)

    train.to_csv(pca_train_path, index=False)
    test.to_csv(pca_test_path, index=False)
    
    

In [3]:
# Split each frame into its feature columns and the 'answer' target column.
X_train, y_train = train.drop(columns='answer'), train['answer']
X_test, y_test = test.drop(columns='answer'), test['answer']

In [4]:
# Add predicted similarities to X_test, reusing the cached CSV when present.
# Seed for the random forests so the cached predictions, and the error
# metrics computed from them, can be reproduced on a fresh run.
SIMILARITY_FOREST_SEED = 42

if os.path.isfile(DATA_FOLDER+PREDICTED_SIMILARITIES):
    X_test = pd.read_csv(DATA_FOLDER+PREDICTED_SIMILARITIES)
else:
    # One regressor per embedding model, keyed the same way as gensim_model_dict.
    random_forest_dict = {model_key: RandomForestRegressor(random_state=SIMILARITY_FOREST_SEED)
                          for model_key in gensim_model_dict}
    similarity_predictor = SimilarityPrediction(gensim_model_dict=gensim_model_dict,
                                                predictor_dict=random_forest_dict)
    similarity_predictor.fit(X_train)
    # NOTE: X_test is replaced by its transformed version, so re-running this
    # cell without re-running the split cell transforms it a second time.
    X_test = similarity_predictor.transform(X_test)
    X_test.to_csv(DATA_FOLDER+PREDICTED_SIMILARITIES,index= False)

In [28]:
# Report mean and median absolute error of the predicted similarity against
# the cosine similarity, per embedding model, over rows where the cosine
# similarity is present.
for model_name in gensim_model_dict:
    actual_column = f'{model_name}_cosine_similarity'
    predicted_column = f'{model_name}_predicted_similarity'

    has_actual = X_test[actual_column].notna()
    actual_values = X_test.loc[has_actual, actual_column]
    predicted_values = X_test.loc[has_actual, predicted_column]

    mean_error = mean_absolute_error(actual_values, predicted_values)
    median_error = median_absolute_error(actual_values, predicted_values)
    print(f'{model_name}:\nMean Absolute Error: {mean_error}\nMedian Absolute Error: {median_error}')

twitter:
Mean Absolute Error: 0.19433275376997242
Median Absolute Error: 0.17121899019999992
google:
Mean Absolute Error: 0.10628713424573702
Median Absolute Error: 0.08929215441999988
wiki:
Mean Absolute Error: 0.17708801120947698
Median Absolute Error: 0.1523335474065


In [66]:
KNOWN_CHARACTER_SETTINGS = ['0.0%_known_characters','20.0%_known_characters','40.0%_known_characters']
N_SAMPLES = 10

word_selection = SelectTopNWords(5)


def _predict_word_batch(start, stop, known_characters):
    """Run word selection on rows ``X_test[start:stop]`` for one setting.

    Parameters
    ----------
    start, stop : int
        Row slice bounds into ``X_test``.
    known_characters : str
        Name of the ``X_test`` column passed to ``predict`` as the
        known-character input.

    Returns
    -------
    (words, scores)
        The two frames returned by ``word_selection.predict``, with column
        labels cast to ``str``. Frames reloaded from the CSV checkpoints have
        string column labels, so every batch (including the very first one)
        must use string labels too or ``pd.concat`` would misalign columns.
    """
    batch = X_test[start:stop]
    batch_words, batch_scores = word_selection.predict(X=batch,
                                                       known_characters=batch[known_characters],
                                                       gensim_models=gensim_model_dict)
    batch_words.columns = batch_words.columns.astype(str)
    batch_scores.columns = batch_scores.columns.astype(str)
    return batch_words, batch_scores


for known_characters in KNOWN_CHARACTER_SETTINGS:
    words_path = DATA_FOLDER+known_characters+"_words.csv"
    scores_path = DATA_FOLDER+known_characters+"_scores.csv"

    # Resume from the checkpoint files when both exist; otherwise start with
    # the first N_SAMPLES rows and write the initial checkpoint.
    if os.path.isfile(words_path) and os.path.isfile(scores_path):
        words = pd.read_csv(words_path, index_col=0)
        scores = pd.read_csv(scores_path, index_col=0)
    else:
        words, scores = _predict_word_batch(0, N_SAMPLES, known_characters)
        words.to_csv(words_path)
        scores.to_csv(scores_path)

    # Process the remaining rows N_SAMPLES at a time, rewriting the checkpoint
    # after every batch so an interrupted run can pick up where it stopped.
    # NOTE(review): the next slice start is derived from the largest index
    # label in `words`, which assumes X_test has a default 0..n-1 index.
    while words.index.max() < X_test.index.max():
        start_index = words.index.max()+1
        end_index = start_index+N_SAMPLES
        new_words, new_scores = _predict_word_batch(start_index, end_index, known_characters)
        words = pd.concat([words,new_words])
        scores = pd.concat([scores,new_scores])
        words.to_csv(words_path)
        scores.to_csv(scores_path)
        

matching with twitter vocabulary
