In [22]:
import pandas as pd
import numpy as np
import gensim.downloader
from custom_transformers import PCAFeatures, SimilarityPrediction
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [25]:
# Work on small fixed subsets: rows 450-469 of the training file and the first
# five rows of the test file (presumably to keep this exploratory run fast).
# `.copy()` detaches each subset from the frame returned by read_csv, so the
# column assignments below act on independent frames rather than on slices.
train = pd.read_csv('data/train_data.csv').iloc[450:470].copy()
test = pd.read_csv('data/test_data.csv').head().copy()

# A missing clue is read as NaN; replace it with an empty string in BOTH subsets
# so that train and test receive identical preprocessing.
# NOTE(review): assumes the test file has the same 'clue' column as the training file.
train['clue'] = train['clue'].fillna('')
test['clue'] = test['clue'].fillna('')

In [26]:
# Preview the first three rows of the training subset.
train.head(n=3)

Unnamed: 0,clue,noun_involved,fill_blank,word_count,answer_length,0.0%_known_characters,10.0%_known_characters,20.0%_known_characters,30.0%_known_characters,40.0%_known_characters,50.0%_known_characters,60.0%_known_characters,answer,twitter,twitter_cosine_similarity,google,google_cosine_similarity,wiki,wiki_cosine_similarity
450,water lily,0,0,2,5,_____,_____,l____,_o__s,lo___,l___s,l_to_,lotos,True,-0.031981,False,,True,-0.255879
451,personal aspects,0,0,2,5,_____,_____,____s,_i_n_,_ie__,__en_,mi__s,miens,True,0.066686,False,,False,
452,playing round,0,0,2,6,______,_____r,___f__,_ol___,g__f__,g___er,g__fer,golfer,True,0.441435,True,0.316615,True,0.443692


In [27]:
# Separate the target column ('answer') from the remaining feature columns,
# for the training and the test subset alike.
X_train, y_train = train.drop(columns=['answer']), train['answer']
X_test, y_test = test.drop(columns=['answer']), test['answer']

In [5]:
# Pretrained embedding models, fetched through gensim's downloader API.
twitter_gensim = gensim.downloader.load('glove-twitter-25')
google_gensim = gensim.downloader.load('word2vec-google-news-300')
wiki_gensim = gensim.downloader.load('glove-wiki-gigaword-100')

# Lookup table from a short model name to the loaded model. The same short names
# appear as column prefixes in the data (e.g. 'twitter_cosine_similarity').
gensim_model_dict = dict(
    twitter=twitter_gensim,
    google=google_gensim,
    wiki=wiki_gensim,
)

In [36]:
# Create the PCAFeatures transformer from the loaded embedding models and fit it
# on the training split only.
pca_features = PCAFeatures(gensim_model_dict)
pca_features.fit(X_train, y_train)
# transform() returns an (X, y) pair; both splits are overwritten with the result.
# NOTE(review): because the inputs are rebound to the outputs, re-running this cell
# feeds already-transformed frames back into transform(). Re-run the split cell
# (or restart the kernel) before executing this cell a second time.
X_train, y_train = pca_features.transform(X_train,y_train)
X_test, y_test =  pca_features.transform(X_test,y_test)

In [33]:
# One independent, unfitted LinearRegression per embedding model name.
linear_regression_dict = {
    model_name: LinearRegression()
    for model_name in ('twitter', 'google', 'wiki')
}

In [38]:
# Pair the embedding models with their regressors and fit on the training split.
similarity_predictor = SimilarityPrediction(
    gensim_model_dict=gensim_model_dict,
    predictor_dict=linear_regression_dict,
)
similarity_predictor.fit(X_train, y_train)

# Only the test split is passed through transform(); the equivalent call for the
# training split is currently disabled:
# X_train, y_train = similarity_predictor.transform(X_train, y_train)
X_test, y_test = similarity_predictor.transform(X_test, y_test)

twitter, LinearRegression()
Mean Absolute Error: 0.34016632534186153
Median Absolute Error: 0.35822516096548995
google, LinearRegression()
Mean Absolute Error: 0.10567658558756352
Median Absolute Error: 0.043020276509598526
wiki, LinearRegression()
Mean Absolute Error: 0.26165524985032396
Median Absolute Error: 0.28021248577326524


In [39]:
# Show the two google similarity columns of the test frame side by side.
X_test[[
    'google_cosine_similarity',
    'google_predicted_similarity',
]]

Unnamed: 0,google_cosine_similarity,google_predicted_similarity
0,0.217814,0.159361
1,0.152758,0.109738
2,0.530712,0.152117
3,0.192443,0.199916
4,0.205076,0.164235
