In [None]:
# setup
import numpy as np
import pandas as pd
import scipy as sc
import sklearn
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
import statsmodels.api as sm
import sys
### Gensim is outside the anaconda distribution ###
### uncomment to install Gensim ###
#!{sys.executable} -m pip install gensim
import gensim
import gensim.downloader as model_api

# Load pretrained word embeddings through gensim's downloader API.
# This will download 60mb of data the first time it's loaded.
# NOTE(review): the "-50" suffix presumably means 50-dimensional vectors;
# confirm with word_vectors.vector_size after loading.
word_vectors = model_api.load("glove-wiki-gigaword-50")

## Sentence embeddings

There is some good information on decomposing word embeddings on [Jay Alammar's blog](http://jalammar.github.io/illustrated-word2vec/).

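Word embedding dimensions capture high-level concepts, which is what lets vector algebra "work" when results are compared by cosine similarity (for example, `king - man + woman` lands close to `queen`).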

The simplest, and often surprisingly effective, way to represent a sentence is to sum or average the embeddings of the sentence's words. There are [some better methods](https://openreview.net/forum?id=SyK00v5xx) using weights, or using deep learning language models, but summed or averaged word embeddings are often just as good while being simpler.

In [None]:
# sentence embedding

# Read the tab-separated review file and normalise every column name to
# lower case so columns can be accessed as attributes.
df = pd.read_csv('../data/Restaurant_Reviews.tsv', sep='\t').rename(columns=str.lower)
df

In [None]:
# split the words

# Split each review on whitespace and spread the tokens across columns:
# one row per review, one column per token position.
words = pd.DataFrame(df.review.str.split().tolist())
words


In [None]:
# clean up the words with regex 
import re

# Map each punctuation character to its replacement text. A comma is removed
# outright; every other symbol becomes a single space. Keys are inserted in
# the order they will appear in the compiled alternation below.
replaceDict = {symbol: " " for symbol in "{}"}
replaceDict[","] = ""
replaceDict.update({symbol: " " for symbol in ".!\\/$%^?'\"()*+-=:;][`~"})

# Regex-escaped symbol -> replacement, so metacharacters are matched literally.
rep = {re.escape(symbol): substitute for symbol, substitute in replaceDict.items()}
pattern = re.compile('|'.join(rep))


def replacer(text):
    """Return the replacement string for a single regex match.

    `text` is the match object supplied by the regex engine; the matched
    character is escaped again so it can be looked up in `rep`.
    """
    matched = text.group(0)
    return rep[re.escape(matched)]

# Strip punctuation, lower-case, then split each review into word tokens.
# regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False, and
# in that mode a compiled pattern or a callable replacement raises ValueError.
words = df.review.str.replace(pattern, replacer, regex=True).str.lower().str.split()
# One row per review, one column per token position; shorter reviews are
# padded with missing values on the right.
words = pd.DataFrame(words.tolist())
words

### Sentence embeddings quickly

This is a short way to generate sentence embeddings from a column.

It's not very efficient but can be optimized a lot.

In [None]:
def soft_get(w, vectors=None):
    """Look up the embedding for one token, falling back to a zero vector.

    Parameters
    ----------
    w : str
        Token to look up.
    vectors : mapping-like, optional
        Object supporting ``vectors[token]`` (raising ``KeyError`` for unknown
        tokens) and exposing ``vector_size``. Defaults to the notebook-level
        ``word_vectors``.

    Returns
    -------
    numpy.ndarray
        Whatever ``vectors[w]`` returns, or ``vector_size`` zeros when the
        token is unknown.
    """
    if vectors is None:
        vectors = word_vectors
    try:
        return vectors[w]
    except KeyError:
        # Out-of-vocabulary tokens contribute nothing to a sentence sum.
        return np.zeros(vectors.vector_size)


def map_vectors(row, vectors=None):
    """Sum the word vectors of one tokenised review into a sentence embedding.

    Parameters
    ----------
    row : pandas.Series
        One row of the token table: tokens followed by missing-value padding.
    vectors : mapping-like, optional
        Embedding lookup forwarded to ``soft_get``. Defaults to the
        notebook-level ``word_vectors``.

    Returns
    -------
    numpy.ndarray
        Element-wise sum of the token vectors; all zeros when the row holds
        no tokens.
    """
    if vectors is None:
        vectors = word_vectors
    # Use this row's own non-missing cells. Masking with the first review's
    # pattern truncated longer reviews and fed padding values from shorter
    # reviews into the lookup.
    tokens = row.dropna()
    if tokens.empty:
        # Summing nothing would give a scalar 0; keep the output shape stable.
        return np.zeros(vectors.vector_size)
    return np.sum([soft_get(token, vectors) for token in tokens], axis=0)

# Embed each review (row-wise), then stack the per-review vectors into a
# frame with one column per embedding dimension.
per_review = words.apply(map_vectors, axis=1)
emb = pd.DataFrame(per_review.tolist())
emb

In [None]:
import statsmodels.api as sm

# Ordinary least squares of the `liked` label on the sentence-embedding
# columns, with an intercept column added to the design matrix.
design_matrix = sm.add_constant(emb)
ols_fit = sm.OLS(df.liked, design_matrix).fit()
ols_fit.summary()