In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler,Normalizer

import spacy

In [2]:
# Load the wine-review dataset; the path is relative to the working directory.
csv_path = 'winemag-data-130k-v2.csv'
data = pd.read_csv(csv_path)

In [3]:
# Keep US wines only and discard the columns that are not used as features.
# 'Unnamed: 0' is presumably the row index written out when the CSV was saved.
data = (
    data
    .drop(columns=['Unnamed: 0'])
    .loc[lambda frame: frame['country'] == 'US']
    .drop(columns=['designation', 'country', 'taster_name', 'taster_twitter_handle', 'title'])
)

In [4]:
# Count the missing values per column
data.isna().sum()

description       0
points            0
price           239
province          0
region_1        278
region_2       3993
variety           0
winery            0
dtype: int64

In [5]:
# Drop every row that still has a missing value in any remaining column,
# then separate the regression target ('points') from the features.
data = data.dropna(axis=0, how='any')
X = data.drop(columns=['points'])
y = data.loc[:, ['points']]  # kept as a single-column DataFrame

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Only the document vectors are used downstream, so the tagger, parser and NER
# components are switched off when loading the model.
nlp = spacy.load(
    "en_core_web_lg",
    disable=["tagger", "parser", "ner"],
)

In [8]:
# Pull the review texts out as plain Python lists, in the same row order as X_train / X_test.
description_train = X_train["description"].tolist()
description_test = X_test["description"].tolist()

In [9]:
# Embed every description as its spaCy document vector.
# nlp.pipe streams the texts through the pipeline in batches, which is much
# faster than calling nlp() once per description and yields the same vectors
# in the same order.
docs_train = [doc.vector for doc in nlp.pipe(description_train)]
docs_test = [doc.vector for doc in nlp.pipe(description_test)]

In [10]:
# Stack the per-description vectors row-wise into one 2-D feature matrix per split.
X_train_desc = np.vstack(docs_train)
X_test_desc = np.vstack(docs_test)

In [11]:
# Non-text features: everything in X except the free-text description.
X_train_wo_desc = X_train.drop(columns="description")
X_test_wo_desc = X_test.drop(columns="description")

In [12]:
# Ridge regression on the word-embedding matrix alone, tuning the
# regularisation strength over 20 log-spaced values in [1e-2, 1e2] with 5-fold CV.
pipeline_new = Pipeline(steps=[('regressor', Ridge())])
param_grid = [{'regressor__alpha': np.logspace(-2, 2, num=20)}]

grid = GridSearchCV(estimator=pipeline_new, param_grid=param_grid, cv=5)
grid.fit(X_train_desc, y_train)

rule = "--------------------------------------------------------------------------------"
print(rule)
print("Results for regression using Word Embedding only")
print(rule)
print(f"The best parameters after grid search: {grid.best_params_!s}")
# grid.score evaluates the refit best estimator on the held-out split.
print(f"The score on test set after grid search is: {grid.score(X_test_desc, y_test)!s}")

--------------------------------------------------------------------------------
Results for regression using Word Embedding only
--------------------------------------------------------------------------------
The best parameters after grid search: {'regressor__alpha': 0.29763514416313175}
The score on test set after grid search is: 0.5631487389020682


Using only the word embeddings, the test R2 drops from 0.70 (tuned bag-of-words model) to 0.56.
Next, we try combining the BoW model with the word embeddings.

In [13]:
def _embedding_frame(vectors, texts):
    """Return a DataFrame of the embedding columns plus a 'descriptions' text column.

    All column labels are converted to strings so the frame has uniformly
    typed column names.
    """
    frame = pd.DataFrame(vectors)
    frame["descriptions"] = texts
    frame.columns = frame.columns.map(str)
    return frame


# One frame per split: embedding dimensions side by side with the raw text,
# so a column transformer can vectorise the text and pass the rest through.
X_train_BOW_embed = _embedding_frame(X_train_desc, description_train)
X_test_BOW_embed = _embedding_frame(X_test_desc, description_test)

In [14]:
# Basic bag-of-words (default CountVectorizer) on the text column; the
# embedding columns are passed through untouched and both feed one Ridge model.
text_pipeline = make_pipeline(CountVectorizer())

preprocess = make_column_transformer(
    (text_pipeline, 'descriptions'),
    remainder="passthrough",
)

# alpha=0.3 is presumably the rounded optimum from the embedding-only grid search.
pipe = make_pipeline(preprocess, Ridge(alpha=0.3))
pipe.fit(X_train_BOW_embed, y_train)
s1 = pipe.score(X_test_BOW_embed, y_test)

rule = "--------------------------------------------------------------------------------"
for line in (
    rule,
    "Results for regression using Word Embedding and Bag Of Words:",
    rule,
    f"Score on the test set is: {s1!s}",
):
    print(line)

--------------------------------------------------------------------------------
Results for regression using Word Embedding and Bag Of Words:
--------------------------------------------------------------------------------
Score on the test set is: 0.6773992066579624


In [15]:
# Tuned bag-of-words: character n-grams (1-3 chars, built within word
# boundaries) that occur in at least 4 documents, then row normalisation and
# TF-IDF weighting.
# stop_words="english" was removed: scikit-learn only applies stop words when
# analyzer == "word", so with "char_wb" it was ignored and merely triggered a
# UserWarning. The extracted features are unchanged.
text_pipeline = make_pipeline(
    CountVectorizer(ngram_range=(1, 3), min_df=4, analyzer="char_wb"),
    Normalizer(),
    TfidfTransformer(),
)

# Vectorise the text column; the embedding columns are passed through as-is.
preprocess = make_column_transformer(
    (text_pipeline, 'descriptions'),
    remainder="passthrough",
)

pipe = make_pipeline(preprocess, Ridge(alpha=0.3))
pipe.fit(X_train_BOW_embed, y_train)
s1 = pipe.score(X_test_BOW_embed, y_test)

print("--------------------------------------------------------------------------------")
print("Results for regression using Word Embedding and Bag Of Words:")
print("--------------------------------------------------------------------------------")
print("Score on the test set is: " + str(s1))

--------------------------------------------------------------------------------
Results for regression using Word Embedding and Bag Of Words:
--------------------------------------------------------------------------------
Score on the test set is: 0.7012543636242331


In [16]:
# Tabular preprocessing: standardise the numeric price and target-encode the
# categorical columns.
numeric_features = ['price']
categorical_features = ['province', 'region_1', 'region_2', 'variety', 'winery']

preprocess = make_column_transformer(
    (StandardScaler(), numeric_features),
    (TargetEncoder(), categorical_features),
)

In [17]:
# Fit the scaler / target encoder on the training rows (the encoder needs the
# targets), then apply the fitted transformer to the test rows.
# NOTE(review): both names are overwritten with the transformed output, so
# re-running this cell requires re-running the cell that builds them first.
X_train_wo_desc = preprocess.fit_transform(X_train_wo_desc, y=y_train)
X_test_wo_desc = preprocess.transform(X_test_wo_desc)

In [18]:
# Place the embedding columns and the preprocessed non-text columns side by side.
X_train_w_desc = np.concatenate([X_train_desc, X_train_wo_desc], axis=1)
X_test_w_desc = np.concatenate([X_test_desc, X_test_wo_desc], axis=1)

In [22]:
# Ridge with its default regularisation on embeddings + non-text features;
# no grid search is run for this model.
r = Ridge()
r.fit(X_train_w_desc, y_train)
s1 = r.score(X_test_w_desc, y_test)

rule = "--------------------------------------------------------------------------------"
for line in (
    rule,
    "Results for regression using Word Embedding with non-text features",
    rule,
    f"The score on test set is: {s1!s}",
):
    print(line)

--------------------------------------------------------------------------------
Results for regression using Word Embedding with non-text features
--------------------------------------------------------------------------------
The score on test set is: 0.6734143946822839


After trying each combination of features, we get the following test-set R2 scores:
1. Word Embeddings alone = 0.56
2. BoW alone = 0.67
3. Tuned BoW alone = 0.700
4. BoW Basic model (using only CountVectorizer) + Word Embeddings = 0.67
5. Tuned BoW model (from 1.3) + Word Embeddings = 0.701
6. Non-text features + Word Embeddings = 0.67
7. Non-text features + Tuned BoW model = 0.78

So, we can see that the best model uses Non-text features + the Tuned BoW model.
Adding the word embeddings to our tuned BoW model increases the R2 by only 0.001 (from 0.700 to 0.701).