# Task 1

In [4]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor

In [5]:
df = pd.read_csv("./data/winemag-data-130k-v2.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## Initial cleaning for overall use

Before starting, we clean the data for use throughout the project. Tasks 1.1 and 1.2 differ in requirements, so we will clean each in different ways. However, both tasks will benefit from removing duplicates and dropping rows with missing values. We chose to drop rows with missing values instead of imputing based on past experience.

The original data had no duplicates, so our cleaned data baseline deals with removing null values and dropping columns that we are fairly confident will not be very important for any tasks. Since the name and Twitter handle of the evaluater likely has little to do with the wine itself, we dropped these columns. Because more than half of the samples were missing information for region_2, we decided to exclude this column instead of omitting a considerable amount of data.

In [6]:
# only keep US samples
df = df[df.country == "US"]


# duplicates
print("entries before dropping dups: ", len(df))
clean = df.drop_duplicates(['Unnamed: 0'])
print("entries after dropping dups:  ", len(df))

# dropping unimportant features
drop_features = ["taster_name", "taster_twitter_handle", "region_2"]
clean = clean.drop(drop_features, axis=1)

# dropping rows with null values
clean.isnull().sum()

entries before dropping dups:  54504
entries after dropping dups:   54504


Unnamed: 0         0
country            0
description        0
designation    17596
points             0
price            239
province           0
region_1         278
title              0
variety            0
winery             0
dtype: int64

In [7]:
clean = clean.dropna()
clean.isnull().sum()

Unnamed: 0     0
country        0
description    0
designation    0
points         0
price          0
province       0
region_1       0
title          0
variety        0
winery         0
dtype: int64

In [8]:
clean.to_csv('./wine_clean.csv', index=False)

## 1.1 Create a baseline model for predicting wine quality using only non-text features.

In [9]:
t1_drop = ['Unnamed: 0', 'description', 'title']
clean_t1 = clean.drop(t1_drop, axis=1)
clean_t1.columns

Index(['country', 'designation', 'points', 'price', 'province', 'region_1',
       'variety', 'winery'],
      dtype='object')

This task asks us to only use non-text features, so we can drop the text description of the wine and the title of the review. 

In [10]:
print("total samples: ", clean_t1['country'].count())
for c in clean_t1.columns:
    print(c, ": ", clean_t1[c].nunique())


total samples:  36558
country :  1
designation :  14063
points :  21
price :  159
province :  25
region_1 :  253
variety :  231
winery :  4022


Out of approximately 70K samples, there are 14K unique values for 'designation' and 4 for 'winery'. Though the vineyard and winery from might provide useful information for prediction, because there are so many possible values we'll drop these columns. This may be a choice we revisit after scoring the model. Since we are only considering wines from the US, we can also drop 'country'.

In [11]:
clean_t1 = clean_t1.drop('designation', axis=1)
clean_t1 = clean_t1.drop('winery', axis=1)
clean_t1 = clean_t1.drop('country', axis=1)


In [12]:
clean_t1 = pd.get_dummies(clean_t1)

In [13]:
clean_t1.head()

Unnamed: 0,points,price,province_Arizona,province_California,province_Colorado,province_Connecticut,province_Hawaii,province_Idaho,province_Illinois,province_Iowa,...,variety_Vignoles,variety_Viognier,variety_Viognier-Chardonnay,variety_Viognier-Gewürztraminer,variety_Viognier-Roussanne,variety_White Blend,variety_White Port,variety_White Riesling,variety_Zinfandel,variety_Zweigelt
3,87,13.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,87,65.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,87,19.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,87,23.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,87,22.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
features = clean_t1.columns.tolist()
features.remove('points')

X = clean_t1[features]
y = clean_t1['points']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [15]:
np.mean(cross_val_score(GradientBoostingRegressor(loss='ls', n_estimators=100), X_train, y_train, cv=5))

0.33843173459922193

In [16]:
reg = Pipeline(steps=[('scaler', StandardScaler()),
 ('regressor', LinearRegression())])

param_grid = {'regressor': [Ridge(), Lasso(), ElasticNet()],
 'regressor__alpha': np.logspace(-3, 3, 7)}
grid = GridSearchCV(reg, param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best Estimator: ", grid.best_estimator_[1])

Best Estimator:  Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)


In [17]:
print("Best Score: ", grid.best_score_)

Best Score:  0.2705535224858449


With GB and Lasso as some baseline models trained on non-text features only, our accuracy is around 30%. This might be because the quality of a wine can't necessarily be determined soley on where it's made and what it's made of. The wines of a certain vineyard will not always be predictably better than the wines of another; a pinot noir is not always better than a chardonnay. To more accurately predict the quality of the wine, we'll need text data that reflects how wine enthusiasts receive it.

## 1.2 Create a simple text-based model using a bag-of-words approach and a linear model.

We'll start back with our baseline cleaned data which removed duplicates and rows with missing values. The features can be seen below.

In [18]:
clean.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
10,10,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
20,20,US,Ripe aromas of dark berries mingle with ample ...,Vin de Maison,87,23.0,Virginia,Virginia,Quiévremont 2012 Vin de Maison Red (Virginia),Red Blend,Quiévremont
23,23,US,This wine from the Geneseo district offers aro...,Signature Selection,87,22.0,California,Paso Robles,Bianchi 2011 Signature Selection Merlot (Paso ...,Merlot,Bianchi


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV


We'll use description and title for our bag of words model, and we'll concatenate them into the same column.

In [20]:
desc = clean['description'].tolist()
title = clean['title'].tolist()
data = [i + j for i, j in zip(desc, title)] 
    
target = clean['points'].tolist()
text_trainval, y_trainval = data, target

In [21]:
print("type of text_train: ", type(text_trainval))
print("length of text_train: ", len(text_trainval))

type of text_train:  <class 'list'>
length of text_train:  36558


In [22]:
text_train, text_val, y_train, y_val = train_test_split(
    text_trainval, y_trainval, stratify=y_trainval, random_state=0)
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)
X_train

<27418x21560 sparse matrix of type '<class 'numpy.int64'>'
	with 1194151 stored elements in Compressed Sparse Row format>

In [23]:
feature_names = vect.get_feature_names()
print(feature_names[::200])


['000', '2000', '554', 'accurately', 'ago', 'amarone', 'anticipate', 'armida', 'au', 'balled', 'beaureagard', 'bethany', 'bloody', 'botticelli', 'briggs', 'bullhorn', 'caliber', 'carlton', 'centennial', 'cheese', 'cinquain', 'coatings', 'complejo', 'constrained', 'coteaux', 'crocker', 'dacon', 'declines', 'designations', 'discloses', 'dornfelder', 'durif', 'eleganty', 'enkidu', 'estrella', 'explosively', 'fault', 'finished', 'flowers', 'frances', 'fuse', 'gestalt', 'gooseberries', 'gripped', 'handrails', 'heins', 'holes', 'hélène', 'increasingly', 'interfere', 'jammiest', 'juiciness', 'kim', 'laker', 'leelanau', 'limitation', 'loosened', 'machado', 'mari', 'mayo', 'merit', 'mirassou', 'moraine', 'mustiness', 'never', 'nourished', 'okay', 'ouest', 'pagnano', 'patience', 'perfected', 'pin', 'plungerhead', 'pouring', 'pro', 'punchdown', 'raiding', 'recreate', 'replay', 'rhônes', 'rockroom', 'rule', 'sant', 'scopus', 'senility', 'shelton', 'silverwood', 'slither', 'soléna', 'spices', 'star

In [24]:
reg = Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
 ('regressor', Ridge(alpha=1000))])
np.mean(cross_val_score(reg, X_train, y_train, cv=5))

0.5234095030126218

Per Prof. Mueller's suggestion on Piazza, we used Ridge for our linear model alongside our bag of words approach. Our cross validation score was 54%, which is better than our baseline model, but still not great. This can likely be improved with TFIDF scaling and especially n-grams because our current model does not preserve much semantic meaning.

## 1.3 Try using n-grams, characters, tf-idf rescaling and possibly other ways to tune the BoW model. Be aware that you might need to adjust the (regularization of the) linear model for  different feature sets

Before looking into n-grams, characters, and tf-idf, we can tune the BoW model in a simple way by including stop words and by setting minimum document frequency.

In [25]:
vect_1 = CountVectorizer(stop_words='english')
vect_2 = CountVectorizer(min_df=15)
vect_3 = CountVectorizer(min_df=15, stop_words='english')
X_train_tune1 = vect_1.fit_transform(text_train)
X_train_tune2 = vect_2.fit_transform(text_train)
X_train_tune3 = vect_3.fit_transform(text_train)


reg = Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
 ('regressor', Ridge(alpha=1000))])
print("stop words only: ", np.mean(cross_val_score(reg, X_train_tune1, y_train, cv=5)))
print("minimum document frequency only: ", np.mean(cross_val_score(reg, X_train_tune2, y_train, cv=5)))
print("both: ", np.mean(cross_val_score(reg, X_train_tune3, y_train, cv=5)))

stop words only:  0.5098111212436269
minimum document frequency only:  0.6541313857409923
both:  0.6429228649369435


Tuning by just adding stopwords doesn't seem to improve the model, which is consistent with Prof. Mueller's note that stopwords tend to be less impactful in supervised learning. However, increasing min_df to 15 improved our accuracy from ~50% to ~70%. Combining stop_words with min_df=15 doesn't do better than min_df=15 on its own. This leads us to conclude that minimum document frequency is an important tuning parameter. This might be explained because many words can be used to describe wine. From the feature_names we extracted, we can see just the words describing taste range from "tuna" to "gooseberries" to "veggies." There are so many words that can potentially describe wine, so it makes sense that only incredibly common ones will be important in predicting quality.

We'll try TD-IDF next, continuing to include stop words and limit by document frequency:

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_train)
# print(vectorizer.get_feature_names())
reg = Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
 ('regressor', Ridge(alpha=1000))])
print(np.mean(cross_val_score(reg, X, y_train, cv=5)))

vectorizer2 = TfidfVectorizer(min_df=20, stop_words='english')
X2 = vectorizer2.fit_transform(text_train)
print(np.mean(cross_val_score(reg, X2, y_train, cv=5)))


0.5304077413155303
0.6671420305686425


TD-IDF seems to do little; we see that it scores the same as our original BoW model both with and without setting stopwords and min_df.

#### N-grams

In [27]:
vect_ngram = CountVectorizer(ngram_range=(1, 4), min_df=20, stop_words='english')
X_train_ngram = vect_ngram.fit_transform(text_train)

reg2 = Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
 ('regressor', Ridge(alpha=1000))])
print(np.mean(cross_val_score(reg2, X_train_ngram, y_train, cv=5)))

0.6053031301536882


In [28]:
print("Vocabulary:\n", vect_ngram.get_feature_names()[::200])


Vocabulary:
 ['000', '2014 red', 'adding complexity', 'apricot jam', 'barbecue', 'black cherry dark', 'blended', 'brilliant acidity', 'cadeau', 'chardonnay finger', 'chocolate smoky', 'combines', 'crisp green', 'delicious drink', 'drink rich', 'end', 'fade', 'flavors baked', 'floral note', 'fruit flavors wine', 'gradually', 'harvest wine', 'ignore', 'kendall', 'lemon orange', 'lohr', 'means', 'months new french oak', 'noir rosé willamette', 'offers ripe', 'patz', 'pinot noir arroyo', 'presence', 'raspberries', 'regular bottling', 'right', 'runs', 'savory', 'signature selection', 'softly tannic', 'state', 'sweet black', 'tannins need time', 'toasted', 'vanilla custard', 'vineyard reserve cabernet', 'wet rock', 'wine showing']


#### Characters

In [29]:
cv_char = CountVectorizer(ngram_range=(2, 3), analyzer="char_wb", min_df=20, stop_words='english')
X_train_char = cv_char.fit_transform(text_train)

reg3 = Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
 ('regressor', Ridge(alpha=100))])
print(np.mean(cross_val_score(reg2, X_train_char, y_train, cv=5)))


0.67964175240566


## 1.4 Combine the non-text features and the text features. How does adding those features improve upon just using bag-of-words?

As a reminder the dataframe clean_t1 is a one-hot encoding of the following features: points, price, region_1, variety:

In [30]:
clean_t1.head()

Unnamed: 0,points,price,province_Arizona,province_California,province_Colorado,province_Connecticut,province_Hawaii,province_Idaho,province_Illinois,province_Iowa,...,variety_Vignoles,variety_Viognier,variety_Viognier-Chardonnay,variety_Viognier-Gewürztraminer,variety_Viognier-Roussanne,variety_White Blend,variety_White Port,variety_White Riesling,variety_Zinfandel,variety_Zweigelt
3,87,13.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,87,65.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,87,19.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,87,23.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,87,22.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
len(clean_t1)

36558

In [32]:
tf = ['description', 'title']
text_features = clean[tf]
type(text_features)

nontext_features = clean_t1
type(nontext_features)

both = pd.concat([text_features, nontext_features], axis=1)
both = both.drop('points', axis=1)
both.head()


text_trainval, y_trainval = both, target

text_train, text_val, y_train, y_val = train_test_split(
    text_trainval, y_trainval, stratify=y_trainval, random_state=0)


preprocessor = make_column_transformer(
    (CountVectorizer(min_df=20, stop_words='english'), 'description'),
    (CountVectorizer(min_df=20, stop_words='english'), 'title'))


reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('scaler', StandardScaler(with_mean=False)),
                      ('regressor', Ridge(alpha=1000))])

print(np.mean(cross_val_score(reg, text_train, y_train)))








0.6735494454917417


In [36]:
clean_t1.to_csv("./clean_wine2.csv")