## Pritam Biswas (pb2796)

In [237]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from category_encoders import TargetEncoder
from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Task 1

### 1.1

In [41]:
# Load the raw wine-review data; the path is relative, so the CSV must be in the working directory.
data = pd.read_csv('winemag-data-130k-v2.csv')

In [42]:
# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [43]:
# Keep only the reviews whose country is 'US'.
data = data[data['country'] == 'US']

In [44]:
# Number of rows left after the US filter.
len(data)

54504

In [45]:
# Remove columns that are not used as features: 'Unnamed: 0' (looks like a saved
# row index), the taster's Twitter handle, and 'country' (constant after the US filter).
data = data.drop(columns=['Unnamed: 0', 'taster_twitter_handle', 'country'])

In [46]:
# Check the frame after filtering rows and dropping columns.
data.head()

Unnamed: 0,description,designation,points,price,province,region_1,region_2,taster_name,title,variety,winery
2,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
10,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
12,"Slightly reduced, this wine offers a chalky, t...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini


In [47]:
# Report the current row count, then keep a seeded random half of the rows.
# NOTE: this cell reassigns `data`, so re-running it halves the data again.
print(data.shape[0])
data = data.sample(frac=0.5, random_state=1)

54504


In [48]:
# Row count after sub-sampling.
print(len(data))

27252


In [49]:
# Model on non-text features only.
# Target: the review score. Features: everything except the target and the free-text column.
y = data['points']
text_f = ['description']
data_num = data.drop(columns=text_f + ['points'])
data_num.head()

Unnamed: 0,designation,price,province,region_1,region_2,taster_name,title,variety,winery
118731,,12.0,California,California,California Other,Jim Gordon,Mirassou 2014 Moscato (California),Moscato,Mirassou
12277,Estate Reserve,50.0,California,Napa Valley,Napa,Paul Gregutt,Storybook Mountain 1997 Estate Reserve Zinfand...,Zinfandel,Storybook Mountain
80706,Snakepit Red,60.0,California,Alexander Valley,Sonoma,Virginie Boone,Medlock Ames 2013 Snakepit Red Red (Alexander ...,Bordeaux-style Red Blend,Medlock Ames
1494,Vintage Port,34.0,California,Paso Robles,Central Coast,,Robert Hall 2005 Vintage Port Port (Paso Robles),Port,Robert Hall
43985,Les Vignes De Marcoux,15.0,Washington,Yakima Valley,Columbia Valley,Paul Gregutt,San Juan Vineyards 2009 Les Vignes De Marcoux ...,Riesling,San Juan Vineyards


In [52]:
def extract_date(titles):
    """Extract the year embedded in each wine title.

    Punctuation (``string.punctuation``) is removed from the title, which is then
    split on whitespace. A token counts as a year when it is exactly four decimal
    digits and starts with '1' or '2'. If several tokens qualify, the last one wins.

    Parameters
    ----------
    titles : iterable
        Wine titles. Entries that are not strings (e.g. ``NaN`` or ``None`` for a
        missing title) are treated as having no year instead of raising.

    Returns
    -------
    list of float
        One entry per title, in order: the year as a float, or ``np.nan`` when
        no year token is found.
    """
    # Loop-invariant: build the punctuation-stripping table once.
    strip_punct = str.maketrans('', '', string.punctuation)
    res = []

    for title in titles:
        # Missing titles are not strings; str.translate would raise on them.
        if not isinstance(title, str):
            res.append(np.nan)
            continue

        year = np.nan
        for token in title.translate(strip_punct).split():
            # isdecimal() (not isdigit()) so that every accepted token is
            # guaranteed to be parseable by float().
            if len(token) == 4 and token.isdecimal() and token.startswith(('1', '2')):
                year = float(token)  # keep overwriting: the last match wins
        res.append(year)
    return res
                

In [53]:
# Pull the title column out of the frame as a plain Python list.
tmp=list(data_num['title'].values)

In [54]:
# One year (or NaN) per title, in the same order as the rows of data_num.
date_list=extract_date(tmp)

In [55]:
# Overwrite the 'title' column with the extracted year, so from here on 'title' is numeric.
data_num['title']=date_list

In [57]:
# Cardinality of each column; used below to choose an encoding per feature.
data_num.nunique()

designation    9097
price           148
province         26
region_1        238
region_2         17
taster_name      15
title            55
variety         203
winery         4471
dtype: int64

In [73]:
# Column dtypes; used below to separate numeric from categorical features.
data_num.dtypes

designation     object
price          float64
province        object
region_1        object
region_2        object
taster_name     object
title          float64
variety         object
winery          object
dtype: object

In [74]:
# Assign every column to a preprocessing branch, based on its dtype and number of unique values:
#   float columns               -> numeric branch
#   few distinct categories     -> one-hot encoding
#   many distinct categories    -> target encoding
num_features = ['price', 'title']  # 'title' holds the extracted year at this point
onehot_features = [
    'province',
    'region_2',
    'taster_name',
]
target_features = [
    'designation',
    'region_1',
    'variety',
    'winery',
]


In [85]:
# Numeric branch: fill missing values with the median, then standardize.
cont_preprocessor = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# Target-encoding branch: encode categories, fill remaining gaps with a constant, then standardize.
target_preprocessor = make_pipeline(
    TargetEncoder(),
    SimpleImputer(strategy='constant'),
    StandardScaler(),
)

# One-hot branch: fill missing categories with a constant, then one-hot encode;
# categories not seen during fit are ignored at transform time.
onehot_preprocessor = make_pipeline(
    SimpleImputer(strategy='constant'),
    OneHotEncoder(handle_unknown='ignore'),
)

In [86]:
# Route each feature group through its own preprocessing branch.
preprocessor_1 = make_column_transformer(
    (cont_preprocessor, num_features),
    (target_preprocessor, target_features),
    (onehot_preprocessor, onehot_features),
)

In [87]:
# Seeded train/test split of the non-text features (default split proportions).
X_train, X_test, y_train, y_test = train_test_split(data_num, y, random_state=1)

In [89]:
# Baseline candidate 1: Lasso with default hyperparameters on the non-text features.
# The reported number is the mean cross-validation score on the training split.
lasso_pipe = make_pipeline(preprocessor_1, Lasso())
cv_score = cross_val_score(lasso_pipe, X_train, y_train).mean()
print(cv_score)

0.2495570679983114


In [90]:
# Baseline candidate 2: Ridge with default hyperparameters on the non-text features.
# The reported number is the mean cross-validation score on the training split.
ridge_pipe = make_pipeline(preprocessor_1, Ridge())
cv_score = cross_val_score(ridge_pipe, X_train, y_train).mean()
print(cv_score)

0.41250709520915996


In [217]:
# Refit the Ridge baseline on the whole training split and score it on the held-out test split.
ridge_pipe.fit(X_train, y_train)
print(ridge_pipe.score(X_test, y_test))

0.42797230661150787


#### We notice that Ridge regression performs better than Lasso (both with default hyperparameters), so we use Ridge as the baseline model with no text features.

### 1.2

In [165]:
# The free-text review column, taken from the same (sampled) frame as the non-text features.
data_text=data.description

In [166]:
# Descriptions as a plain Python list. NOTE: this reuses the name `tmp` from the title extraction.
tmp=list(data_text.values)

In [218]:
def preprocess_text(text_list):
    """Normalize raw texts for bag-of-words vectorization.

    Each text is lower-cased, stripped of punctuation (``string.punctuation``) and
    of the digits 0-9, tokenized with ``word_tokenize``, lemmatized token by token
    with ``WordNetLemmatizer`` and re-joined with single spaces.

    Parameters
    ----------
    text_list : iterable
        Raw texts. Empty strings and non-string entries (e.g. ``NaN`` or ``None``
        for a missing description) produce ``''`` instead of raising.

    Returns
    -------
    list of str
        One cleaned string per input entry, in the same order.
    """
    # Loop-invariant helpers: build them once rather than once per document.
    lemm = WordNetLemmatizer()
    strip_punct = str.maketrans('', '', string.punctuation)
    digit_pattern = re.compile('[0-9]')

    res = []
    for text in text_list:
        # Non-string entries (a missing value may arrive as float NaN or None)
        # would make len()/lower() raise, so treat them like empty text.
        if not isinstance(text, str) or len(text) == 0:
            res.append('')
            continue

        # NOTE: punctuation is deleted, not replaced by a space, so hyphenated
        # words fuse into a single token (e.g. 'pear-flavored' -> 'pearflavored').
        text = text.lower().translate(strip_punct)
        text = digit_pattern.sub('', text)
        tokens = [lemm.lemmatize(token) for token in word_tokenize(text)]
        res.append(' '.join(tokens))
    return res

In [168]:
# Clean every description and keep the results as a NumPy array, in the row order of `data`.
text_list=np.array(preprocess_text(tmp))

In [220]:
# Train/test split of the cleaned texts, using the same random_state as the non-text split.
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(text_list, y, random_state=1)

In [221]:
# NOTE(review): `cv` is not referenced by any later visible cell (each one builds a fresh vectorizer) — likely removable.
cv=CountVectorizer(stop_words='english')

In [222]:
# Fit a bag-of-words vectorizer on the training texts only; the last line displays the
# resulting document-term matrix. NOTE: `tmp` is reused here and previously held a list of strings.
tmp=CountVectorizer(stop_words='english').fit(X_train_t)
tmp.transform(X_train_t)

<20439x15792 sparse matrix of type '<class 'numpy.int64'>'
	with 472816 stored elements in Compressed Sparse Row format>

In [223]:
# Vocabulary learned by the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# `get_feature_names_out()` when it exists and fall back to the old name on older versions.
if hasattr(tmp, 'get_feature_names_out'):
    f = list(tmp.get_feature_names_out())  # list, so slicing/printing looks the same as before
else:
    f = tmp.get_feature_names()
print(len(f))
print(f[::1000])  # every 1000th term, as a small sample of the features

15792
['aand', 'bay', 'cancer', 'concerned', 'disgorgement', 'faux', 'grandpa', 'integrating', 'locke', 'mumm', 'pearflavored', 'ramazzotti', 'scored', 'stanly', 'thanks', 'ventosas']


In [224]:
# Text-only model: bag-of-words counts (English stop words removed) feeding a default Ridge.
# The reported number is the mean cross-validation score on the training split.
ridge_pipe2 = make_pipeline(CountVectorizer(stop_words='english'), Ridge())
score = cross_val_score(ridge_pipe2, X_train_t, y_train_t).mean()
print(score)

0.6094347106328996


In [225]:
# Refit the bag-of-words model on the whole training split and score it on the held-out test split.
ridge_pipe2.fit(X_train_t, y_train_t)
print(ridge_pipe2.score(X_test_t, y_test_t))

0.6299945607136033


#### A simple bag-of-words (BOW) model improves upon the result of the model without text features (test score ≈ 0.63 vs. ≈ 0.43).

### 1.3

In [187]:
# Variant 1: tf-idf weighting instead of raw counts, still with a default Ridge.
# The reported number is the mean cross-validation score on the training split.
tf_pipe2 = make_pipeline(TfidfVectorizer(stop_words='english'), Ridge())
score = cross_val_score(tf_pipe2, X_train_t, y_train_t).mean()
print(score)

0.6486964820874511


In [191]:
# Variant 2: unigram + bigram counts with a default Ridge, scored by cross-validation on the training split.
# NOTE(review): `preprocess` is not referenced in any visible cell; the pipeline below builds its own vectorizer.
preprocess = make_pipeline(CountVectorizer(ngram_range = (1,2),stop_words = 'english'))
ngram_pipe2=make_pipeline(CountVectorizer(ngram_range = (1,2),stop_words = 'english'), Ridge())
score=np.mean(cross_val_score(ngram_pipe2, X_train_t, y_train_t))
print(score)

0.662069561357225


In [227]:
# Jointly tune the vectorizer's n-gram range and Ridge's regularization strength.
# Parameter names follow the step names that make_pipeline derives from the class names.
grid = {
    "countvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "ridge__alpha": [0.01, 0.03, 0.1, 0.3, 1, 3, 10],
}
model = GridSearchCV(
    make_pipeline(CountVectorizer(stop_words='english'), Ridge()),
    param_grid=grid,
)
model.fit(X_train_t, y_train_t)


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('countvectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                              

In [228]:
# Best parameter combination and its mean cross-validation score.
# NOTE(review): if the chosen alpha is the largest value in the grid, consider widening the grid upwards.
print(model.best_params_)
print(model.best_score_)

{'countvectorizer__ngram_range': (1, 2), 'ridge__alpha': 10}
0.6748458256004974


In [229]:
# Score of the tuned text-only model on the held-out test split.
print(model.score(X_test_t,y_test_t))

0.6911454329111661


#### We notice that the n-gram based model achieves a better cross-validation score than the tf-idf based model, so we perform a grid search on it to find the best parameters. The model with the best parameters further improves the score on the test set.

### 1.4

In [230]:
# Attach the cleaned descriptions as a new column. Both data_num and text_list were derived
# from `data` without reordering, so this positional assignment lines up row by row.
data_num['descript']=list(text_list)

In [231]:
# Check the combined frame: non-text features plus the cleaned description.
data_num.head()

Unnamed: 0,designation,price,province,region_1,region_2,taster_name,title,variety,winery,descript
118731,,12.0,California,California,California Other,Jim Gordon,2014.0,Moscato,Mirassou,this is a great find in this category gentle f...
12277,Estate Reserve,50.0,California,Napa Valley,Napa,Paul Gregutt,1997.0,Zinfandel,Storybook Mountain,blackberry are the story here boatload of rich...
80706,Snakepit Red,60.0,California,Alexander Valley,Sonoma,Virginie Boone,2013.0,Bordeaux-style Red Blend,Medlock Ames,a majority of merlot speaks in this nicely app...
1494,Vintage Port,34.0,California,Paso Robles,Central Coast,,2005.0,Port,Robert Hall,made from authentic port variety this dessert ...
43985,Les Vignes De Marcoux,15.0,Washington,Yakima Valley,Columbia Valley,Paul Gregutt,2009.0,Riesling,San Juan Vineyards,offdry and stainless steel fermented it carry ...


In [232]:
# Text branch: unigram-to-trigram counts of the cleaned description, English stop words removed.
# NOTE(review): the grid search earlier in the notebook selected ngram_range=(1, 2), but (1, 3) is used here — confirm this is intentional.
ngram_preprocessor=make_pipeline(CountVectorizer(ngram_range = (1,3),stop_words = 'english'))
# NOTE(review): `ngram_feature` is not referenced in this cell; the column name is passed directly below.
ngram_feature=['descript']
# Same three branches as preprocessor_1 plus the text branch. The text column is selected with a
# plain string rather than a list — presumably so the vectorizer receives a 1-D sequence of documents.
preprocessor_2=make_column_transformer((cont_preprocessor, num_features),
                                    (target_preprocessor, target_features),
                                    (onehot_preprocessor, onehot_features),
                                      (ngram_preprocessor,'descript'))

In [233]:
# Train/test split of the combined features, using the same random_state as the earlier splits.
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(data_num, y, random_state=1)

In [234]:
# Full model: every feature branch feeding a Ridge regressor with alpha=10
# (the alpha selected by the earlier grid search on the text-only model).
# The reported number is the mean cross-validation score on the training split.
f_pipe2 = make_pipeline(preprocessor_2, Ridge(alpha=10))
score = cross_val_score(f_pipe2, X_train_f, y_train_f).mean()
print(score)

0.7341897140128684


In [235]:
# Refit the full model on the whole training split.
f_pipe2.fit(X_train_f, y_train_f)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

In [236]:
# Score of the full (text + non-text) model on the held-out test split.
print(f_pipe2.score(X_test_f,y_test_f))

0.7578737116351038


#### Combining the non-text features with the n-gram based BOW model, the score on the test set goes up to ~0.76, which suggests that wine quality depends on categorical and numeric features as well as keyword-based features from the text description.