## Pritam Biswas (pb2796)

In [67]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from category_encoders import TargetEncoder
from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import gensim.downloader as api
from nltk.corpus import stopwords 

## Task 2

In [2]:
# Load the wine reviews CSV (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='winemag-data-130k-v2.csv')

In [3]:
# Peek at the first five rows to check the columns that were read.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [4]:
# Keep only US wines and drop the columns that are not used afterwards.
data = data[data['country'] == 'US'].drop(
    columns=['Unnamed: 0', 'taster_twitter_handle', 'country']
)
print(len(data))

# Work on a reproducible 50% sample of the US rows.
data = data.sample(frac=0.5, random_state=1)
print(len(data))

54504
27252


In [20]:
def extract_date(titles):
    """Extract the vintage year from each wine title.

    Punctuation is stripped from the title, which is then split on single
    spaces. A token counts as a year when it is exactly four decimal digits
    and starts with '1' or '2'. If several tokens qualify, the last one wins.

    Parameters
    ----------
    titles : iterable
        Wine titles. Entries that are not strings (e.g. NaN or None for a
        missing title) are treated as having no year.

    Returns
    -------
    list of float
        One entry per title, in order: the year as a float, or ``np.nan``
        when no year token was found.
    """
    # Loop-invariant: build the punctuation-stripping table once.
    strip_punct = str.maketrans('', '', string.punctuation)
    res = []

    for title in titles:
        if not isinstance(title, str):
            # Missing titles have no .translate(); report "no year" instead of crashing.
            res.append(np.nan)
            continue

        year = ''
        for token in title.translate(strip_punct).split(' '):
            # isdecimal() (rather than isdigit()) rejects characters such as
            # superscript digits, which pass isdigit() but make float() raise.
            if len(token) == 4 and token.isdecimal() and token[0] in '12':
                year = token

        res.append(float(year) if year else np.nan)
    return res


In [21]:
def preprocess_text(text_list):
    """Normalise raw texts into space-joined, lemmatised tokens.

    Each text is lower-cased, stripped of punctuation and of the digits 0-9,
    tokenised with ``word_tokenize``, filtered against the NLTK English
    stop-word list and lemmatised with ``WordNetLemmatizer``.

    Parameters
    ----------
    text_list : iterable
        Raw texts. Empty strings and non-string entries (e.g. NaN for a
        missing description) yield an empty string.

    Returns
    -------
    list of str
        One cleaned string per input, in the same order.
    """
    # A set gives constant-time membership tests; the stop-word list would
    # otherwise be scanned once for every token of every text.
    stp = set(stopwords.words('english'))
    lemm = WordNetLemmatizer()

    # Loop-invariant helpers, built once instead of once per text.
    strip_punct = str.maketrans('', '', string.punctuation)
    digit_pattern = re.compile('[0-9]')

    res = []
    for text in text_list:
        if not isinstance(text, str) or len(text) == 0:
            res.append('')
            continue

        text = text.lower().translate(strip_punct)
        text = digit_pattern.sub('', text)
        items = [lemm.lemmatize(tok) for tok in word_tokenize(text) if tok not in stp]
        res.append(' '.join(items))
    return res

#### The function above preprocesses the text: it lowercases, strips punctuation and digits, tokenizes, removes stop words and lemmatizes each description.

In [22]:
# Input text is the review description; the regression target is the points score.
data_text = data['description']
y = data['points']

# Clean every description before computing embeddings.
tmp = data_text.tolist()
text_list = preprocess_text(tmp)


In [10]:
# loading word embeddings of word2vec
word2vec = api.load(name="word2vec-google-news-300")

In [40]:
def get_vector(text_list, word2vec):
    """Embed each text as the average of its word embeddings.

    Each text is split on single spaces; tokens not present in ``word2vec``
    are ignored and the remaining word vectors are averaged in float64.

    Parameters
    ----------
    text_list : iterable of str
        Texts to embed.
    word2vec : mapping
        Anything supporting ``token in word2vec`` and ``word2vec[token]``.
        If it exposes a ``vector_size`` attribute, that is used as the
        embedding dimension; otherwise the dimension is taken from the first
        vector that is looked up.

    Returns
    -------
    list of list of float
        One vector per text, in order. A text with no known token (empty, or
        every word out of vocabulary) gets an all-zero vector instead of
        raising IndexError, so every row has the same length.
    """
    dim = getattr(word2vec, 'vector_size', None)
    res = []

    for text in text_list:
        vec = [word2vec[tok] for tok in text.split(' ') if tok in word2vec]
        if not vec:
            # Placeholder; filled with zeros once the dimension is known.
            res.append(None)
            continue

        mean = np.asarray(vec, dtype=np.float64).mean(axis=0)
        if dim is None:
            dim = mean.shape[0]
        res.append(list(mean))

    # If no vector was ever seen and no vector_size is available, the
    # dimension is unknown and the zero rows are empty.
    zero_len = dim or 0
    return [list(np.zeros(zero_len)) if row is None else row for row in res]

In [41]:
# One averaged word2vec vector per cleaned description.
res_list = get_vector(text_list=text_list, word2vec=word2vec)

In [44]:
# Stack the per-description vectors into a 2-D feature matrix.
res_arr = np.asarray(res_list)

In [64]:
# Hold out a test set for the embedding-only model (fixed seed for reproducibility).
(X_train_v, X_test_v,
 y_train_v, y_test_v) = train_test_split(res_arr, y, random_state=1)

In [65]:
# Ridge regression on the averaged embeddings; report the mean cross-validated score.
v_pipe2 = make_pipeline(Ridge(alpha=10))
score = cross_val_score(v_pipe2, X_train_v, y_train_v).mean()
print(score)

0.5008412901784894


In [66]:
# Refit on the full training split and score on the held-out test split.
print(v_pipe2.fit(X_train_v, y_train_v).score(X_test_v, y_test_v))

0.5016206265232663


#### We observe that the test score (R² ≈ 0.50) is not very good when each description is represented by the average of its word embeddings.

#### Now combining BOW with embeddings

In [50]:
# Give the embedding columns string names. A string-named 'text' column is
# added to this frame afterwards, and recent scikit-learn versions refuse to
# fit on a DataFrame whose column names mix str and int.
t = pd.DataFrame(res_arr, columns=[f'emb_{i}' for i in range(res_arr.shape[1])])

In [57]:
# Attach the cleaned description next to its embedding so both feed the model.
t = t.assign(text=text_list)

In [60]:
# Uni- to tri-gram counts on the 'text' column; every other column passes through unchanged.
ngram_preprocessor = make_pipeline(
    CountVectorizer(stop_words='english', ngram_range=(1, 3))
)
preprocessor_2 = make_column_transformer(
    (ngram_preprocessor, 'text'),
    remainder='passthrough',
)

In [61]:
# Same seed as the embedding-only split, now on the combined text + embedding frame.
(X_train_b, X_test_b,
 y_train_b, y_test_b) = train_test_split(t, y, random_state=1)

In [62]:
# Ridge regression on n-gram counts plus embeddings; report the mean cross-validated score.
b_pipe2 = make_pipeline(preprocessor_2, Ridge(alpha=1))
score = cross_val_score(b_pipe2, X_train_b, y_train_b).mean()
print(score)

0.6918420066734507


In [63]:
# Refit on the full training split and score on the held-out test split.
print(b_pipe2.fit(X_train_b, y_train_b).score(X_test_b, y_test_b))

0.7085734017699875


#### We notice that combining the two feature sets leads to some improvement in the test-set score. However, the score is still not as good as that of the n-gram-based BOW model.

#### The embedding features did not perform well because they were computed on the wine description. They would have made more sense if they had been computed on user reviews of the wine: we could then have used the user sentiment captured by the sentence representation as an indicator of wine quality.

#### So for datasets such as this one, keyword-based features (which is essentially what a BOW feature set provides) are useful: the wine descriptions share a lot of common terms, and the presence or absence of these terms can help decide the quality of the wine.