In [None]:
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
!pip install umap-learn
import plotly.graph_objs as go
import plotly.figure_factory as ff
from sklearn import metrics
from sklearn.svm import LinearSVC
import joblib


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import umap.umap_ as umap

In [None]:
# Display options: never truncate cell contents and never hide columns in rendered output
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None

# Load the wine dataset into a dataframe (the file is decoded as ISO-8859-1)
df = pd.read_csv("/Wines.csv", encoding="ISO-8859-1")

In [None]:
# Converting the structured categorical target to numbers: 'High' -> 0, 'Low' -> 1.
# The mapping is applied only while the column still holds the raw labels:
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded column would wipe out the target.
if not pd.api.types.is_numeric_dtype(df['return']):
    df['return'] = df['return'].map({'High':0, 'Low':1})

In [None]:
# Converting unstructured 'Review' column to a TF-IDF matrix
# Converting unstructured 'Review' column to a TF-IDF matrix
# For more info on regular expressions visit https://docs.python.org/3/howto/regex.html

# @mentions, URLs, "www..." tokens and literal "\x" escape sequences, up to the next whitespace
_URL_MENTION_RE = re.compile(r"(@|http://|https://|www|\\x)\S*")
# any run of one or more non-alphabetic characters
_NON_ALPHA_RE = re.compile("[^A-Za-z]+")
# filled lazily on the first cleaner() call so that defining the function has no side effects
_CLEANER_CACHE = {}


def _cleaner_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only once."""
    if not _CLEANER_CACHE:
        _CLEANER_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANER_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _CLEANER_CACHE['stop_words'], _CLEANER_CACHE['lemmatizer']


def cleaner(tasting_note): # Cleaning reviews
    """Turn one raw tasting note into a list of lemmatized, lower-case tokens.

    Steps: strip HTML with BeautifulSoup, blank out @mentions/URLs, replace
    every non-alphabetic run with a space, tokenize, lower-case, drop English
    stop words and lemmatize what is left.

    Parameters
    ----------
    tasting_note : str
        Raw review text. Any non-string value (for example the NaN pandas
        uses for a missing cell) yields an empty list instead of raising.

    Returns
    -------
    list of str
        The cleaned tokens; empty when nothing survives the cleaning.
    """
    if not isinstance(tasting_note, str):
        # Missing cells arrive as float NaN; BeautifulSoup cannot parse those.
        return []

    # lxml is the HTML parser and should be installed using 'pip install lxml'
    text = BeautifulSoup(tasting_note, 'lxml').get_text()
    text = _URL_MENTION_RE.sub(" ", text)
    text = _NON_ALPHA_RE.sub(" ", text)

    stop_words, lemmatizer = _cleaner_resources()
    lowered = (token.lower() for token in nltk.word_tokenize(text))
    return [lemmatizer.lemmatize(token) for token in lowered if token not in stop_words]

In [None]:
# Tokenize / clean every tasting note
df['cleaned_tasting_note'] = df.tasting_note.apply(cleaner)
# Removing rows with cleaned reviews of length 0. The explicit .copy() makes the
# filtered frame independent of the original, so later column assignments on `df`
# do not raise SettingWithCopyWarning. The original row labels are kept (the index
# is not reset), so labels and positions can differ once rows have been dropped.
df = df[df['cleaned_tasting_note'].map(len) > 0].copy()
print("Printing top 5 rows of dataframe showing original and cleaned tasting notes....")
print(df[['tasting_note','cleaned_tasting_note']].head())


Printing top 5 rows of dataframe showing original and cleaned tasting notes....
                                                                                                                                                                                                                                                          tasting_note  \
0  Oak and earth intermingle around robust aromas of wet forest floor in this vineyard-designated Pinot that hails from a high-elevation site. Small in production, it offers intense, full-bodied raspberry and blackberry steeped in smoky spice and smooth texture.   
1                                                                                              Syrupy and dense, this wine is jammy in plum and vanilla, with indeterminate structure and plenty of oak. Ripe and full-bodied, it has accents of graphite and leather.   
2                                                                                  This is made from equal parts Cabernet 

In [None]:
# Joining tokens to create strings, because TfidfVectorizer does not accept token lists as input.
# Rows that are already strings are left untouched so that re-running this cell is harmless
# (" ".join on a string would otherwise put a space between every character).
df['cleaned_tasting_note'] = [row if isinstance(row, str) else " ".join(row) for row in df['cleaned_tasting_note'].values]
data = df['cleaned_tasting_note']
y = df['return'] # target column

# min_df=.00284 keeps an n-gram (unigram, bigram or trigram) only if it occurs in at least
# that fraction of the documents; with the 10540 documents shown in the recorded output
# this is roughly 30 documents (10540*.00284=30).
tfidf = TfidfVectorizer(min_df=.00284, ngram_range=(1,3))
data_tfidf = tfidf.fit_transform(data) # learn the vocabulary of the entire data and create the tfidf values

# Saving vocabulary to csv as "term,column index" rows ordered by column index.
# vocabulary_ (term -> column index) is used instead of get_feature_names(), which
# newer scikit-learn releases no longer provide.
pd.Series(tfidf.vocabulary_).sort_values().to_csv('vocabulary_tasting_notes.csv', header=False)
print("Shape of tfidf matrix: ", data_tfidf.shape)
print(type(data_tfidf))

Shape of tfidf matrix:  (10540, 2230)
<class 'scipy.sparse.csr.csr_matrix'>


In [None]:
################################ Implementing UMAP to visualize dataset
# UMAP is stochastic; fixing random_state makes the 2-D embedding (and therefore the
# figure below) the same on every run.
u = umap.UMAP(n_neighbors=150, min_dist=0.4, random_state=42)
x_umap = u.fit_transform(data_tfidf)

investment = list(df['return'])
notes = list(df['tasting_note'])

# One marker per tasting note, coloured by the encoded 'return' value; hovering over a
# marker shows the return label and the original note.
data_ = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=df['return'], colorscale='Rainbow', opacity=0.5),
                    text=[f'return: {a}<br>tasting_note: {b}' for a, b in zip(investment, notes)],
                    hoverinfo='text')]

layout = go.Layout(title = 'UMAP Dimensionality Reduction', width = 700, height = 700,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data_, layout=layout)
fig.show()

In [None]:
from imblearn.over_sampling import SMOTE  # imblearn library can be installed using pip install imblearn

In [None]:
################################## Implementing Support Vector Classifier
model = LinearSVC() # kernel = 'linear' and C = 1

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
smote = SMOTE(random_state = 101)
for train_index, test_index in kf.split(data_tfidf, y):
    iteration += 1
    print("Iteration ", iteration)
    # kf.split yields positional indices, so the target Series must be sliced with
    # .iloc; plain y[...] looks the values up as index labels, which breaks as soon
    # as rows have been dropped from the dataframe.
    X_train, y_train = data_tfidf[train_index], y.iloc[train_index]
    X_test, y_test = data_tfidf[test_index], y.iloc[test_index]
    # Balancing the training fold only (the test fold keeps its original class mix).
    # fit_resample is the current name of the method formerly called fit_sample.
    X_train, y_train = smote.fit_resample(X_train, y_train)
    model.fit(X_train, y_train) # Fitting SVC
    y_pred = model.predict(X_test)
    score = metrics.recall_score(y_test, y_pred) # Calculating recall
    print("Cross-validation recall: ", score)
    scores.append(score) # appending cross-validation recall for each iteration
mean_recall = np.mean(scores)
mean_ = mean_recall # previous name of this value, kept in case other cells still read it
print("Mean cross-validation recall: ", mean_recall)


Iteration  1
Cross-validation recall:  0.7328519855595668
Iteration  2
Cross-validation recall:  0.753309265944645
Iteration  3
Cross-validation recall:  0.7268351383874849
Iteration  4
Cross-validation recall:  0.7593261131167268
Iteration  5
Cross-validation recall:  0.7605294825511432
Iteration  6
Cross-validation recall:  0.7220216606498195
Iteration  7
Cross-validation recall:  0.7569193742478941
Iteration  8
Cross-validation recall:  0.740072202166065
Iteration  9
Cross-validation recall:  0.7509025270758123
Iteration  10
Cross-validation recall:  0.7566265060240964
Mean cross-validation recall:  0.7459394255723254


In [None]:
# Implementing Naive Bayes Classifier
# NOTE(review): unlike the SVC cell, the training folds are not re-balanced here, so the
# two mean recalls may not be directly comparable - confirm this is intended.
nbc_clf = MultinomialNB()

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, y):
    iteration += 1
    print("Iteration ", iteration)
    # kf.split yields positional indices, so the target Series must be sliced with
    # .iloc; plain y[...] looks the values up as index labels, which breaks as soon
    # as rows have been dropped from the dataframe.
    X_train, y_train = data_tfidf[train_index], y.iloc[train_index]
    X_test, y_test = data_tfidf[test_index], y.iloc[test_index]
    nbc_clf.fit(X_train, y_train) # Fitting NBC
    y_pred = nbc_clf.predict(X_test)
    score = metrics.recall_score(y_test, y_pred) # Calculating recall
    print("Cross-validation recall: ", score)
    scores.append(score) # appending cross-validation recall for each iteration
nbc_mean_recall = np.mean(scores)
print("Mean cross-validation recall: ", nbc_mean_recall)


Iteration  1
Cross-validation recall:  0.9638989169675091
Iteration  2
Cross-validation recall:  0.9590854392298436
Iteration  3
Cross-validation recall:  0.9747292418772563
Iteration  4
Cross-validation recall:  0.9663056558363418
Iteration  5
Cross-validation recall:  0.9663056558363418
Iteration  6
Cross-validation recall:  0.9819494584837545
Iteration  7
Cross-validation recall:  0.9663056558363418
Iteration  8
Cross-validation recall:  0.9699157641395909
Iteration  9
Cross-validation recall:  0.9675090252707581
Iteration  10
Cross-validation recall:  0.9722891566265061
Mean cross-validation recall:  0.9688293970104244
