# Wine Reviews
This notebook contains an analysis of wine review data and a possible prediction task built on it. The original dataset can be found [here](https://www.kaggle.com/datasets/zynicide/wine-reviews)

In [None]:
import pandas as pd
import numpy as np
import nltk
import pickle
from nltk import pos_tag  # Part-of-Speech
from nltk.corpus import wordnet
import string
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # sentiment analyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Uncomment if the NLTK corpora are not downloaded yet
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')
#nltk.download('averaged_perceptron_tagger')

## Loading and cleaning

### Load and screening

In [None]:
df = pd.read_csv('winemag-data-130k-v2.csv', index_col=0)

First, we check some basic data

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

Now we know one fact: **the minimum score is 80**. This suggests that, in general, the wines are good and enjoyable. So the focus of the analysis can shift towards predicting some descriptive attribute of the wine.

### Cleaning

In [None]:
# Discard the columns that are not useful because of their null values
unused_columns = ['region_1', 'region_2', 'taster_name',
                  'taster_twitter_handle', 'designation']
df = df.drop(columns=unused_columns)

In [None]:
# Remove fully duplicated rows (nothing changes when there are none)
df = df.drop_duplicates()

We should resolve as many null values as possible, so we check each column in turn.

In [None]:
# As the winery are complete, we could use them to fill empty data of country and province
df[df.country.isna()].head()

In [None]:
# Check theory with the first
df.country[(df.winery == 'Kakhetia Traditional Winemaking') & (df.country.notna())]

In [None]:
# Now implement a solution: fill a missing country/province with the values
# found in another review of the same winery.
# The first known country and the first known province of each winery are
# looked up once, instead of rescanning the whole frame per winery and
# writing one cell at a time.
first_country = (df[df.country.notna()]
                 .drop_duplicates(subset='winery')
                 .set_index('winery')['country'])
first_province = (df[df.province.notna()]
                  .drop_duplicates(subset='winery')
                  .set_index('winery')['province'])
# Only rows whose winery has a known country elsewhere can be filled
fillable = df.country.isna() & df.winery.isin(first_country.index)
fill_wineries = df.loc[fillable, 'winery']
# A winery with a known country but no known province maps to NaN here
# rather than raising IndexError; .to_numpy() assigns positionally onto the
# masked rows so no index alignment is involved.
df.loc[fillable, 'province'] = fill_wineries.map(first_province).to_numpy()
df.loc[fillable, 'country'] = fill_wineries.map(first_country).to_numpy()

In [None]:
# Delete the rows whose country could not be filled
df = df.dropna(subset=['country'])

In [None]:
# Now remove the rows with a missing variety
df = df.dropna(subset=['variety'])

For missing prices we can fill them with the mean price of each winery. It could be more precise to use a combination of variety and winery, but this approach should be enough.

In [None]:
# Mean price per winery, restricted to the wineries that have at least one
# review with a missing price; wineries without any known price are dropped
wineries = df.loc[df.price.isna(), 'winery'].unique()
all_winery_means = df.groupby('winery')[['price']].mean()
mean_price_by_winery = all_winery_means.loc[wineries].dropna()

In [None]:
df[df.price.isna() & (df.winery == mean_price_by_winery.index[0])]

In [None]:
# Fill every missing price with the mean price of its winery in a single
# vectorized assignment, instead of nested loops writing one cell at a time.
winery_price = mean_price_by_winery['price']
# Rows with no price whose winery has a usable mean
fillable = df.price.isna() & df.winery.isin(winery_price.index)
# .to_numpy() assigns positionally onto the masked rows (no index alignment)
df.loc[fillable, 'price'] = df.loc[fillable, 'winery'].map(winery_price).to_numpy()

In [None]:
# Second pass: mean price per variety for the varieties that still have
# reviews with a missing price; varieties without any known price are dropped
varieties = df.loc[df.price.isna(), 'variety'].unique()
all_variety_means = df.groupby('variety')[['price']].mean()
mean_price_by_variety = all_variety_means.loc[varieties].dropna()

In [None]:
# Fill the remaining missing prices with the mean price of their variety in a
# single vectorized assignment, instead of nested loops writing one cell at
# a time.
variety_price = mean_price_by_variety['price']
# Rows with no price whose variety has a usable mean
fillable = df.price.isna() & df.variety.isin(variety_price.index)
# .to_numpy() assigns positionally onto the masked rows (no index alignment)
df.loc[fillable, 'price'] = df.loc[fillable, 'variety'].map(variety_price).to_numpy()

In [None]:
# Discard the rows whose price could not be filled
df = df.dropna(subset=['price'])

In [None]:
df.info()

The data is now free of null values, so it's time to check for duplicated values: in this case, different reviews for the same wine bottle.

In [None]:
# Check wines with different reviews
df.title.duplicated().sum()

In [None]:
# The repeated titles are not a big sample of the data, so keep only the
# first review of each title
df = df.drop_duplicates(subset=['title'])

In [None]:
df.reset_index(drop=True, inplace=True)

### Pre-processing
The description data can only be used after the text has been processed.

In [None]:
# Stopwords, kept as a frozenset so each `word not in sp` membership test is
# a hash lookup instead of a scan over the whole list
sp = frozenset(nltk.corpus.stopwords.words('english'))

In [None]:
# First, let's create a function to identify the type of words
def get_pos(pos_tag):
    """Translate a part-of-speech tag into the matching WordNet constant.

    Args:
        pos_tag: tag string, e.g. the second item of a pair returned by
            ``nltk.pos_tag``.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV`` for tags starting with ``J``, ``V``, ``N`` or ``R``
        respectively; ``wordnet.NOUN`` for any other tag.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns
    return wordnet.NOUN

# And a cleaner
def clean_description(text):
    """Normalize a review text into a space-separated string of lemmas.

    The text is lower-cased and split on whitespace; each token has its
    leading/trailing punctuation stripped. Tokens that are empty, contain a
    digit, or are stopwords (``sp``) are dropped. The remaining tokens are
    POS-tagged and lemmatized, and lemmas of a single character are removed.

    Args:
        text: raw description string.

    Returns:
        The cleaned description as a single string (possibly empty).
    """
    # Build the lemmatizer once per call instead of once per token
    lemmatizer = WordNetLemmatizer()
    # split() without an argument splits on any run of whitespace, so tabs
    # and newlines separate tokens too (split(' ') kept them inside tokens)
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]
    tokens = [
        word for word in tokens
        if word
        and word not in sp
        and not any(c.isdigit() for c in word)
    ]
    # The POS tag selects the WordNet category used for lemmatization
    lemmas = [lemmatizer.lemmatize(word, get_pos(tag))
              for word, tag in pos_tag(tokens)]
    # Drop one-letter results and rebuild the description
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

[Why do we lemmatize?](https://www.ibm.com/topics/stemming-lemmatization)

In [None]:
# Apply to data
df['clean_description'] = df['description'].apply(clean_description)

Now it's time to convert the data into a form that is useful for the analysis

In [None]:
# First, let's generate some sentiment values
sid = SentimentIntensityAnalyzer()
# polarity_scores returns one dict per description
sentiment_scores = df['description'].apply(sid.polarity_scores)
# Expand the dicts into columns in one DataFrame construction;
# Series.apply(pd.Series) builds a Series per row and is far slower.
sentiment_columns = pd.DataFrame(sentiment_scores.tolist(), index=df.index)
df = pd.concat([df, sentiment_columns], axis=1)

In [None]:
df.head()

## Exploratory analysis

In [None]:
# The raw text columns are not needed for the analysis
df = df.drop(columns=['description', 'title'])

In [None]:
def word_cloud_generator(data):
    """Display a word cloud built from all the texts in *data*.

    Args:
        data: object exposing ``.values`` as an iterable of strings (here a
            pandas Series of cleaned descriptions); the texts are joined
            with single spaces before the cloud is generated.

    Returns:
        None. The cloud is shown in a 10x10 matplotlib figure without axes.
    """
    corpus = ' '.join(data.values)
    cloud_builder = WordCloud(
        background_color='white',
        width=1000,
        height=1000,
        colormap='Blues',
    )
    cloud = cloud_builder.generate(corpus)

    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
# Word clouds for the first five varieties, in order of appearance
for variety in df.variety.unique()[:5]:
    print(variety)
    word_cloud_generator(df.loc[df.variety == variety, 'clean_description'])

We can see that, for a sample, the varieties share some common words like *wine*, but they also have different ones that may identify them.

In [None]:
# The same exploration with another categorical variable: word clouds for
# the first five countries, in order of appearance
for country_name in df.country.unique()[:5]:
    print(country_name)
    word_cloud_generator(df.loc[df.country == country_name, 'clean_description'])

In [None]:
# We can check the best by winery by points and the mean price of their products
df[['points', 'price', 'winery']].groupby('winery').mean().sort_values('points', ascending=False).head()

In [None]:
# Also, the "worst"
df[['points', 'price', 'winery']].groupby('winery').mean().sort_values('points', ascending=False).tail()

In [None]:
# Also, we can visualize the best origin provinces by positive sentiment
df[['pos', 'province', 'points', 'price', 'country']].groupby(['province', 'country']).mean().sort_values('pos', ascending=False).head()

With this simple analysis we found that wine varieties have some distinctive words that may help classify them. We also saw how different variables relate to each other.

## Machine Learning

In [None]:
# Drop columns with excess categories to prevent over-fitting
df.drop(['province', 'winery'], axis=1, inplace=True)

### Pre-process data
For pre-processing, we first vectorize the reviews. Before that, we filter the varieties to keep only those with the largest samples of data.

In [None]:
# NOTE(review): this replaces the in-memory df with data loaded from disk;
# no visible cell writes 'data/wine_data_clean.csv' - confirm how it is produced.
df = pd.read_csv('data/wine_data_clean.csv')
# Non-null country entries per variety
entries_per_variety = df.groupby('variety')['country'].count()
# Keep the varieties with at least 1000 entries
varieties = entries_per_variety[entries_per_variety >= 1000].index
len(varieties)

In [None]:
# Keep only the selected varieties. Take an explicit copy so that the column
# assignment and in-place drop performed on df afterwards act on an
# independent DataFrame rather than on a slice of the original
# (avoids SettingWithCopyWarning).
df = df[df.variety.isin(varieties)].copy()
df.shape

In [None]:
# Replace the variety names with the integer ids learned by the encoder
le = LabelEncoder()
le.fit(df['variety'])
df['variety'] = le.transform(df['variety'])

In [None]:
# Remove the columns that are not used from here on
unused_features = ['points', 'price', 'country', 'neg', 'neu', 'pos', 'compound']
df = df.drop(columns=unused_features)

In [None]:
# Save the progress so far: the filtered data as CSV and the fitted label
# encoder as a pickle
df.to_csv('data/vectorized_wine_reviews.csv', index=False)
# Plain string literal: the path has no placeholders, so no f-string needed
with open('models/le_variety.pkl', 'wb') as f:
    pickle.dump(le, f)

### Training

After evaluating different models and finding the best parameters (see `optim.py` and `training.py`), we now fit the model

In [None]:
# Classifier configured with the parameters selected as best
sgd_params = {
    'loss': 'modified_huber',
    'max_iter': 10000,
    'penalty': 'l2',
}
sgd = SGDClassifier(**sgd_params)

In [None]:
# Split data
X, Y = df.clean_description, df['variety']

In [None]:
# Vectorizer that turns the text into numeric TF-IDF features
vectorizer = TfidfVectorizer()
# Hold out 10% of the samples as test data
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.10,
)
# Fit the vectorizer on the training texts only, then apply it to both sets
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Save the fitted vectorizer
# Plain string literal: the path has no placeholders, so no f-string needed
with open('models/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [None]:
# Fit the model
sgd.fit(X_train, Y_train)

In [None]:
# Make an evaluation
sgd.score(X_test, Y_test)

Finally, take a closer look at the results on the test data

In [None]:
# Inspect the first ten test samples (fewer if the test set is smaller).
# Tokens, labels, predictions and probabilities are computed once for just
# the rows shown, instead of transforming and predicting the whole test set
# again on every iteration.
shown = min(10, X_test.shape[0])
X_shown = X_test[:shown]
tokens = vectorizer.inverse_transform(X_shown)
labels = le.inverse_transform(Y_test.iloc[:shown])
predictions = le.inverse_transform(sgd.predict(X_shown))
probabilities = sgd.predict_proba(X_shown)

for n in range(shown):
    # Terms of the review that are present in the vectorizer vocabulary
    print(" ".join(tokens[n]))
    print('Label: ', labels[n])
    print()
    print('Prediction: ', predictions[n])
    # One-row frame of class probabilities, labelled with the variety names
    tmp = pd.DataFrame(columns=le.classes_, data=probabilities[[n]])
    print('Probalities: ')
    # Show only the classes with a non-zero probability
    print(tmp[tmp > 0].dropna(axis=1))
    print('\n')

Save the models for implementation

In [None]:
with open('models/sgd.pkl', 'wb') as f:
    pickle.dump(sgd, f)