### Importing all the necessary libraries

In [31]:
import warnings
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
warnings.filterwarnings('ignore')
from sklearn import svm
from sklearn.svm import SVC

### Loading Wine Data

In [32]:
df=pd.read_csv('wine.csv', usecols=['country','description','designation','points','price','province','region_1','region_2','taster_name','taster_twitter_handle',
                                    'title','variety','winery'])
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


### Understanding the structure of data

In [46]:
df.shape

(71322, 13)

In [50]:
df.describe()

Unnamed: 0,points,price
count,71322.0,67234.0
mean,88.602787,38.121516
std,3.135474,45.84397
min,80.0,4.0
25%,86.0,18.0
50%,88.0,28.0
75%,91.0,45.0
max,100.0,3300.0


In [55]:
print(df.isna().sum())
df.fillna('Nan', inplace=True)

country                     26
description                  0
designation              22214
points                       0
price                     4088
province                    26
region_1                 10092
region_2                 34659
taster_name              14781
taster_twitter_handle    18264
title                        0
variety                      0
winery                       0
dtype: int64


### Finding top 10 wines

In [33]:
#List all the unique wines and their count using counter from collections
counter=Counter(df['variety'].tolist())
counter

Counter({'White Blend': 2360,
         'Portuguese Red': 2466,
         'Pinot Gris': 1455,
         'Riesling': 5189,
         'Pinot Noir': 13272,
         'Tempranillo-Merlot': 7,
         'Frappato': 39,
         'Gewürztraminer': 1012,
         'Cabernet Sauvignon': 9472,
         'Nerello Mascalese': 117,
         'Chardonnay': 11753,
         'Malbec': 2652,
         'Tempranillo Blend': 588,
         'Meritage': 260,
         'Red Blend': 8946,
         'Merlot': 3102,
         "Nero d'Avola": 365,
         'Chenin Blanc': 591,
         'Gamay': 1025,
         'Sauvignon Blanc': 4967,
         'Viognier-Chardonnay': 12,
         'Primitivo': 222,
         'Catarratto': 29,
         'Inzolia': 30,
         'Petit Verdot': 269,
         'Monica': 4,
         'Bordeaux-style White Blend': 1066,
         'Grillo': 137,
         'Sangiovese': 2707,
         'Cabernet Franc': 1353,
         'Champagne Blend': 1396,
         'Bordeaux-style Red Blend': 6915,
         'Aglianico': 359,

In [34]:
#Getting top 10 varities from counter and ordering them from toop to bottom
top_10_varities={i[0]: idx for idx, i in enumerate(counter.most_common(10))}
top_10_varities

{'Pinot Noir': 0,
 'Chardonnay': 1,
 'Cabernet Sauvignon': 2,
 'Red Blend': 3,
 'Bordeaux-style Red Blend': 4,
 'Riesling': 5,
 'Sauvignon Blanc': 6,
 'Syrah': 7,
 'Rosé': 8,
 'Merlot': 9}

In [36]:
#Considering rows that only has top 10 wines
df=df[df['variety'].map(lambda x: x in top_10_varities)]

### Preparing independent and dependent variables for predictive analysis

In [40]:
# Preparing our independent variable 
X = df['description'].tolist() #X is the description of wine

#Preparing our dependant variable
Y = [top_10_varities[i] for i in df['variety'].tolist()] #Y is the rank of the name of that top 10 wine
Y = np.array(varietal_list) #Converting Y to numpy array

### Vectorizing and Splitting the data

In [56]:
# Vectorizing all the words in X 
count_vect=CountVectorizer()
x_train_counts=count_vect.fit_transform(X)

#Using Tfidf for scaling the impact based on frequency of word
tfidf_transformer=TfidfTransformer()
x_train_tfidf=tfidf_transformer.fit_transform(x_train_counts)

#Splitting data with train_test_split model
train_x, test_x, train_y, test_y=train_test_split(x_train_tfidf,Y, test_size=0.3)


### Applying ML models to predict the test data

#### 1. Naive Bayes model

In [57]:
# Applying Naive Bayes algorithm for predicitng testing data
clf=MultinomialNB().fit(train_x,train_y)
y_score=clf.predict(test_x)
n_right=0

#Calculating the accuracy for Naive Bayes model
for i in range(len(y_score)):
    if y_score[i]==test_y[i]:
        n_right+=1
print("Accuracy: %.2f%%" % ((n_right/float(len(test_y))*100)))

Accuracy: 63.11%


#### 2. Support Vector Model

In [25]:
# Applying SVM algorithm for predicitng testing data
clf=SVC(kernel='linear').fit(train_x,train_y)
y_score=clf.predict(test_x)
n_right=0
for i in range(len(y_score)):
    if y_score[i]==test_y[i]:
        n_right+=1
print("Accuracy: %.2f%%" % ((n_right/float(len(test_y))*100)))

Accuracy: 80.60%
