In [2]:
import numpy as np
import pandas as pd
import os

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import string
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import RegexpTokenizer
import re

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt

In [3]:
train=pd.read_csv('train.csv')
test=pd.read_csv("test.csv")

In [4]:
print(train.info())
print(train.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82657 entries, 0 to 82656
Data columns (total 12 columns):
user_name             63264 non-null object
country               82622 non-null object
review_title          82657 non-null object
review_description    82657 non-null object
designation           59010 non-null object
points                82657 non-null int64
price                 77088 non-null float64
province              82622 non-null object
region_1              69903 non-null object
region_2              35949 non-null object
winery                82657 non-null object
variety               82657 non-null object
dtypes: float64(1), int64(1), object(10)
memory usage: 7.6+ MB
None
(82657, 12)


In [5]:
print(test.info())
print(test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20665 entries, 0 to 20664
Data columns (total 11 columns):
user_name             15927 non-null object
country               20661 non-null object
review_title          20665 non-null object
review_description    20665 non-null object
designation           14676 non-null object
points                20665 non-null int64
price                 19271 non-null float64
province              20661 non-null object
region_1              17351 non-null object
region_2              8914 non-null object
winery                20665 non-null object
dtypes: float64(1), int64(1), object(9)
memory usage: 1.7+ MB
None
(20665, 11)


In [6]:
total = train.isnull().sum().sort_values(ascending = False)
percent = (train.isnull().sum()/train.isnull().count()*100).sort_values(ascending = False)
missing_train  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_train

Unnamed: 0,Total,Percent
region_2,46708,56.508221
designation,23647,28.608587
user_name,19393,23.462018
region_1,12754,15.43003
price,5569,6.737481
province,35,0.042344
country,35,0.042344
variety,0,0.0
winery,0,0.0
points,0,0.0


In [7]:
train['variety'].value_counts()

Pinot Noir                    10587
Chardonnay                     9403
Cabernet Sauvignon             7552
Red Blend                      7166
Bordeaux-style Red Blend       5497
Riesling                       4148
Sauvignon Blanc                4011
Syrah                          3316
Rosé                           2831
Merlot                         2471
Nebbiolo                       2242
Zinfandel                      2209
Sangiovese                     2165
Malbec                         2119
Portuguese Red                 1969
White Blend                    1896
Sparkling Blend                1739
Tempranillo                    1448
Rhône-style Red Blend          1182
Pinot Gris                     1148
Champagne Blend                1133
Cabernet Franc                 1095
Grüner Veltliner               1055
Portuguese White                896
Pinot Grigio                    873
Bordeaux-style White Blend      850
Gewürztraminer                  840
Gamay                       

In [8]:
df=pd.concat([train,test])
df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,country,designation,points,price,province,region_1,region_2,review_description,review_title,user_name,variety,winery
0,Australia,Peace Family Vineyard,83,10.0,Australia Other,South Eastern Australia,,"Classic Chardonnay aromas of apple, pear and h...",Andrew Peace 2007 Peace Family Vineyard Chardo...,,Chardonnay,Andrew Peace
1,US,,89,15.0,Washington,Columbia Valley (WA),Columbia Valley,This wine is near equal parts Syrah and Merlot...,North by Northwest 2014 Red (Columbia Valley (...,@wawinereport,Red Blend,North by Northwest
2,Italy,Conca,94,80.0,Piedmont,Barolo,,Barolo Conca opens with inky dark concentratio...,Renato Ratti 2007 Conca (Barolo),,Nebbiolo,Renato Ratti
3,France,L'Abbaye,87,22.0,Southwest France,Bergerac Sec,,It's impressive what a small addition of Sauvi...,Domaine l'Ancienne Cure 2010 L'Abbaye White (B...,@vossroger,Bordeaux-style White Blend,Domaine l'Ancienne Cure
4,France,Le Cèdre Vintage,88,33.0,France Other,Vin de Liqueur,,"This ripe, sweet wine is rich and full of drie...",Château du Cèdre 2012 Le Cèdre Vintage Malbec ...,@vossroger,Malbec,Château du Cèdre


In [9]:
df['review_description']= df['review_description'].str.lower()
df['review_description']= df['review_description'].apply(lambda elem: re.sub('[^a-zA-Z]',' ', elem))  
df['review_description']

0        classic chardonnay aromas of apple  pear and h...
1        this wine is near equal parts syrah and merlot...
2        barolo conca opens with inky dark concentratio...
3        it s impressive what a small addition of sauvi...
4        this ripe  sweet wine is rich and full of drie...
                               ...                        
20660    clearly focused and fruit driven  this wine ha...
20661    herbal tones of bay and rosemary are upfront o...
20662    mocha cream  pencil shaving and dried herb aro...
20663    loud citrus and berry aromas precede an overlo...
20664    with very ripe fruit and firm tannins  this mo...
Name: review_description, Length: 103322, dtype: object

In [10]:
tokenizer = RegexpTokenizer(r'\w+')
words_descriptions = df['review_description'].apply(tokenizer.tokenize)
words_descriptions.head()

0    [classic, chardonnay, aromas, of, apple, pear,...
1    [this, wine, is, near, equal, parts, syrah, an...
2    [barolo, conca, opens, with, inky, dark, conce...
3    [it, s, impressive, what, a, small, addition, ...
4    [this, ripe, sweet, wine, is, rich, and, full,...
Name: review_description, dtype: object

In [11]:
stopword_list = stopwords.words('english')
ps = PorterStemmer()
words_descriptions = words_descriptions.apply(lambda elem: [word for word in elem if not word in stopword_list])
words_descriptions = words_descriptions.apply(lambda elem: [ps.stem(word) for word in elem])
df['description_cleaned'] = words_descriptions.apply(lambda elem: ' '.join(elem))
df['description_cleaned']

0        classic chardonnay aroma appl pear hay lead pa...
1        wine near equal part syrah merlot balanc caber...
2        barolo conca open inki dark concentr sooth aro...
3        impress small addit sauvignon gri muscadel sau...
4        ripe sweet wine rich full dri fresh fruit flav...
                               ...                        
20660    clearli focus fruit driven wine bonanza dark f...
20661    herbal tone bay rosemari upfront nose copper p...
20662    mocha cream pencil shave dri herb aroma set ba...
20663    loud citru berri aroma preced overload palat c...
20664    ripe fruit firm tannin mountain cab show attra...
Name: description_cleaned, Length: 103322, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vect= CountVectorizer(max_features=1500)
vectorized=vect.fit_transform(df['description_cleaned']).toarray()
vectorizeddf=pd.DataFrame(vectorized)

In [13]:
y=df['variety'].values
y=y[:len(train)]
X=vectorizeddf
X_train, X_test, y_train, y_test = train_test_split(X[:len(train)], y, test_size = 0.20, random_state = 42)

In [14]:
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
dt.score(X_test,y_test)

0.5642390515364142

In [15]:
lr=LogisticRegression(penalty='l2',random_state=42 ,max_iter=500)
lr.fit(X_train,y_train)
lr.score(X_test,y_test)



0.6847326397290104

In [19]:
svml=SVC(kernel='linear')
svml.fit(X_train,y_train)
svml.score(X_test,y_test)

0.8487382224619010


In [20]:
X_output=X[len(train):]
test['variety']=svml.predict(X_output)
test.head()

Unnamed: 0,user_name,country,review_title,review_description,designation,points,price,province,region_1,region_2,winery,variety
0,@paulgwine,US,Boedecker Cellars 2011 Athena Pinot Noir (Will...,Nicely differentiated from the companion Stewa...,Athena,88,35.0,Oregon,Willamette Valley,Willamette Valley,Boedecker Cellars,Pinot Noir
1,@wineschach,Argentina,Mendoza Vineyards 2012 Gran Reserva by Richard...,"Charred, smoky, herbal aromas of blackberry tr...",Gran Reserva by Richard Bonvin,90,60.0,Mendoza Province,Mendoza,,Mendoza Vineyards,Malbec
2,@vboone,US,Prime 2013 Chardonnay (Coombsville),"Slightly sour and funky in earth, this is a re...",,87,38.0,California,Coombsville,Napa,Prime,Chardonnay
3,@wineschach,Argentina,Bodega Cuarto Dominio 2012 Chento Vineyard Sel...,"This concentrated, midnight-black Malbec deliv...",Chento Vineyard Selection,91,20.0,Mendoza Province,Mendoza,,Bodega Cuarto Dominio,Malbec
4,@kerinokeefe,Italy,SassodiSole 2012 Brunello di Montalcino,"Earthy aromas suggesting grilled porcini, leat...",,90,49.0,Tuscany,Brunello di Montalcino,,SassodiSole,Sangiovese


In [21]:
test.to_csv('newtest.csv') 