In [21]:
import pandas as pd
import numpy as np

data = pd.read_csv('winemag-data-130k-v2.csv')

In [22]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## Checking for Null Values

In [23]:
data.isnull().any()

Unnamed: 0               False
country                   True
description              False
designation               True
points                   False
price                     True
province                  True
region_1                  True
region_2                  True
taster_name               True
taster_twitter_handle     True
title                    False
variety                   True
winery                   False
dtype: bool

#### We can see that in the Many Columns  have null values.

There are many techniques of dealing with the Missing values.

1.) Deleting the rows with missing values

2.) Replace with mean/median/mode.

3.) Assigning with Unique Category.

In this I'm chosen the 3 techniques to deal with missing values.

In [24]:
def fill_missing_data(data):
    data.country.fillna(value = "Others", inplace = True)
    data.designation.fillna(value = "Not known", inplace = True)
    #data.price.fillna(value = "No description given", inplace = True)
    data.province.fillna(value = "Not given", inplace = True)
    data.region_1.fillna(value = "Not known", inplace = True)
    data.region_2.fillna(value = "Others", inplace = True)
    data.taster_name.fillna(value = "Not known", inplace = True)
    data.taster_twitter_handle.fillna(value = "Not given", inplace = True)
    data.variety .fillna(value="Others", inplace = True)
    return data

In [25]:
new_data = fill_missing_data(data)
print(np.shape(new_data))
new_data.head(5)

(129971, 14)


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,Others,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Not known,Others,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",Not known,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Others,Alexander Peartree,Not given,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [26]:
median = new_data["price"].median()
new_data["price"].fillna(median, inplace=True)

In [27]:
new_data.isnull().any()

Unnamed: 0               False
country                  False
description              False
designation              False
points                   False
price                    False
province                 False
region_1                 False
region_2                 False
taster_name              False
taster_twitter_handle    False
title                    False
variety                  False
winery                   False
dtype: bool

In [28]:
new_data.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,25.0,Sicily & Sardinia,Etna,Others,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Not known,Others,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",Not known,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Others,Alexander Peartree,Not given,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


I'm droping many columns as they didn't found usefull to me.

In [29]:
new_data.drop('region_1',inplace=True,axis=1)



In [30]:
new_data.drop('region_2',axis=1,inplace=True)

In [31]:
#new_data.drop('unnamed',axis=1)
new_data.drop('taster_name',axis=1,inplace=True)

In [32]:
new_data.drop('taster_twitter_handle',axis=1,inplace=True)


In [33]:
new_data.drop('province',axis=1,inplace=True)

In [34]:
new_data.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,25.0,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",Not known,87,14.0,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [35]:
count = new_data['variety'].value_counts()
count.tail(20)

Merlot-Grenache                 1
Roussanne-Grenache Blanc        1
Tempranillo-Tannat              1
Teroldego Rotaliano             1
Diamond                         1
Bombino Nero                    1
Tokay Pinot Gris                1
Pinot Grigio-Sauvignon Blanc    1
Merlot-Argaman                  1
Kangoun                         1
Pinot Blanc-Pinot Noir          1
Tinta Madeira                   1
Parraleta                       1
Premsal                         1
Zelen                           1
Petit Courbu                    1
Shiraz-Roussanne                1
Piquepoul Blanc                 1
Morava                          1
Moschofilero-Chardonnay         1
Name: variety, dtype: int64

In [36]:
data = new_data.groupby('variety').filter(lambda x: len(x) >100)

In [37]:
y = new_data['points'].values
X = new_data.drop(['points'], axis=1)
X.head(2)

Unnamed: 0.1,Unnamed: 0,country,description,designation,price,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,25.0,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,15.0,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos


## Splitting data into Train and cross validation(or test): Stratified Sampling

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33, stratify=y_train)

## Encoding Text feature: Description

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)

print("="*100)


vectorizer_description = CountVectorizer(min_df=10,max_features=5000)
vectorizer_description.fit(X_train['description'].values) # fit has to happen only on train data

# we use the fitted CountVectorizer to convert the text to vector
X_train_description_bow = vectorizer_description.transform(X_train['description'].values)
X_cv_description_bow = vectorizer_description.transform(X_cv['description'].values)
X_test_description_bow = vectorizer_description.transform(X_test['description'].values)

print("After vectorizations")
print(X_train_description_bow.shape, y_train.shape)
print(X_cv_description_bow.shape, y_cv.shape)
print(X_test_description_bow.shape, y_test.shape)
print("="*100)


(58343, 8) (58343,)
(28737, 8) (28737,)
(42891, 8) (42891,)
After vectorizations
(58343, 5000) (58343,)
(28737, 5000) (28737,)
(42891, 5000) (42891,)


## Encoding Categorical feature: Country

In [40]:
vectorizer_country = CountVectorizer()
vectorizer_country.fit(X_train['country'].values) # fit has to happen only on train data

# we use the fitted CountVectorizer to convert the text to vector
X_train_country_ohe = vectorizer_country.transform(X_train['country'].values)
X_cv_country_ohe = vectorizer_country.transform(X_cv['country'].values)
X_test_country_ohe = vectorizer_country.transform(X_test['country'].values)

print("After vectorizations")
print(X_train_country_ohe.shape, y_train.shape)
print(X_cv_country_ohe.shape, y_cv.shape)
print(X_test_country_ohe.shape, y_test.shape)
print(vectorizer_country.get_feature_names())
print("="*100)

After vectorizations
(58343, 47) (58343,)
(28737, 47) (28737,)
(42891, 47) (42891,)
['africa', 'and', 'argentina', 'australia', 'austria', 'bosnia', 'brazil', 'bulgaria', 'canada', 'chile', 'china', 'croatia', 'cyprus', 'czech', 'egypt', 'england', 'france', 'georgia', 'germany', 'greece', 'herzegovina', 'hungary', 'india', 'israel', 'italy', 'lebanon', 'macedonia', 'mexico', 'moldova', 'morocco', 'new', 'others', 'peru', 'portugal', 'republic', 'romania', 'serbia', 'slovakia', 'slovenia', 'south', 'spain', 'switzerland', 'turkey', 'ukraine', 'uruguay', 'us', 'zealand']


## Encoding Categorical feature: Designation

In [41]:
vectorizer_designation = CountVectorizer()
vectorizer_designation.fit(X_train['designation'].values) # fit has to happen only on train data

# we use the fitted CountVectorizer to convert the text to vector
X_train_designation_ohe = vectorizer_designation.transform(X_train['designation'].values)
X_cv_designation_ohe = vectorizer_designation.transform(X_cv['designation'].values)
X_test_designation_ohe = vectorizer_designation.transform(X_test['designation'].values)

print("After vectorizations")
print(X_train_designation_ohe.shape, y_train.shape)
print(X_cv_designation_ohe.shape, y_cv.shape)
print(X_test_designation_ohe.shape, y_test.shape)
print(vectorizer_designation.get_feature_names())
print("="*100)

After vectorizations
(58343, 15967) (58343,)
(28737, 15967) (28737,)
(42891, 15967) (42891,)
['002', '006', '01', '02', '03', '05', '068', '075', '08', '09', '10', '100', '1000', '101', '1028', '1040', '107', '109', '10rf', '10th', '11', '1100', '1105', '113', '114', '1147', '1149', '115', '119', '1194', '12', '120', '121', '1265', '128', '13', '130', '1300', '138', '14', '140', '1423', '146', '149', '1492', '15', '150', '150th', '152', '154', '1550', '156', '16', '1607', '1614', '1619', '1628', '1637', '168', '1698', '17', '172', '1735', '174', '175', '1753', '1762', '1769', '1772', '178', '18', '181', '1810', '1827', '1836', '1840', '1844', '1850', '1860', '1865', '1866', '1868', '1869', '1872', '1875', '1879', '1880', '1887', '1888', '1894', '19', '1903', '1904', '1905', '1907', '1908', '1909', '1910', '1912', '1914', '1919', '1922', '1924', '1927', '1935', '1939', '1940', '1955', '1970', '1973', '1979', '198', '1988', '1998', '1b', '1er', '1rdrs4', '1st', '20', '200', '2000', '2003

## Encoding Categorical feature: Title

In [42]:
vectorizer_title = CountVectorizer()
vectorizer_title.fit(X_train['title'].values) # fit has to happen only on train data

# we use the fitted CountVectorizer to convert the text to vector
X_train_title_ohe = vectorizer_title.transform(X_train['title'].values)
X_cv_title_ohe = vectorizer_title.transform(X_cv['title'].values)
X_test_title_ohe = vectorizer_title.transform(X_test['title'].values)

print("After vectorizations")
print(X_train_title_ohe.shape, y_train.shape)
print(X_cv_title_ohe.shape, y_cv.shape)
print(X_test_title_ohe.shape, y_test.shape)
print(vectorizer_title.get_feature_names())
print("="*100)

After vectorizations
(58343, 26068) (58343,)
(28737, 26068) (28737,)
(42891, 26068) (42891,)
['002', '006', '01', '02', '03', '05', '068', '075', '08', '09', '10', '100', '1000', '101', '1028', '103', '1040', '107', '109', '10rf', '10span', '10th', '11', '1100', '1105', '113', '114', '1147', '1149', '115', '119', '1194', '12', '120', '121', '1265', '128', '12c', '13', '130', '1300', '138', '14', '140', '1423', '146', '149', '1492', '15', '150', '150th', '152', '154', '1550', '156', '16', '1607', '1614', '1619', '1628', '1637', '168', '1698', '17', '172', '1735', '174', '175', '1752', '1753', '1762', '1769', '1772', '178', '1789', '18', '181', '1810', '1827', '1836', '1840', '18401', '1844', '1845', '1848', '1850', '1852', '1856', '1860', '1865', '1866', '1868', '1869', '1870', '1872', '1875', '1877', '1879', '1880', '1882', '1887', '1888', '1894', '1898', '19', '1903', '1904', '1905', '1907', '1908', '1909', '1910', '1912', '1914', '1919', '1922', '1924', '1927', '1934', '1935', '1939'

## Encoding Categorical feature: Variety

In [43]:
vectorizer_variety = CountVectorizer()
vectorizer_variety.fit(X_train['variety'].values) # fit has to happen only on train data

# we use the fitted CountVectorizer to convert the text to vector
X_train_variety_ohe = vectorizer_variety.transform(X_train['variety'].values)
X_cv_variety_ohe = vectorizer_variety.transform(X_cv['variety'].values)
X_test_variety_ohe = vectorizer_variety.transform(X_test['variety'].values)

print("After vectorizations")
print(X_train_variety_ohe.shape, y_train.shape)
print(X_cv_variety_ohe.shape, y_cv.shape)
print(X_test_variety_ohe.shape, y_test.shape)
print(vectorizer_variety.get_feature_names())
print("="*100)

After vectorizations
(58343, 471) (58343,)
(28737, 471) (28737,)
(42891, 471) (42891,)
['abouriou', 'affile', 'agiorgitiko', 'aglianico', 'albana', 'albanello', 'albariño', 'albarossa', 'aleatico', 'alfrocheiro', 'alicante', 'aligoté', 'alsace', 'altesse', 'alvarelhão', 'alvarinho', 'and', 'angevine', 'ansonica', 'antão', 'apple', 'aragonez', 'aragonês', 'arinto', 'arneis', 'asprinio', 'assyrtico', 'assyrtiko', 'athiri', 'austrian', 'auxerrois', 'avesso', 'avola', 'azal', 'babić', 'bacchus', 'baco', 'baga', 'barbera', 'barroca', 'bastardo', 'bianca', 'bianco', 'bical', 'black', 'blanc', 'blanca', 'blanco', 'blatina', 'blauer', 'blaufränkisch', 'blend', 'bobal', 'bois', 'bombino', 'bonarda', 'bordeaux', 'bouschet', 'bovale', 'boğazkere', 'brachetto', 'braucol', 'bual', 'cabernet', 'canaiolo', 'candia', 'canelli', 'cannonau', 'cao', 'cappuccio', 'carignan', 'carignane', 'carignano', 'carineña', 'cariñena', 'carmenère', 'carricante', 'casavecchia', 'castelão', 'catarratto', 'cayuga', 'cer

## Encoding Categorical feature: Winery

In [44]:
vectorizer_winery = CountVectorizer()
vectorizer_winery.fit(X_train['winery'].values) # fit has to happen only on train data

# we use the fitted CountVectorizer to convert the text to vector
X_train_winery_ohe = vectorizer_winery.transform(X_train['winery'].values)
X_cv_winery_ohe = vectorizer_winery.transform(X_cv['winery'].values)
X_test_winery_ohe = vectorizer_winery.transform(X_test['winery'].values)

print("After vectorizations")
print(X_train_winery_ohe.shape, y_train.shape)
print(X_cv_winery_ohe.shape, y_cv.shape)
print(X_test_winery_ohe.shape, y_test.shape)
print(vectorizer_winery.get_feature_names())
print("="*100)

After vectorizations
(58343, 12837) (58343,)
(28737, 12837) (28737,)
(42891, 12837) (42891,)
['10', '100', '103', '10span', '12', '128', '12c', '14', '1492', '15', '154', '16', '1637', '1752', '1789', '18401', '1845', '1848', '1850', '1852', '1856', '1860', '1868', '1870', '1875', '1877', '1882', '1898', '1912', '21', '22', '24', '29', '2hawk', '2nd', '2plank', '32', '33', '35', '351', '36', '360', '38', '39', '3cv', '3fools', '401k', '41', '42', '44', '460', '46n118', '4r4u', '50', '52', '5os', '60', '6cepas6', '7200', '75', '772', '786', '800', '868', '88', '91', 'aaron', 'abacela', 'abad', 'abadia', 'abadía', 'abarbanel', 'abbadia', 'abbaye', 'abbazia', 'abbey', 'abbeyville', 'abbona', 'abbotts', 'abeja', 'abele', 'abelis', 'abella', 'abelé', 'aberrant', 'abiouness', 'abiqua', 'abraham', 'abrantes', 'abrigo', 'acacia', 'acaibo', 'acate', 'accadia', 'accordini', 'accornero', 'aces', 'achaia', 'achaval', 'acinum', 'aciprestes', 'acker', 'ackerman', 'aconcagua', 'aconga', 'aconquija', 

## Encoding Numerical feature: Price

In [47]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
# normalizer.fit(X_train['price'].values)
# this will rise an error Expected 2D array, got 1D array instead: 
# array=[105.22 215.96  96.01 ... 368.98  80.53 709.67].
# Reshape your data either using 
# array.reshape(-1, 1) if your data has a single feature 
# array.reshape(1, -1)  if it contains a single sample.
normalizer.fit(X_train['price'].values.reshape(1,-1))

X_train_price_norm = normalizer.transform(X_train['price'].values.reshape(1,-1))
X_cv_price_norm = normalizer.transform(X_cv['price'].values.reshape(1,-1))
X_test_price_norm = normalizer.transform(X_test['price'].values.reshape(1,-1))


print(X_train_price_norm[0:5])
print(X_train_price_norm.shape)
print(X_cv_price_norm[0:5])
print(X_cv_price_norm.shape)
print(X_test_price_norm[0:5])
print(X_test_price_norm.shape)

X_train_price_norm=X_train_price_norm.reshape(-1,1)
X_cv_price_norm=X_cv_price_norm.reshape(-1,1)
X_test_price_norm=X_test_price_norm.reshape(-1,1)

[[0.00248062 0.0104026  0.00112028 ... 0.00352088 0.0020005  0.004001  ]]
(1, 58343)
[[0.01544051 0.00076115 0.00163104 ... 0.00380576 0.0027184  0.0027184 ]]
(1, 28737)
[[0.0061229  0.00365546 0.00191912 ... 0.00228466 0.0020105  0.00118803]]
(1, 42891)


## Concatinating all the features

In [50]:
from scipy.sparse import hstack

X_tr = hstack((X_train_description_bow,X_train_country_ohe,X_train_designation_ohe,X_train_title_ohe,X_train_variety_ohe,X_train_winery_ohe,X_train_price_norm)).tocsr()

X_cr = hstack((X_cv_description_bow,X_cv_country_ohe,X_cv_designation_ohe,X_cv_title_ohe,X_cv_variety_ohe,X_cv_winery_ohe,X_cv_price_norm)).tocsr()

X_te = hstack((X_test_description_bow,X_test_country_ohe,X_test_designation_ohe,X_test_title_ohe,X_test_variety_ohe,X_test_winery_ohe,X_test_price_norm)).tocsr()


print("Final Data matrix")
print(X_tr.shape, y_train.shape)
print(X_cr.shape, y_cv.shape)
print(X_te.shape, y_test.shape)
print("="*100)

Final Data matrix
(58343, 60391) (58343,)
(28737, 60391) (28737,)
(42891, 60391) (42891,)


In [54]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(max_depth = 3)
clf = decision_tree.fit(X_tr ,y_train)

In [57]:
clf.score(X_tr ,y_train,sample_weight=None)

0.17427969079409697

In [60]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
clf = SVC( gamma='auto')
clf.fit(X_tr ,y_train)

SVC(gamma='auto')

In [61]:
clf.score(X_tr ,y_train,sample_weight=None)

0.13238948974169995

In [2]:
from prettytable import PrettyTable


x = PrettyTable()
x.field_names = ["Model", "Best score"]

x.add_row(["Decision tree", 0.1742 ])
x.add_row(["SVM",0.1323 ])

print(x)

+---------------+------------+
|     Model     | Best score |
+---------------+------------+
| Decision tree |   0.1742   |
|      SVM      |   0.1323   |
+---------------+------------+
