In [1]:
from copy import copy, deepcopy
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split 
from sklearn import metrics

import sklearn.linear_model
import pandas as pd 
import numpy as np
import sys
import re
%matplotlib inline

In [2]:
# Load the Wine Dataset
df_wine = pd.read_csv("../data/winemag-data-130k-v2.csv", encoding = 'utf8', index_col=0)
df_wine = df_wine.reset_index()

In [3]:
print(df_wine.dtypes)

index                      int64
country                   object
description               object
designation               object
points                     int64
price                    float64
province                  object
region_1                  object
region_2                  object
taster_name               object
taster_twitter_handle     object
title                     object
variety                   object
winery                    object
dtype: object


# REGION CHECK

In [6]:
df_wine["region"] = df_wine["region_2"].fillna(df_wine["region_1"])
df_wine.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,region
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,Etna
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,Willamette Valley
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,Lake Michigan Shore
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Willamette Valley


In [7]:
df_wine = df_wine[['country','province','region','price','title','variety','points']]
#df_wine = df_wine[['country','province','region','price','variety','points']]

In [8]:
# Finding the number of rows with NULL values
df_wine.isnull().sum()

country        63
province       63
region      21247
price        8996
title           0
variety         1
points          0
dtype: int64

In [9]:
# DROP rows with NULL values
df_wine=df_wine.dropna()
df_wine.shape

(101400, 7)

In [10]:
df_wine['points'].describe()

count    101400.000000
mean         88.463343
std           3.060467
min          80.000000
25%          86.000000
50%          88.000000
75%          91.000000
max         100.000000
Name: points, dtype: float64

In [11]:
print(df_wine.dtypes)

country      object
province     object
region       object
price       float64
title        object
variety      object
points        int64
dtype: object


In [13]:
# GET DUMMY VARIABLES
from sklearn.preprocessing import LabelEncoder
#Auto encodes any dataframe column of type category or object.
def dummyEncode(df):
        columnsToEncode = list(df.select_dtypes(include=['category','object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                df[feature] = le.fit_transform(df[feature])
            except:
                print('Error encoding '+feature)
        return df

In [14]:
df_dummy = dummyEncode(df_wine[['country','province','region','variety']])
df_dummy.columns = ['country_num', 'province_num','region_num','variety_num']
df_wine = pd.concat([df_wine, df_dummy], axis=1)
df_wine.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,country,province,region,price,title,variety,points,country_num,province_num,region_num,variety_num
2,US,Oregon,Willamette Valley,14.0,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,87,6,43,1009,318
3,US,Michigan,Lake Michigan Shore,13.0,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,87,6,30,477,347
4,US,Oregon,Willamette Valley,65.0,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,87,6,43,1009,322
5,Spain,Northern Spain,Navarra,15.0,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,87,5,39,639,430
6,Italy,Sicily & Sardinia,Vittoria,16.0,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,87,4,50,999,131


# Extracting Year from Title

In [15]:
# REGEX: Extract first numbers from Title - FAILED BECAUSE THERE ARE MANY NUMBERS IN TITLES
df = pd.DataFrame(df_wine['title'])
df['year'] = df['title'].str.extract('(\d+)')
df.year.unique()

array(['2013', '2012', '2011', '2010', '2007', '2009', '2014', '2015',
       nan, '2016', '2', '2004', '2003', '2006', '2008', '2001', '2005',
       '2002', '9', '46', '1887', '2000', '1999', '1991', '1997', '772',
       '1', '41', '42', '44', '14', '33', '2017', '1637', '35', '39',
       '1996', '4', '3', '012', '401', '181', '1492', '1898', '1998',
       '7200', '1852', '50', '7', '12', '66', '1995', '1994', '1992',
       '18401', '15', '5', '6', '1929', '240', '075', '17', '1875', '22',
       '10', '786', '21', '8', '38', '351', '460', '1856', '91', '29',
       '24', '25', '1990', '1988', '154', '511', '1827', '1860', '45',
       '735', '1872', '52', '109', '204', '150', '1850', '337', '1877',
       '30', '310', '1870', '100', '205', '1000', '1868', '16', '103',
       '585', '413', '1989', '1993', '360', '32', '20', '1882', '51',
       '375', '1821', '47', '158', '69', '128', '1947', '13', '1070',
       '1985', '1927', '1904', '68', '1847', '1982', '1986', '90', '736',


In [16]:
# FUNCTION: FIND ALL NUMBERS IN THE TITLE, ADD THEM TO A LIST VARIABLE
def regex(x):
    L = re.findall(r'\d+', str(x))
    L_str = ",".join(str(x) for x in L)
    return str(L_str)

In [17]:
df['title_num_values'] = df['title'].apply(regex)

In [18]:
# FINDING HOW MANY NUMBERS DOES EACH TITLE INCLUDE
df['num_count'] = df.title_num_values.str.count(',') + 1
numeric_values_ct = df.num_count.max()
df.head()

Unnamed: 0,title,year,title_num_values,num_count
2,Rainstorm 2013 Pinot Gris (Willamette Valley),2013,2013,1
3,St. Julian 2013 Reserve Late Harvest Riesling ...,2013,2013,1
4,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,2012,2012,1
5,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,2011,2011,1
6,Terre di Giurfo 2013 Belsito Frappato (Vittoria),2013,2013,1


In [19]:
# FOR ALL THE NUMERIC VALUES IN THE TITLES FIND THEM AND ADD TO A NEW DATA FRAME AS COLUMNS
def regex2(y):
    L = re.findall(r'\d+', str(y))
    X = np.array([L])
    new_L = []
    for x in X:
        a = x.tolist()
        b = []
        for i in range(1, numeric_values_ct + 1):
            if len(a) == numeric_values_ct:
                new_L.append(a)
                break
            else:
                for j in range(0, numeric_values_ct - len(a)):
                     b = a.extend([0])
    return new_L

In [20]:
xx = df['title'].apply(regex2)
new_list1 = []
new_list2 = []
new_list3 = []
new_list4 = []
new_list5 = []
for row in xx:
    new_list1.append(row[0][0])
    new_list2.append(row[0][1])
    new_list3.append(row[0][2])
    new_list4.append(row[0][3])
    new_list5.append(row[0][4])

In [21]:
df.insert(loc=0, column='N5', value= pd.Series(new_list5))
df.insert(loc=0, column='N4', value= pd.Series(new_list4))
df.insert(loc=0, column='N3', value= pd.Series(new_list3))
df.insert(loc=0, column='N2', value= pd.Series(new_list2))
df.insert(loc=0, column='N1', value= pd.Series(new_list1))

In [22]:
# SET 0 FOR ALL THE VALUES WHICH HAS MORE THAN 4 DIGIT
df.loc[df['N1'].astype(str).map(len) != 4 , 'N1'] = 0
df.loc[df['N2'].astype(str).map(len) != 4 , 'N2'] = 0
df.loc[df['N3'].astype(str).map(len) != 4 , 'N3'] = 0
df.loc[df['N4'].astype(str).map(len) != 4 , 'N4'] = 0
df.loc[df['N5'].astype(str).map(len) != 4 , 'N5'] = 0

# SET 0 FOR ALL THE VALUES GREATER THAN CURRENT YEAR
df.loc[df['N1'].astype(int) > 2018 , 'N1'] = 0
df.loc[df['N2'].astype(int) > 2018 , 'N2'] = 0
df.loc[df['N3'].astype(int) > 2018 , 'N3'] = 0
df.loc[df['N4'].astype(int) > 2018 , 'N4'] = 0
df.loc[df['N5'].astype(int) > 2018 , 'N5'] = 0

In [23]:
df['year_of_wine'] = df[['N5','N4','N3','N2','N1']].astype(int).max(axis=1)

In [24]:
# DELETE ALL THE YEARS BEFORE 1900 
df = df[['title', 'year_of_wine']]
df.loc[df['year_of_wine']  < 1900 , 'year_of_wine'] = 0
df.year_of_wine.unique()

array([2012, 2011, 2013, 2010, 2007, 2009, 2014, 2015,    0, 2016, 2004,
       2003, 2006, 2008, 2001, 2005, 2000, 1999, 1991, 2002, 1997, 2017,
       1996, 1998, 1995, 1994, 1992, 1990, 1989, 1993, 1947, 1988, 1927,
       1904, 1982, 1985, 1987, 1978, 1986, 1945])

In [25]:
# CONCAT THE YEAR COLUMN TO MAIN DATAFRAME
try:
    if 'year_of_wine' not in df_wine:
        df_wine = pd.concat([df_wine, df.year_of_wine], axis=1)
except (RuntimeError, TypeError, NameError):
    print("Error: concating year to main dataframe")

In [26]:
df_wine.head(3)

Unnamed: 0,country,province,region,price,title,variety,points,country_num,province_num,region_num,variety_num,year_of_wine
2,US,Oregon,Willamette Valley,14.0,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,87,6,43,1009,318,2012
3,US,Michigan,Lake Michigan Shore,13.0,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,87,6,30,477,347,2011
4,US,Oregon,Willamette Valley,65.0,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,87,6,43,1009,322,2013


In [27]:
df_wine[['title','year_of_wine']].head()

Unnamed: 0,title,year_of_wine
2,Rainstorm 2013 Pinot Gris (Willamette Valley),2012
3,St. Julian 2013 Reserve Late Harvest Riesling ...,2011
4,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,2013
5,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,2012
6,Terre di Giurfo 2013 Belsito Frappato (Vittoria),2012


# Convert categorical variables to numeric with One Hot Encoding

In [28]:
# Variety
df_variety = df_wine.variety.str.get_dummies()
df_variety.columns = ['v_' + col for col in df_variety.columns] # naming the columns
df_wine = pd.concat([df_wine, df_variety], axis=1) # combine main dataframe with country matrix dataframe
df_wine = df_wine.drop('variety', axis=1) # drop the country attribute since different attributes per country value
df_wine = df_wine.drop('variety_num', axis=1) # drop the country attribute since different attributes per country value

In [29]:
# Country
df_country = df_wine.country.str.get_dummies() # get the pivot of the country attribte
df_country.columns = ['country_' + col for col in df_country.columns] # naming the columns
df_wine = pd.concat([df_wine, df_country], axis=1) # combine main dataframe with country matrix dataframe
df_wine = df_wine.drop('country', axis=1) # drop the country attribute since different attributes per country value
df_wine = df_wine.drop('country_num', axis=1) # drop the country attribute since different attributes per country value

In [30]:
# Province
df_province = df_wine.province.str.get_dummies() # get the pivot of the country attribte
df_province.columns = ['province_' + col for col in df_province.columns] # naming the columns
df_wine = pd.concat([df_wine, df_province], axis=1) # combine main dataframe with country matrix dataframe
df_wine = df_wine.drop('province', axis=1) # drop the country attribute since different attributes per country value
df_wine = df_wine.drop('province_num', axis=1) # drop the country attribute since different attributes per country value

In [31]:
# Region
#df_region = df_wine.region.str.get_dummies() # get the pivot of the country attribte
#df_region.columns = ['region_' + col for col in df_province.columns] # naming the columns
#df_wine = pd.concat([df_wine, df_region], axis=1) # combine main dataframe with country matrix dataframe
df_wine = df_wine.drop('region', axis=1) # drop the country attribute since different attributes per country value
df_wine = df_wine.drop('region_num', axis=1) # drop the country attribute since different attributes per country value


# Data Preparation

In [32]:
# DEFINE X and Y VARIABLES
#df_wine = df_wine[df_wine['year_of_wine'] > 0]
df_X = df_wine.drop('points', axis=1)
df_X = df_X.drop('year_of_wine', axis=1)
df_X = df_X.drop('title', axis=1) # year was already extracted
df_Y = df_wine[['points']]
print(df_X.shape)
print(df_Y.shape)

(101400, 582)
(101400, 1)


In [33]:
df_X = df_X2
df_Y = df_Y2