# Imports

In [1]:
# data manipulation libraries
import pandas as pd
import numpy as np
# show up to 60 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 60)

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize

# display visuals inline in the notebook
%matplotlib inline

# enable high-resolution ('retina') plots
%config InlineBackend.figure_format='retina'

# feature extraction and preprocessing
import re
import datetime

# feature transformation and preprocessing
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Understand & Clean & Format Data

In [2]:
# Load the train and test splits from CSV files (paths are relative to the notebook).
train = pd.read_csv("../data/train/train.csv")
test = pd.read_csv("../data/test/test.csv")
# Preview five random training rows; the fixed random_state keeps the preview
# identical across "Restart Kernel -> Run All" executions.
train.sample(5, random_state=42)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
3200,Portugal,"Full and round, this is a ripe blackberry-flav...",Samora Tinto,85,12.0,Tejo,,,Roger Voss,@vossroger,Wines & Winemakers 2012 Samora Tinto Red (Tejo),Portuguese Red,Wines & Winemakers
6908,US,To call it entry level does not do justice to ...,Unfiltered,91,29.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,J. Christopher 2012 Unfiltered Pinot Noir (Wil...,Pinot Noir,J. Christopher
6385,US,"Black cherry juice, anise, thyme and pepper sh...",,87,16.0,California,Paso Robles,Central Coast,Matt Kettmann,@mattkettmann,Maddalena 2012 Merlot (Paso Robles),Merlot,Maddalena
4642,US,"Quite earthy in profile with hints of smoke, p...",Dry,87,16.0,New York,Finger Lakes,Finger Lakes,Anna Lee C. Iijima,,Billsboro 2010 Dry Riesling (Finger Lakes),Riesling,Billsboro
6253,Germany,Hints of bramble and earth lend savory nuances...,Bernkasteler alte Badstube am Doctorberg Kabinett,89,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. Heidemanns-Bergweiler 2014 Bernkasteler al...,Riesling,Dr. Heidemanns-Bergweiler


In [3]:
# Report the training set's dimensions; train.shape is (rows, columns).
print("There are {} rows and {} columns in the train dataset.".format(*train.shape))

There are 9000 rows and 13 columns in the train dataset.


In [4]:
# Report the test set's dimensions; test.shape is (rows, columns).
print("There are {} rows and {} columns in the test dataset.".format(*test.shape))

There are 1000 rows and 13 columns in the test dataset.


# Descriptive statistics & information about datasets

In [5]:
# Summarize the training set: per-column non-null counts and dtypes, plus memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   country                8994 non-null   object 
 1   description            9000 non-null   object 
 2   designation            6455 non-null   object 
 3   points                 9000 non-null   int64  
 4   price                  8403 non-null   float64
 5   province               8994 non-null   object 
 6   region_1               7505 non-null   object 
 7   region_2               3469 non-null   object 
 8   taster_name            7223 non-null   object 
 9   taster_twitter_handle  6888 non-null   object 
 10  title                  9000 non-null   object 
 11  variety                9000 non-null   object 
 12  winery                 9000 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 914.2+ KB


In [6]:
# Summarize the test set: per-column non-null counts and dtypes, plus memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   country                1000 non-null   object 
 1   description            1000 non-null   object 
 2   designation            716 non-null    object 
 3   points                 1000 non-null   int64  
 4   price                  920 non-null    float64
 5   province               1000 non-null   object 
 6   region_1               831 non-null    object 
 7   region_2               384 non-null    object 
 8   taster_name            792 non-null    object 
 9   taster_twitter_handle  756 non-null    object 
 10  title                  1000 non-null   object 
 11  variety                1000 non-null   object 
 12  winery                 1000 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 101.7+ KB


In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training set.
train.describe()

Unnamed: 0,points,price
count,9000.0,8403.0
mean,88.455222,35.532191
std,3.025945,40.750683
min,80.0,5.0
25%,86.0,17.0
50%,88.0,25.0
75%,91.0,42.0
max,100.0,1300.0


The majority of the features are categorical, and some data is missing in both datasets. Machine learning models generally require numerical, non-missing values. Thus, we are going to develop strategies in Feature Engineering to impute the missing data and transform the categorical values into numeric values.

In [8]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the test set.
test.describe()

Unnamed: 0,points,price
count,1000.0,920.0
mean,88.503,34.675
std,3.067475,42.240874
min,80.0,7.0
25%,86.0,17.0
50%,88.0,25.0
75%,91.0,41.0
max,97.0,1000.0
