# IBM Applied Data Science Capstone

## Import Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

## Load dataset from CSV File

In [2]:
data = pd.read_csv("winemag-data_first150k.csv")

## Inspect dataset

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   150930 non-null  int64  
 1   country      150925 non-null  object 
 2   description  150930 non-null  object 
 3   designation  105195 non-null  object 
 4   points       150930 non-null  int64  
 5   price        137235 non-null  float64
 6   province     150925 non-null  object 
 7   region_1     125870 non-null  object 
 8   region_2     60953 non-null   object 
 9   variety      150930 non-null  object 
 10  winery       150930 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 12.7+ MB


## View dataset

In [4]:
data.head(20)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude
5,5,Spain,"Deep, dense and pure from the opening bell, th...",Numanthia,95,73.0,Northern Spain,Toro,,Tinta de Toro,Numanthia
6,6,Spain,Slightly gritty black-fruit aromas include a s...,San Román,95,65.0,Northern Spain,Toro,,Tinta de Toro,Maurodos
7,7,Spain,Lush cedary black-fruit aromas are luxe and of...,Carodorum Único Crianza,95,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
8,8,US,This re-named vineyard was formerly bottled as...,Silice,95,65.0,Oregon,Chehalem Mountains,Willamette Valley,Pinot Noir,Bergström
9,9,US,The producer sources from two blocks of the vi...,Gap's Crown Vineyard,95,60.0,California,Sonoma Coast,Sonoma,Pinot Noir,Blue Farm


## Drop unnecessary columns with low correlation

In [5]:
data.drop(columns=['Unnamed: 0', 'description', 'region_2'], inplace=True)

In [6]:
data.head()

Unnamed: 0,country,designation,points,price,province,region_1,variety,winery
0,US,Martha's Vineyard,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz
1,Spain,Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sauvignon Blanc,Macauley
3,US,Reserve,96,65.0,Oregon,Willamette Valley,Pinot Noir,Ponzi
4,France,La Brûlade,95,66.0,Provence,Bandol,Provence red blend,Domaine de la Bégude


## Convert string values to numbers

In [6]:
factor = pd.factorize(data['variety'])
data.variety = factor[0]

factor = pd.factorize(data['country'])
data.country = factor[0]

factor = pd.factorize(data['province'])
data.province = factor[0]

factor = pd.factorize(data['region_1'])
data.region_1 = factor[0]

factor = pd.factorize(data['winery'])
data.winery = factor[0]

factor = pd.factorize(data['designation'])
data.designation = factor[0]

In [7]:
data.head()

Unnamed: 0,country,designation,points,price,province,region_1,variety,winery
0,0,0,96,235.0,0,0,0,0
1,1,1,96,110.0,1,1,1,1
2,0,2,96,90.0,0,2,2,2
3,0,3,96,65.0,2,3,3,3
4,2,4,95,66.0,3,4,4,4


## Show correlations of the data

In [8]:
data.corr()

Unnamed: 0,country,designation,points,price,province,region_1,variety,winery
country,1.0,0.08507,-0.034472,-0.070326,0.647056,-0.113335,0.127098,0.03605
designation,0.08507,1.0,0.055096,0.075407,0.089102,0.091511,0.02631,0.158762
points,-0.034472,0.055096,1.0,0.459863,-0.076019,-0.061243,-0.00519,-0.14317
price,-0.070326,0.075407,0.459863,1.0,-0.084021,-0.016428,-0.010404,-0.045394
province,0.647056,0.089102,-0.076019,-0.084021,1.0,0.046251,0.126546,0.128598
region_1,-0.113335,0.091511,-0.061243,-0.016428,0.046251,1.0,0.221921,0.203264
variety,0.127098,0.02631,-0.00519,-0.010404,0.126546,0.221921,1.0,0.068565
winery,0.03605,0.158762,-0.14317,-0.045394,0.128598,0.203264,0.068565,1.0


## Assign X and Y Values

In [8]:
X = data[['country', 'province', 'region_1', 'winery', 'designation']].values
Y = data['variety'].values

## Split dataset to train and test

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

## Normalize X values

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Create, Train, and Predict Random Forest Classifier

In [11]:
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy')
classifier.fit(X_train, y_train)
y = classifier.predict(X_test)

## Evaluate Model Performance

In [12]:
print("Accuracy: " + str(round(accuracy_score(y_test, y) * 100, 2)) + "%")

Accuracy: 65.47%


In [13]:
print("F1 Score: " + str(f1_score(y_test, y, average='weighted')))

F1 Score: 0.6521461861477633
