## SPANISH WINE DATASET

In [1]:
#importing libraries for reading dataset

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the Spanish wine CSV into a DataFrame, then preview its first ten rows
wine = pd.read_csv(filepath_or_buffer="wines_SPA2.csv")
wine.iloc[:10]

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0
5,Vega Sicilia,Unico,1998,4.8,1209,Espana,Ribera del Duero,490.0,Ribera Del Duero Red,5.0,3.0
6,Vega Sicilia,Unico,2010,4.8,1201,Espana,Ribera del Duero,349.0,Ribera Del Duero Red,5.0,3.0
7,Vega Sicilia,Unico,1995,4.8,926,Espana,Ribera del Duero,810.89,Ribera Del Duero Red,5.0,3.0
8,Vega Sicilia,Unico Reserva Especial Edicion,2015,4.8,643,Espana,Ribera del Duero,345.0,Ribera Del Duero Red,5.0,3.0
9,Vega Sicilia,Unico,2011,4.8,630,Espana,Ribera del Duero,315.0,Ribera Del Duero Red,5.0,3.0


# Descriptive Analysis

In [3]:
# Print each column's dtype and non-null count; columns with fewer non-null
# entries than the row count contain missing values.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       7500 non-null   object 
 1   wine         7500 non-null   object 
 2   year         7498 non-null   object 
 3   rating       7500 non-null   float64
 4   num_reviews  7500 non-null   int64  
 5   country      7500 non-null   object 
 6   region       7500 non-null   object 
 7   price        7500 non-null   float64
 8   type         6955 non-null   object 
 9   body         6331 non-null   float64
 10  acidity      6331 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 644.7+ KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
wine.describe()

Unnamed: 0,rating,num_reviews,price,body,acidity
count,7500.0,7500.0,7500.0,6331.0,6331.0
mean,4.254933,451.109067,60.095822,4.158427,2.946612
std,0.118029,723.001856,150.356676,0.583352,0.248202
min,4.2,25.0,4.99,2.0,1.0
25%,4.2,389.0,18.9,4.0,3.0
50%,4.2,404.0,28.53,4.0,3.0
75%,4.2,415.0,51.35,5.0,3.0
max,4.9,32624.0,3119.08,5.0,3.0


In [5]:
# Mean and standard deviation of the numeric columns only.
# Aggregating the whole frame also passes the text columns (winery, wine, year,
# region, ...) to "mean"/"std": older pandas drops them with a warning, while
# pandas >= 2.0 raises instead. Selecting the numeric columns first gives the
# same table without relying on that behaviour.
wine.select_dtypes(include="number").agg(["mean", "std"])

  wine.agg(["mean", "std"])


Unnamed: 0,rating,num_reviews,price,body,acidity
mean,4.254933,451.109067,60.095822,4.158427,2.946612
std,0.118029,723.001856,150.356676,0.583352,0.248202


In [6]:
# (rows, columns) of the dataset as loaded, before any cleaning
wine.shape

(7500, 11)

# Data Cleaning

In [7]:
# Discard every row whose `type` is missing; all other rows are kept unchanged
wine = wine.dropna(axis=0, how="any", subset=["type"])

In [8]:
# Impute the remaining gaps: most frequent value for `year` (a text column),
# median for the numeric `body` and `acidity` columns.
#
# `wine[col].fillna(..., inplace=True)` fills a temporary column selection, not
# necessarily `wine` itself: it triggers chained-assignment warnings and leaves
# the frame unfilled once pandas Copy-on-Write is enabled. Building a new frame
# with `assign` has neither problem, keeps the column order, and is safe to re-run.
wine = wine.assign(
    year=wine['year'].fillna(wine['year'].mode()[0]),
    body=wine['body'].fillna(wine['body'].median()),
    acidity=wine['acidity'].fillna(wine['acidity'].median()),
)


In [9]:
# Count the missing values left in each column (all zeros means cleaning is complete)
wine.isna().sum()

winery         0
wine           0
year           0
rating         0
num_reviews    0
country        0
region         0
price          0
type           0
body           0
acidity        0
dtype: int64

In [None]:
# (rows, columns) after dropping the rows with a missing `type`
wine.shape

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# The frame still contains text columns at this point (winery, wine, region, ...),
# and pandas >= 2.0 raises on them unless numeric_only=True is passed explicitly.
wine.corr(numeric_only=True)

# MODEL TRAINING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode categorical variables.
# The encoding is applied to a copy so that `wine` keeps its original string
# values: overwriting `wine` made this cell non-repeatable (a second run would
# fit the encoders on integer codes and lose the original labels) and silently
# changed what every earlier cell sees when re-run.
categorical_columns = ['winery', 'wine', 'year', 'country', 'region', 'type']
wine_encoded = wine.copy()
label_encoders = {}  # column name -> fitted LabelEncoder, kept to map codes back to labels
for column in categorical_columns:
    le = LabelEncoder()
    wine_encoded[column] = le.fit_transform(wine_encoded[column])
    label_encoders[column] = le

# Define features and target variable: predict the (encoded) wine type from all other columns
X = wine_encoded.drop('type', axis=1)
y = wine_encoded['type']

# Split the data into training and testing sets (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=55)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

# MODEL EVALUATION

In [None]:
from sklearn.metrics import classification_report

# `y_pred` was not defined anywhere before this cell, so it raised NameError on a
# fresh "Restart & Run All". Fit a seeded random forest on the training split here
# so the report has test-set predictions to score.
y_pred = RandomForestClassifier(random_state=55).fit(X_train, y_train).predict(X_test)

# Per-class precision, recall and F1 on the held-out test split
print(classification_report(y_test, y_pred))


# MODEL PREDICTION

In [None]:
# Random forest: fit on the training split, then compare training and testing accuracy.
# A fixed random_state makes the fitted forest, the reported accuracies and the
# model that is pickled later identical on every run.
rfc = RandomForestClassifier(random_state=55)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
rfc_acc = accuracy_score(y_test, rfc_pred)
print("The training accuracy for Random Forest is:", round(rfc.score(X_train, y_train)*100,2), "%")
print("The testing accuracy for Random Forest is:", round(rfc_acc * 100,2), "%")

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree: fit on the training split, then compare training and testing accuracy.
# A fixed random_state makes tie-breaking between equally good splits repeatable,
# so the reported accuracies do not change from run to run.
dtc = DecisionTreeClassifier(random_state=55)
dtc.fit(X_train, y_train)
dtc_pred = dtc.predict(X_test)
dtc_acc = accuracy_score(y_test, dtc_pred)
print("The training accuracy for decision tree classifier is:", round(dtc.score(X_train, y_train)*100,2), "%")
print("The testing accuracy for decision tree classifier is:", round(dtc_acc * 100,2), "%")

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging ensemble: fit on the training split, then compare training and testing accuracy.
# A fixed random_state makes the bootstrap samples, and therefore the reported
# accuracies, repeatable from run to run.
bc = BaggingClassifier(random_state=55)
bc.fit(X_train, y_train)
bc_pred = bc.predict(X_test)
bc_acc = accuracy_score(y_test, bc_pred)
print("The training accuracy for bagging classifier is:", round(bc.score(X_train, y_train)*100,2), "%")
print("The testing accuracy for bagging classifier is:", round(bc_acc * 100,2), "%")

# SAVING THE MODEL

In [None]:
import joblib

# Persist the fitted random forest (`rfc`) to disk with joblib.
# NOTE(review): the fitted label encoders are not saved alongside it — confirm
# whether whatever loads this file needs them to encode new data.
filename = './wine_classifer.pkl'
joblib.dump(value=rfc, filename=filename)
