# Data creation

## Installing the required libraries 

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Function to create the dataset

In [2]:
import numpy as np
import pandas as pd
import random
    

def generate_whiskey(num_rows=500):
    """
    Generate a balanced DataFrame with whiskey data across all price categories.
    
    Parameters:
    - num_rows: int — Number of rows to generate (default 500).
    
    Returns:
    - pd.DataFrame — Whiskey dataset.
    """

    # constants
    brands = ["Macallan", "Glenfiddich", "Yamazaki", "Lagavulin", "Jack Daniel's", 
              "Buffalo Trace", "Balvenie", "Ardbeg", "Jameson", "Highland Park"]
    types = ["Scotch", "Bourbon", "Rye", "Japanese", "Irish"]
    regions = {
        "Scotch": ["Islay", "Speyside", "Highlands", "Lowlands"],
        "Bourbon": ["Kentucky", "Tennessee"],
        "Rye": ["Canada", "USA"],
        "Japanese": ["Honshu", "Hokkaido"],
        "Irish": ["Dublin", "Cork"]
    }
    cask_types = ["Sherry", "Bourbon", "Port", "Wine", "Rum", "Tequila"]
    bottling_types = ["Single Malt", "Blended", "Single Cask", "Cask Strength"]
    category_definitions = {
        "Basic": (25, 49),
        "Standard": (50, 88),
        "Premium": (89, 128),
        "Exclusive": (129, 278),
        "Luxury": (279, 500)
    }

    data = []

    for _ in range(num_rows):
        brand = random.choice(brands)
        w_type = random.choice(types)
        region = random.choice(regions[w_type])
        age = np.random.choice([0, *range(3, 31)], p=[0.1] + [0.9 / 28] * 28)
        abv = round(random.uniform(40, 60), 1)

        if random.random() < 0.03:
            age = random.choice([0, 50])
            abv = round(random.uniform(30, 75), 1)
            region = "Unknown"
            cask = "Tequila"
        else:
            cask = random.choice(cask_types)

        bottling = random.choice(bottling_types)
        limited = np.random.rand() < 0.15
        release_year = random.randint(1990, 2025)
        awards = np.random.poisson(1.5)
        base_rating = 85 + (age / 30) * 10 + awards
        avg_rating = round(np.random.normal(base_rating, 1.5), 1)

        base_price = round(np.random.uniform(25, 500), 2)
        noise = np.random.normal(0, base_price * 0.1)
        price = round(base_price + noise, 2)
        price = max(20, price)

        category = next((k for k, (lo, hi) in category_definitions.items() if lo <= price <= hi), "Luxury")

        if avg_rating < 85:
            rating_category = "Low"
        elif avg_rating < 90:
            rating_category = "Medium"
        elif avg_rating < 95:
            rating_category = "High"
        else:
            rating_category = "Excellent"

        whiskey_name = f"{brand} {age if age else 'NAS'} {cask} Cask"

        data.append([
            whiskey_name, brand, w_type, age, abv, region, cask,
            bottling, price, limited, release_year, avg_rating,
            awards, rating_category, category
        ])

    columns = [
        "whiskey_name", "brand", "type", "age", "abv", "region", "cask_type",
        "bottling_type", "retail_price_usd", "is_limited_edition",
        "release_year", "average_rating", "award_wins", "rating_category", "category"
    ]

    df = pd.DataFrame(data, columns=columns)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    return df

## Create, process and save dataset in CSV file

In [3]:
# generate dataset
df = generate_whiskey(5000)

# display df and check nulls
print("===== Displaying dataset =====")
display(df)

print("\n===== Nulls by column =====")
print(df.isnull().sum())

===== Displaying dataset =====


Unnamed: 0,whiskey_name,brand,type,age,abv,region,cask_type,bottling_type,retail_price_usd,is_limited_edition,release_year,average_rating,award_wins,rating_category,category
0,Jameson 10 Sherry Cask,Jameson,Japanese,10,55.0,Honshu,Sherry,Single Cask,276.70,False,1997,88.0,1,Medium,Exclusive
1,Macallan 4 Wine Cask,Macallan,Bourbon,4,49.3,Tennessee,Wine,Single Malt,239.32,False,2001,89.1,4,Medium,Exclusive
2,Jack Daniel's 30 Sherry Cask,Jack Daniel's,Japanese,30,55.5,Hokkaido,Sherry,Single Cask,413.26,False,2009,96.8,1,Excellent,Luxury
3,Highland Park 50 Tequila Cask,Highland Park,Scotch,50,69.5,Unknown,Tequila,Single Malt,52.27,False,2011,105.5,3,Excellent,Standard
4,Yamazaki 11 Sherry Cask,Yamazaki,Irish,11,47.0,Dublin,Sherry,Single Cask,161.22,False,1999,93.5,3,High,Exclusive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Ardbeg 11 Sherry Cask,Ardbeg,Scotch,11,46.2,Speyside,Sherry,Cask Strength,467.00,False,2020,91.9,3,High,Luxury
4996,Lagavulin 15 Wine Cask,Lagavulin,Irish,15,40.8,Cork,Wine,Single Malt,381.65,False,2007,87.7,0,Medium,Luxury
4997,Jameson 29 Bourbon Cask,Jameson,Rye,29,48.3,Canada,Bourbon,Single Malt,282.36,False,1999,95.6,0,Excellent,Luxury
4998,Lagavulin 9 Bourbon Cask,Lagavulin,Bourbon,9,48.4,Tennessee,Bourbon,Single Malt,141.69,False,2000,87.4,1,Medium,Exclusive



===== Nulls by column =====
whiskey_name          0
brand                 0
type                  0
age                   0
abv                   0
region                0
cask_type             0
bottling_type         0
retail_price_usd      0
is_limited_edition    0
release_year          0
average_rating        0
award_wins            0
rating_category       0
category              0
dtype: int64


In [4]:
# save dataset
filename = "whiskey_dataset.csv"
df.to_csv(filename, index=False)
print(f"\nDataset saved as {filename}")


Dataset saved as whiskey_dataset.csv


# Load and process dataset

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle

# attempt to load the CSV file
filename = "whiskey_dataset.csv"

try:
    df = pd.read_csv(filename)
    print("===== Dataset loaded successfully =====")
    # Display the first few rows
    display(df)
    
except FileNotFoundError:
    print(f"Error: The file '{filename}' was not found.")
except pd.errors.EmptyDataError:
    print(f"Error: The file '{filename}' is empty.")
except pd.errors.ParserError:
    print(f"Error: There was a problem parsing the file '{filename}'.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

# process dataset

# 1. apply label encoding in target df creation
label_encoder = LabelEncoder()
target_df_encoded = label_encoder.fit_transform(df['category'])

# 2. drop unneeded cols to create the features_df
features_df = df.drop(['category', 'whiskey_name'], axis=1)

# 3. apply One Hot to categorical independent columns

#     3.1 identify categorical and numerical columns
categorical_cols = features_df.select_dtypes(include=['object', 'bool']).columns
numeric_cols = features_df.select_dtypes(include=['int64', 'float64']).columns

#     3.2 apply One Hot to categorical columns
OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
OH_encoded = OH_encoder.fit_transform(features_df[categorical_cols])

OH_feature_names = OH_encoder.get_feature_names_out(categorical_cols)
OH_df = pd.DataFrame(OH_encoded, columns=OH_feature_names)

#     3.3 keep numerical columns
numeric_df = features_df[numeric_cols].reset_index(drop=True)

# 4. concat both dataframes
features_df = pd.concat([numeric_df, OH_df], axis=1)

# 5. save the feature columns (this includes both categorical ones and numerical ones)
feature_columns = list(numeric_cols) + list(OH_encoder.get_feature_names_out(categorical_cols))

# 6. normalize data, this is because SVM needs the data being normalized
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features_df)
normalized_df = pd.DataFrame(normalized_features, columns=features_df.columns)

# 7. add back target
normalized_df['category'] = target_df_encoded

# save utils for later
dirname = "model_files"
# label encoder
with open(f'{dirname}/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)
# OH encoded columns
with open(f'{dirname}/feature_columns.pkl', 'wb') as f:
    pickle.dump(feature_columns, f)
# OH Encoder
with open(f'{dirname}/one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(OH_encoder, f)
# Normalizer
with open(f'{dirname}/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# display normalized dataset
print("\n===== Processed Dataset =====")
display(normalized_df)

# split data and save
df_X_normalized = normalized_df.drop(columns=['category'])
df_Y = normalized_df[['category']]

df_X_normalized_train, df_X_normalized_test, df_Y_train, df_Y_test = train_test_split(df_X_normalized, df_Y, 
                                                                                      test_size=0.2, random_state=100)
with open(f'{dirname}/splited_data.pkl', 'wb') as file:
    pickle.dump((df_X_normalized_train, df_X_normalized_test, df_Y_train, df_Y_test), file)
print("\n===== Splited data saved =====")

===== Dataset loaded successfully =====


Unnamed: 0,whiskey_name,brand,type,age,abv,region,cask_type,bottling_type,retail_price_usd,is_limited_edition,release_year,average_rating,award_wins,rating_category,category
0,Jameson 10 Sherry Cask,Jameson,Japanese,10,55.0,Honshu,Sherry,Single Cask,276.70,False,1997,88.0,1,Medium,Exclusive
1,Macallan 4 Wine Cask,Macallan,Bourbon,4,49.3,Tennessee,Wine,Single Malt,239.32,False,2001,89.1,4,Medium,Exclusive
2,Jack Daniel's 30 Sherry Cask,Jack Daniel's,Japanese,30,55.5,Hokkaido,Sherry,Single Cask,413.26,False,2009,96.8,1,Excellent,Luxury
3,Highland Park 50 Tequila Cask,Highland Park,Scotch,50,69.5,Unknown,Tequila,Single Malt,52.27,False,2011,105.5,3,Excellent,Standard
4,Yamazaki 11 Sherry Cask,Yamazaki,Irish,11,47.0,Dublin,Sherry,Single Cask,161.22,False,1999,93.5,3,High,Exclusive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Ardbeg 11 Sherry Cask,Ardbeg,Scotch,11,46.2,Speyside,Sherry,Cask Strength,467.00,False,2020,91.9,3,High,Luxury
4996,Lagavulin 15 Wine Cask,Lagavulin,Irish,15,40.8,Cork,Wine,Single Malt,381.65,False,2007,87.7,0,Medium,Luxury
4997,Jameson 29 Bourbon Cask,Jameson,Rye,29,48.3,Canada,Bourbon,Single Malt,282.36,False,1999,95.6,0,Excellent,Luxury
4998,Lagavulin 9 Bourbon Cask,Lagavulin,Bourbon,9,48.4,Tennessee,Bourbon,Single Malt,141.69,False,2000,87.4,1,Medium,Exclusive



===== Processed Dataset =====


Unnamed: 0,age,abv,retail_price_usd,release_year,average_rating,award_wins,brand_Ardbeg,brand_Balvenie,brand_Buffalo Trace,brand_Glenfiddich,...,bottling_type_Cask Strength,bottling_type_Single Cask,bottling_type_Single Malt,is_limited_edition_False,is_limited_edition_True,rating_category_Excellent,rating_category_High,rating_category_Low,rating_category_Medium,category
0,0.20,0.559551,0.400107,0.200000,0.273810,0.111111,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
1,0.08,0.431461,0.341442,0.314286,0.317460,0.444444,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1
2,0.60,0.570787,0.614426,0.542857,0.623016,0.111111,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2
3,1.00,0.885393,0.047883,0.600000,0.968254,0.333333,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,4
4,0.22,0.379775,0.218871,0.257143,0.492063,0.333333,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.22,0.361798,0.698766,0.857143,0.428571,0.333333,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2
4996,0.30,0.240449,0.564817,0.485714,0.261905,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2
4997,0.58,0.408989,0.408990,0.257143,0.575397,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2
4998,0.18,0.411236,0.188220,0.285714,0.250000,0.111111,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1



===== Splited data saved =====


## Create dataset_infos.json 

In [6]:
from datasets import Dataset, DatasetDict, DatasetInfo
import pandas as pd
import json

dirname = "model_files"

# load splited dataset
with open(f'{dirname}/splited_data.pkl', 'rb') as file:
    df_X_normalized_train, df_X_normalized_test, df_Y_train, df_Y_test = pickle.load(file)

# combine X and Y
train_df = df_X_normalized_train.copy()
train_df['category'] = df_Y_train.squeeze().values

test_df = df_X_normalized_test.copy()
test_df['category'] = df_Y_test.squeeze().values

# create Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# create datasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# generate dataset_infos.json
info = DatasetInfo(
    description="Whiskey classification dataset with normalized features and encoded target category.",
    features=train_dataset.features,
)

# convert to a serializable dictionary manually
info_dict = {
    "description": info.description,
    "features": {key: str(val) for key, val in info.features.items()}  # Convert features to string to be JSON-serializable
}

# Save dataset_infos.json locally
info_path = f"{dirname}/dataset_infos.json"
with open(info_path, "w") as f:
    json.dump(info_dict, f, indent=2)

print(f"dataset_infos.json saved at: {info_path}")

  from .autonotebook import tqdm as notebook_tqdm


dataset_infos.json saved at: model_files/dataset_infos.json


# Train model

## Evaluate objective

The objective is to correct **clasify different whiskeys** into their appropiate category **based on various features**.

## Model type decision

The chosen model type is the _**SVM with Linear Kernel**_ <br>

This is because, after various trainments using diferent models (_SVM Linear_, _SVM RBF_, _KNN_, _Decision tree_ and _Random forest_), the model _**SVM with Linear Kernel**_ has: <br>
<ul>
    <li>Accuracy: 84%</li>
    <li>Macro F1 Scores: 0.83</li>
    <li>F1 Scores per Class: 0.84, 0.92, 0.98, 0.71, 0.70</li>
    <li>Balances both high accuracy and strong performance across all classes</li>
</ul>

## Train process

### Load splited data

In [7]:
import pickle

dirname = "model_files"
with open(f'{dirname}/splited_data.pkl', 'rb') as file:
    df_X_normalized_train, df_X_normalized_test, df_Y_train, df_Y_test = pickle.load(file)

# Confirm data is loaded
print("===== Splitted data loaded successfully =====")
print(f"Train X shape: {df_X_normalized_train.shape}")
print(f"Test X shape: {df_X_normalized_test.shape}")
print(f"Train Y shape: {df_Y_train.shape}")
print(f"Test Y shape: {df_Y_test.shape}")

===== Splitted data loaded successfully =====
Train X shape: (4000, 50)
Test X shape: (1000, 50)
Train Y shape: (4000, 1)
Test Y shape: (1000, 1)


In [8]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model
import matplotlib.pyplot as pl
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

'''
KERNEL = linear
REGULARIZATION PARAM = 50 -> C=50
'''

# algorithm method creation
svm_classifier = SVC(kernel='linear', C=50)

# train model with the classifier method
svm_model = svm_classifier.fit(df_X_normalized_train, df_Y_train['category'])

# predict data
predicted_Y_data = svm_model.predict(df_X_normalized_test)

print("\n******************")
print("Evaluation metrics")
print("******************\n")

# confusion matrix
cm = confusion_matrix(df_Y_test,predicted_Y_data)
print(f"\nConfussion matrix: \n{cm}")

# Accuracy
e = accuracy_score(df_Y_test,predicted_Y_data)
print(f"\nAccuracy: {e}")

# precission
p = precision_score(df_Y_test,predicted_Y_data, average=None, zero_division=0)
print(f"\nPrecission: {p}")

# sensitivity
s = recall_score(df_Y_test,predicted_Y_data, average=None, zero_division='warn')
print(f"\nSensitivity: {s}")

# f1 score
f1 = f1_score(df_Y_test,predicted_Y_data, average=None, zero_division=0)
print(f"\nF1 score: {f1}")

# add in dict for output
if 'models_metrics_outputs' in globals():
    models_metrics_outputs['SVM: Linear Kernel'] = {
        'confussion matrix' : cm,
        'accuracy' : e,
        'precission' : p,
        'sensitivity' : s,
        'f1' : f1
    }
    print("Metrics loaded to compare")


******************
Evaluation metrics
******************


Confussion matrix: 
[[ 36   0   0   0   2]
 [  0 296  12   3   0]
 [  1   2 501   2   3]
 [  0   7   0  66   2]
 [  2   0   0   6  59]]

Accuracy: 0.958

Precission: [0.92307692 0.9704918  0.97660819 0.85714286 0.89393939]

Sensitivity: [0.94736842 0.95176849 0.98428291 0.88       0.88059701]

F1 score: [0.93506494 0.96103896 0.98043053 0.86842105 0.88721805]


### Save model

In [9]:
modelname = "whiskey_classificator_model.pkl"
try:
    with open(modelname, 'wb') as file:
        pickle.dump(svm_model, file)

    print(f"Model saved as {modelname}")
except Exception as e:
    print(f"An error occurred while saving the model: {e}")

Model saved as whiskey_classificator_model.pkl


# Test Model

## Only 1 test

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd

# load processors
dirname = "model_files"
with open(f'{dirname}/label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)
with open(f'{dirname}/one_hot_encoder.pkl', 'rb') as f:
    OH_encoder = pickle.load(f)
with open(f'{dirname}/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open(f'{dirname}/feature_columns.pkl', 'rb') as f:
    feature_columns = pickle.load(f)

# generate new dataset
df = generate_whiskey(num_rows=500)

# separate target
target_df = df['category'].reset_index(drop=True)
# drop uneeded columns
features_df = df.drop(['category', 'whiskey_name'], axis=1)

# identify categorical and numerical columns
categorical_cols = features_df.select_dtypes(include=['object', 'bool']).columns
numeric_cols = features_df.select_dtypes(include=['int64', 'float64']).columns

# apply One Hot to categorical columns
OH_encoded = OH_encoder.transform(features_df[categorical_cols])  # Use transform, no fit
OH_feature_names = OH_encoder.get_feature_names_out(categorical_cols)
OH_df = pd.DataFrame(OH_encoded, columns=OH_feature_names)

# keep numerical columns
numeric_df = features_df[numeric_cols].reset_index(drop=True)

# concat both dataframes
features_df = pd.concat([numeric_df, OH_df], axis=1)

# get the missing columns in new dataset using the list saved
missing_cols = set(feature_columns) - set(features_df.columns)
for col in missing_cols:
    features_df[col] = 0   # add columns with value = 0

# re order the columns
features_df = features_df[feature_columns]

# normalize using the scaler loaded (use transform, not fit_transform)
normalized_features = scaler.transform(features_df)
normalized_df = pd.DataFrame(normalized_features, columns=feature_columns)

# add back encoded target
normalized_df['category'] = target_df.reset_index(drop=True)
# apply label encoder to target column
normalized_df['category'] = label_encoder.transform(normalized_df['category'])

# display
print("\n========== NORMALIZED DATASET ==========")
display(normalized_df)



# load model
with open('whiskey_classificator_model.pkl', 'rb') as f:
    svm_model = pickle.load(f)
    
print("Model has been loaded")

# drop category before predict
normalized_df = normalized_df.drop(['category'], axis=1)

# predict with normalized_df
predictions = svm_model.predict(normalized_df)

# create a df with the results
result_df = df.copy()

result_df['Predicted category'] = predictions

# apply label encoder to target column
result_df['category'] = label_encoder.transform(result_df['category'])

# display predictions vs real
print("\n========== Predictions result comparison ==========")
display(result_df)




Unnamed: 0,age,abv,retail_price_usd,release_year,average_rating,award_wins,brand_Ardbeg,brand_Balvenie,brand_Buffalo Trace,brand_Glenfiddich,...,bottling_type_Cask Strength,bottling_type_Single Cask,bottling_type_Single Malt,is_limited_edition_False,is_limited_edition_True,rating_category_Excellent,rating_category_High,rating_category_Low,rating_category_Medium,category
0,0.30,0.483146,0.137088,0.114286,0.472222,0.222222,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,3
1,0.46,0.438202,0.406180,0.514286,0.519841,0.222222,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2
2,0.60,0.474157,0.367887,0.257143,0.579365,0.000000,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1
3,0.22,0.449438,0.076823,0.657143,0.361111,0.000000,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,4
4,0.20,0.566292,0.575285,0.257143,0.281746,0.000000,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.42,0.582022,0.381352,0.685714,0.424603,0.111111,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
496,0.58,0.287640,0.154729,0.171429,0.492063,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3
497,0.14,0.570787,0.318717,0.942857,0.305556,0.111111,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
498,0.00,0.269663,0.076587,0.571429,0.190476,0.111111,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,4


Model has been loaded



Unnamed: 0,whiskey_name,brand,type,age,abv,region,cask_type,bottling_type,retail_price_usd,is_limited_edition,release_year,average_rating,award_wins,rating_category,category,Predicted category
0,Jack Daniel's 15 Rum Cask,Jack Daniel's,Bourbon,15,51.6,Tennessee,Rum,Single Malt,109.11,False,1994,93.0,2,High,3,3
1,Jack Daniel's 23 Sherry Cask,Jack Daniel's,Rye,23,49.6,Canada,Sherry,Single Cask,280.57,False,2008,94.2,2,High,2,1
2,Ardbeg 30 Tequila Cask,Ardbeg,Rye,30,51.2,USA,Tequila,Single Cask,256.17,False,1999,95.7,0,Excellent,1,1
3,Lagavulin 11 Rum Cask,Lagavulin,Bourbon,11,50.1,Kentucky,Rum,Cask Strength,70.71,False,2013,90.2,0,High,4,4
4,Highland Park 10 Sherry Cask,Highland Park,Scotch,10,55.3,Speyside,Sherry,Single Cask,388.32,False,1999,88.2,0,Medium,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Lagavulin 21 Tequila Cask,Lagavulin,Rye,21,56.0,USA,Tequila,Cask Strength,264.75,False,2014,91.8,1,High,1,1
496,Lagavulin 29 Tequila Cask,Lagavulin,Rye,29,42.9,USA,Tequila,Blended,120.35,False,1996,93.5,0,High,3,1
497,Buffalo Trace 7 Tequila Cask,Buffalo Trace,Japanese,7,55.5,Hokkaido,Tequila,Single Cask,224.84,False,2023,88.8,1,Medium,1,1
498,Buffalo Trace NAS Port Cask,Buffalo Trace,Irish,0,42.1,Cork,Port,Cask Strength,70.56,False,2010,85.9,1,Medium,4,4


## 10 tests with different datasets

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd
import random

# load processors
dirname = "model_files"
with open(f'{dirname}/label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)
with open(f'{dirname}/one_hot_encoder.pkl', 'rb') as f:
    OH_encoder = pickle.load(f)
with open(f'{dirname}/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open(f'{dirname}/feature_columns.pkl', 'rb') as f:
    feature_columns = pickle.load(f)

# load model
with open('whiskey_classificator_model.pkl', 'rb') as f:
    svm_model = pickle.load(f)
    
print("Model has been loaded")


n_tests = 100
accuracy_results = {}
accuracies = []
for i in range(0, n_tests):

    # generate new dataset
    nr = random.randint(500, 50000)
    df = generate_whiskey(num_rows=nr)

    # separate target
    target_df = df['category'].reset_index(drop=True)
    # drop uneeded columns
    features_df = df.drop(['category', 'whiskey_name'], axis=1)
    
    # identify categorical and numerical columns
    categorical_cols = features_df.select_dtypes(include=['object', 'bool']).columns
    numeric_cols = features_df.select_dtypes(include=['int64', 'float64']).columns
    
    # apply One Hot to categorical columns
    OH_encoded = OH_encoder.transform(features_df[categorical_cols])  # Use transform, no fit
    OH_feature_names = OH_encoder.get_feature_names_out(categorical_cols)
    OH_df = pd.DataFrame(OH_encoded, columns=OH_feature_names)
    
    # keep numerical columns
    numeric_df = features_df[numeric_cols].reset_index(drop=True)
    
    # concat both dataframes
    features_df = pd.concat([numeric_df, OH_df], axis=1)
    
    # get the missing columns in new dataset using the list saved
    missing_cols = set(feature_columns) - set(features_df.columns)
    for col in missing_cols:
        features_df[col] = 0   # add columns with value = 0
    
    # re order the columns
    features_df = features_df[feature_columns]
    
    # normalize using the scaler loaded (use transform, not fit_transform)
    normalized_features = scaler.transform(features_df)
    normalized_df = pd.DataFrame(normalized_features, columns=feature_columns)
    
    # add back encoded target
    normalized_df['category'] = target_df.reset_index(drop=True)
    # apply label encoder to target column
    normalized_df['category'] = label_encoder.transform(normalized_df['category'])

    # drop category before predict
    normalized_df = normalized_df.drop(['category'], axis=1)
    
    # predict with normalized_df
    predictions = svm_model.predict(normalized_df)
    
    # create a df with the results
    result_df = df.copy()
    
    result_df['Predicted category'] = predictions
    
    # apply label encoder to target column
    result_df['category'] = label_encoder.transform(result_df['category'])

    
    # calculate the accuracy (prediction rate)
    accuracy = accuracy_score(result_df['category'], result_df['Predicted category'])

    # save to results dict
    accuracy_results[f"test {i+1}"] = {
        "Rows": nr,
        "Accuracy": f"{accuracy * 100:.2f}%"
    }
    accuracies.append(accuracy)

# show the results
results_df = pd.DataFrame.from_dict(accuracy_results, orient='index')
print("\nSummary Table:")
display(results_df)

# show average accuracy
mean_accuracy = np.mean(accuracies)
print(f"\nAverage Accuracy across {n_tests} tests: {mean_accuracy * 100:.2f}%")

Model has been loaded

Summary Table:


Unnamed: 0,Rows,Accuracy
test 1,31049,95.86%
test 2,14671,95.62%
test 3,46757,95.49%
test 4,31354,95.61%
test 5,33051,95.75%
...,...,...
test 96,48551,95.58%
test 97,8528,95.34%
test 98,31297,95.65%
test 99,31909,95.63%



Average Accuracy across 100 tests: 95.65%


# EXPORT MODEL TO DOWNLOADS FOLDER

In [None]:
%cp "svm_model.pkl" "/home/$USER/Downloads/whiskey_classificator_model.pkl"