# Data creation

## Installing the required libraries 

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Function to create the dataset

In [11]:
import numpy as np
import pandas as pd
import random
    

def generate_whiskey(num_rows=500, seed=None):
    """
    Generate a balanced DataFrame with whiskey data across all price categories.

    Rows are split as evenly as possible between the five price categories
    (the first ``num_rows % 5`` categories receive one extra row). Only
    ``retail_price_usd`` is sampled from the category's price range; every
    other attribute is drawn without reference to the category. The rows are
    shuffled with a fixed ``random_state`` before being returned.

    Parameters:
    - num_rows: int — Number of rows to generate (default 500).
    - seed: int | None — Optional seed. When given, private random generators
      are used, so the same seed always yields the same dataset and the global
      ``random`` / ``np.random`` state is left untouched. When None (default),
      the global generators are used, exactly as before.

    Returns:
    - pd.DataFrame — Whiskey dataset.
    """

    # random sources: private and seeded when a seed is given, global otherwise
    if seed is None:
        py_rng, np_rng = random, np.random
    else:
        py_rng, np_rng = random.Random(seed), np.random.RandomState(seed)

    # constants
    brands = ["Macallan", "Glenfiddich", "Yamazaki", "Lagavulin", "Jack Daniel's",
              "Buffalo Trace", "Balvenie", "Ardbeg", "Jameson", "Highland Park"]
    types = ["Scotch", "Bourbon", "Rye", "Japanese", "Irish"]
    regions = {
        "Scotch": ["Islay", "Speyside", "Highlands", "Lowlands"],
        "Bourbon": ["Kentucky", "Tennessee"],
        "Rye": ["Canada", "USA"],
        "Japanese": ["Honshu", "Hokkaido"],
        "Irish": ["Dublin", "Cork"]
    }
    cask_types = ["Sherry", "Bourbon", "Port", "Wine", "Rum"]
    bottling_types = ["Single Malt", "Blended", "Single Cask", "Cask Strength"]

    # age distribution: 0 means "no age statement" (10% of rows), the remaining
    # 90% is spread uniformly over 3..30 years
    age_values = [0, *range(3, 31)]
    age_probs = [0.1] + [0.9 / 28] * 28

    # category definitions (linked to price): name -> (min price, max price)
    category_definitions = {
        "Basic": (25, 49),
        "Standard": (50, 88),
        "Premium": (89, 128),
        "Exclusive": (129, 278),
        "Luxury": (279, 500)
    }

    categories = list(category_definitions.keys())
    num_classes = len(categories)
    per_class = num_rows // num_classes
    remainder = num_rows % num_classes

    data = []

    for i, category in enumerate(categories):
        # the first `remainder` categories absorb the rows left over by the division
        count = per_class + (1 if i < remainder else 0)
        price_min, price_max = category_definitions[category]

        for _ in range(count):
            brand = py_rng.choice(brands)
            w_type = py_rng.choice(types)
            region = py_rng.choice(regions[w_type])
            # plain int/bool instead of numpy scalars keeps the column dtypes stable
            age = int(np_rng.choice(age_values, p=age_probs))
            abv = round(py_rng.uniform(40, 60), 1)
            cask = py_rng.choice(cask_types)
            bottling = py_rng.choice(bottling_types)
            limited = bool(np_rng.rand() < 0.15)
            release_year = py_rng.randint(1990, 2025)
            awards = int(np_rng.poisson(1.5))
            # rating grows with age and awards; it is not capped, so values above 100 can occur
            avg_rating = round(np_rng.normal(85 + (age / 30) * 10 + awards, 3), 1)
            price = round(py_rng.uniform(price_min, price_max), 2)

            # rating category (ordinal)
            if avg_rating < 85:
                rating_category = "Low"
            elif avg_rating < 90:
                rating_category = "Medium"
            elif avg_rating < 95:
                rating_category = "High"
            else:
                # spelling kept as-is: encoded column names downstream
                # (e.g. rating_category_Excelent) are derived from this exact string
                rating_category = "Excelent"

            whiskey_name = f"{brand} {age if age else 'NAS'} {cask} Cask"

            data.append([
                whiskey_name, brand, w_type, age, abv, region, cask,
                bottling, price, limited, release_year, avg_rating,
                awards, rating_category, category
            ])

    columns = [
        "whiskey_name", "brand", "type", "age", "abv", "region", "cask_type",
        "bottling_type", "retail_price_usd", "is_limited_edition",
        "release_year", "average_rating", "award_wins", "rating_category", "category"
    ]

    df = pd.DataFrame(data, columns=columns)
    # rows were appended category by category; shuffle them (fixed random_state)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    return df

## Create, process and save dataset in CSV file

In [12]:
# build the 5000-row dataset used in the rest of the notebook
df = generate_whiskey(num_rows=5000)

# show the generated frame
print("===== Displaying dataset =====")
display(df)

# count the missing values per column
null_counts = df.isnull().sum()
print("\n===== Nulls by column =====")
print(null_counts)

===== Displaying dataset =====


Unnamed: 0,whiskey_name,brand,type,age,abv,region,cask_type,bottling_type,retail_price_usd,is_limited_edition,release_year,average_rating,award_wins,rating_category,category
0,Balvenie 8 Wine Cask,Balvenie,Scotch,8,51.3,Highlands,Wine,Cask Strength,51.40,False,2009,84.9,1,Low,Standard
1,Jack Daniel's 21 Rum Cask,Jack Daniel's,Rye,21,46.2,Canada,Rum,Single Cask,96.49,False,2016,99.5,2,Excelent,Premium
2,Lagavulin 5 Bourbon Cask,Lagavulin,Irish,5,51.6,Dublin,Bourbon,Cask Strength,100.74,False,2015,83.1,1,Low,Premium
3,Lagavulin 29 Rum Cask,Lagavulin,Rye,29,49.9,USA,Rum,Blended,50.05,False,1997,98.8,1,Excelent,Standard
4,Macallan 25 Rum Cask,Macallan,Rye,25,43.2,Canada,Rum,Single Cask,48.94,False,2012,95.2,1,Excelent,Basic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Ardbeg 14 Bourbon Cask,Ardbeg,Rye,14,49.0,Canada,Bourbon,Single Malt,338.84,False,2025,94.7,1,High,Luxury
4996,Highland Park 9 Wine Cask,Highland Park,Japanese,9,51.2,Hokkaido,Wine,Blended,41.67,False,2021,87.4,1,Medium,Basic
4997,Glenfiddich 28 Port Cask,Glenfiddich,Japanese,28,57.7,Honshu,Port,Single Cask,262.86,False,2013,94.8,2,High,Exclusive
4998,Macallan 28 Bourbon Cask,Macallan,Rye,28,54.6,USA,Bourbon,Blended,249.43,True,2024,102.0,2,Excelent,Exclusive



===== Nulls by column =====
whiskey_name          0
brand                 0
type                  0
age                   0
abv                   0
region                0
cask_type             0
bottling_type         0
retail_price_usd      0
is_limited_edition    0
release_year          0
average_rating        0
award_wins            0
rating_category       0
category              0
dtype: int64


In [13]:
# write the generated dataset to a CSV file, leaving the index column out
filename = "whiskey_dataset.csv"
df.to_csv(path_or_buf=filename, index=False)
print(f"\nDataset saved as {filename}")


Dataset saved as whiskey_dataset.csv


# Load and process dataset

In [14]:
import os
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# attempt to load the CSV file
filename = "whiskey_dataset.csv"

# Print a readable message for the known failure modes, then re-raise: every
# step below needs `df`, so carrying on would either fail with a NameError or
# silently process a stale `df` left in the kernel by an earlier cell.
try:
    df = pd.read_csv(filename)
except FileNotFoundError:
    print(f"Error: The file '{filename}' was not found.")
    raise
except pd.errors.EmptyDataError:
    print(f"Error: The file '{filename}' is empty.")
    raise
except pd.errors.ParserError:
    print(f"Error: There was a problem parsing the file '{filename}'.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    print("===== Dataset loaded successfully =====")
    # Display the loaded dataset
    display(df)

# process dataset

# 1. apply label encoding in target df creation
label_encoder = LabelEncoder()
target_df_encoded = label_encoder.fit_transform(df['category'])

# 2. drop unneeded cols to create the features_df
features_df = df.drop(['category', 'whiskey_name'], axis=1)

# 3. apply One Hot to categorical independent columns

#     3.1 identify categorical and numerical columns
categorical_cols = features_df.select_dtypes(include=['object', 'bool']).columns
numeric_cols = features_df.select_dtypes(include=['int64', 'float64']).columns

#     3.2 apply One Hot to categorical columns
#         (handle_unknown='ignore': categories unseen at fit time encode as all zeros)
OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
OH_encoded = OH_encoder.fit_transform(features_df[categorical_cols])

OH_feature_names = OH_encoder.get_feature_names_out(categorical_cols)
OH_df = pd.DataFrame(OH_encoded, columns=OH_feature_names)

#     3.3 keep numerical columns
numeric_df = features_df[numeric_cols].reset_index(drop=True)

# 4. concat both dataframes
features_df = pd.concat([numeric_df, OH_df], axis=1)

# 5. save the feature columns (numerical ones first, then the one-hot ones);
#    this is the column order the scaler and the model are fitted on
feature_columns = list(numeric_cols) + list(OH_feature_names)

# 6. normalize data, this is because SVM needs the data being normalized
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test rows influence the scaling ranges — consider fitting on
# the training split only.
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features_df)
normalized_df = pd.DataFrame(normalized_features, columns=features_df.columns)

# 7. add back target
normalized_df['category'] = target_df_encoded

# save utils for later
dirname = "model_files"
# make sure the output directory exists; open(..., 'wb') does not create it
os.makedirs(dirname, exist_ok=True)
# label encoder
with open(f'{dirname}/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)
# OH encoded columns
with open(f'{dirname}/feature_columns.pkl', 'wb') as f:
    pickle.dump(feature_columns, f)
# OH Encoder
with open(f'{dirname}/one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(OH_encoder, f)
# Normalizer
with open(f'{dirname}/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# display normalized dataset
print("\n===== Processed Dataset =====")
display(normalized_df)

# split data and save (80% train / 20% test, fixed random_state for a repeatable split)
df_X_normalized = normalized_df.drop(columns=['category'])
df_Y = normalized_df[['category']]

df_X_normalized_train, df_X_normalized_test, df_Y_train, df_Y_test = train_test_split(df_X_normalized, df_Y,
                                                                                      test_size=0.2, random_state=100)
with open(f'{dirname}/splited_data.pkl', 'wb') as file:
    pickle.dump((df_X_normalized_train, df_X_normalized_test, df_Y_train, df_Y_test), file)
print("\n===== Splited data saved =====")

===== Dataset loaded successfully =====


Unnamed: 0,whiskey_name,brand,type,age,abv,region,cask_type,bottling_type,retail_price_usd,is_limited_edition,release_year,average_rating,award_wins,rating_category,category
0,Balvenie 8 Wine Cask,Balvenie,Scotch,8,51.3,Highlands,Wine,Cask Strength,51.40,False,2009,84.9,1,Low,Standard
1,Jack Daniel's 21 Rum Cask,Jack Daniel's,Rye,21,46.2,Canada,Rum,Single Cask,96.49,False,2016,99.5,2,Excelent,Premium
2,Lagavulin 5 Bourbon Cask,Lagavulin,Irish,5,51.6,Dublin,Bourbon,Cask Strength,100.74,False,2015,83.1,1,Low,Premium
3,Lagavulin 29 Rum Cask,Lagavulin,Rye,29,49.9,USA,Rum,Blended,50.05,False,1997,98.8,1,Excelent,Standard
4,Macallan 25 Rum Cask,Macallan,Rye,25,43.2,Canada,Rum,Single Cask,48.94,False,2012,95.2,1,Excelent,Basic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Ardbeg 14 Bourbon Cask,Ardbeg,Rye,14,49.0,Canada,Bourbon,Single Malt,338.84,False,2025,94.7,1,High,Luxury
4996,Highland Park 9 Wine Cask,Highland Park,Japanese,9,51.2,Hokkaido,Wine,Blended,41.67,False,2021,87.4,1,Medium,Basic
4997,Glenfiddich 28 Port Cask,Glenfiddich,Japanese,28,57.7,Honshu,Port,Single Cask,262.86,False,2013,94.8,2,High,Exclusive
4998,Macallan 28 Bourbon Cask,Macallan,Rye,28,54.6,USA,Bourbon,Blended,249.43,True,2024,102.0,2,Excelent,Exclusive



===== Processed Dataset =====


Unnamed: 0,age,abv,retail_price_usd,release_year,average_rating,award_wins,brand_Ardbeg,brand_Balvenie,brand_Buffalo Trace,brand_Glenfiddich,...,bottling_type_Cask Strength,bottling_type_Single Cask,bottling_type_Single Malt,is_limited_edition_False,is_limited_edition_True,rating_category_Excelent,rating_category_High,rating_category_Low,rating_category_Medium,category
0,0.266667,0.565,0.055479,0.542857,0.328173,0.142857,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,4
1,0.700000,0.310,0.150488,0.742857,0.780186,0.285714,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,3
2,0.166667,0.580,0.159443,0.714286,0.272446,0.142857,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3
3,0.966667,0.495,0.052635,0.200000,0.758514,0.142857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,4
4,0.833333,0.160,0.050296,0.628571,0.647059,0.142857,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.466667,0.450,0.661139,1.000000,0.631579,0.142857,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,2
4996,0.300000,0.560,0.034978,0.885714,0.405573,0.142857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
4997,0.933333,0.885,0.501043,0.657143,0.634675,0.285714,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
4998,0.933333,0.730,0.472745,0.971429,0.857585,0.285714,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1



===== Splited data saved =====


## Create dataset_infos.json 

In [6]:
import json
import pickle

import pandas as pd
from datasets import Dataset, DatasetDict, DatasetInfo

dirname = "model_files"

# load splited dataset (pickle written by the preprocessing cell of this notebook)
with open(f'{dirname}/splited_data.pkl', 'rb') as file:
    df_X_normalized_train, df_X_normalized_test, df_Y_train, df_Y_test = pickle.load(file)

# combine X and Y
train_df = df_X_normalized_train.copy()
train_df['category'] = df_Y_train.squeeze().values

test_df = df_X_normalized_test.copy()
test_df['category'] = df_Y_test.squeeze().values

# create Hugging Face Datasets
# preserve_index=False: the split frames carry a shuffled, non-range index that
# would otherwise be stored as an extra "__index_level_0__" feature and end up
# in dataset_infos.json
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# create datasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# generate dataset_infos.json
info = DatasetInfo(
    description="Whiskey classification dataset with normalized features and encoded target category.",
    features=train_dataset.features,
)

# convert to a serializable dictionary manually
info_dict = {
    "description": info.description,
    "features": {key: str(val) for key, val in info.features.items()}  # Convert features to string to be JSON-serializable
}

# Save dataset_infos.json locally
info_path = f"{dirname}/dataset_infos.json"
with open(info_path, "w") as f:
    json.dump(info_dict, f, indent=2)

print(f"dataset_infos.json saved at: {info_path}")

  from .autonotebook import tqdm as notebook_tqdm


dataset_infos.json saved at: model_files/dataset_infos.json


# Train model

## Evaluate objective

The objective is to correctly **classify different whiskeys** into their appropriate category **based on various features**.

## Model type decision

The chosen model type is the _**SVM with Linear Kernel**_ <br>

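This is because, after several training runs using different models (_SVM Linear_, _SVM RBF_, _KNN_, _Decision tree_ and _Random forest_), the model _**SVM with Linear Kernel**_ has: <br>
<ul>
    <li>Accuracy: 84%</li>
    <li>Macro F1 Score: 0.83</li>
    <li>F1 Scores per Class: 0.84, 0.92, 0.98, 0.71, 0.70</li>
    <li>Balances both high accuracy and strong performance across all classes</li>
</ul>

_Note: the training run recorded in the "Train process" section below reports an accuracy of 0.977 on its test split, so the figures above appear to come from an earlier comparison run — TODO confirm and update._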

## Train process

### Load split data

In [16]:
import pickle

# read back the pickled (X_train, X_test, Y_train, Y_test) tuple
dirname = "model_files"
with open(f'{dirname}/splited_data.pkl', 'rb') as file:
    loaded_split = pickle.load(file)
df_X_normalized_train, df_X_normalized_test, df_Y_train, df_Y_test = loaded_split

# Confirm data is loaded by reporting the shape of each part
print("===== Splitted data loaded successfully =====")
for label, frame in (
    ("Train X", df_X_normalized_train),
    ("Test X", df_X_normalized_test),
    ("Train Y", df_Y_train),
    ("Test Y", df_Y_test),
):
    print(f"{label} shape: {frame.shape}")

===== Splitted data loaded successfully =====
Train X shape: (4000, 48)
Test X shape: (1000, 48)
Train Y shape: (4000, 1)
Test Y shape: (1000, 1)


In [17]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model
import matplotlib.pyplot as pl
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

'''
KERNEL = linear
REGULARIZATION PARAM = 50 -> C=50
'''

# build the classifier: linear kernel, regularization parameter C=50
svm_classifier = SVC(C=50, kernel='linear')

# fit on the training split, using the 'category' column as target
svm_model = svm_classifier.fit(X=df_X_normalized_train, y=df_Y_train['category'])

# predict the test split
predicted_Y_data = svm_model.predict(df_X_normalized_test)

# banner, printed as one call with newline separators
print("\n******************", "Evaluation metrics", "******************\n", sep="\n")

# confusion matrix
cm = confusion_matrix(y_true=df_Y_test, y_pred=predicted_Y_data)
print(f"\nConfussion matrix: \n{cm}")

# Accuracy
e = accuracy_score(y_true=df_Y_test, y_pred=predicted_Y_data)
print(f"\nAccuracy: {e}")

# per-class precision (average=None returns one value per class)
p = precision_score(y_true=df_Y_test, y_pred=predicted_Y_data, average=None, zero_division=0)
print(f"\nPrecission: {p}")

# per-class sensitivity (recall)
s = recall_score(y_true=df_Y_test, y_pred=predicted_Y_data, average=None, zero_division='warn')
print(f"\nSensitivity: {s}")

# per-class f1 score
f1 = f1_score(y_true=df_Y_test, y_pred=predicted_Y_data, average=None, zero_division=0)
print(f"\nF1 score: {f1}")

# collect the metrics; they are only registered when a comparison dict
# named models_metrics_outputs already exists in the notebook namespace
linear_svm_metrics = {
    'confussion matrix' : cm,
    'accuracy' : e,
    'precission' : p,
    'sensitivity' : s,
    'f1' : f1
}
if 'models_metrics_outputs' in globals():
    models_metrics_outputs['SVM: Linear Kernel'] = linear_svm_metrics
    print("Metrics loaded to compare")


******************
Evaluation metrics
******************


Confussion matrix: 
[[193   0   0   0   1]
 [  0 218   1   5   0]
 [  0   2 191   0   0]
 [  0   0   0 191   4]
 [  7   0   0   3 184]]

Accuracy: 0.977

Precission: [0.965      0.99090909 0.99479167 0.95979899 0.97354497]

Sensitivity: [0.99484536 0.97321429 0.98963731 0.97948718 0.94845361]

F1 score: [0.97969543 0.98198198 0.99220779 0.96954315 0.96083551]


### Save model

In [18]:
# pickle the trained classifier next to the notebook
modelname = "svm_model.pkl"
try:
    with open(modelname, 'wb') as file:
        pickle.dump(svm_model, file)
except Exception as e:
    # report the failure instead of stopping the notebook
    print(f"An error occurred while saving the model: {e}")
else:
    print(f"Model saved as {modelname}")

Model saved as svm_model.pkl


# Test Model

## Only 1 test

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd

# load processors (fitted and pickled by the preprocessing cell)
dirname = "model_files"
with open(f'{dirname}/label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)
with open(f'{dirname}/one_hot_encoder.pkl', 'rb') as f:
    OH_encoder = pickle.load(f)
with open(f'{dirname}/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open(f'{dirname}/feature_columns.pkl', 'rb') as f:
    feature_columns = pickle.load(f)

# generate new dataset
df = generate_whiskey(num_rows=500)

# separate target
target_df = df['category'].reset_index(drop=True)
# drop uneeded columns
features_df = df.drop(['category', 'whiskey_name'], axis=1)

# identify categorical and numerical columns
categorical_cols = features_df.select_dtypes(include=['object', 'bool']).columns
numeric_cols = features_df.select_dtypes(include=['int64', 'float64']).columns

# apply One Hot to categorical columns
OH_encoded = OH_encoder.transform(features_df[categorical_cols])  # Use transform, no fit
OH_feature_names = OH_encoder.get_feature_names_out(categorical_cols)
OH_df = pd.DataFrame(OH_encoded, columns=OH_feature_names)

# keep numerical columns
numeric_df = features_df[numeric_cols].reset_index(drop=True)

# concat both dataframes
features_df = pd.concat([numeric_df, OH_df], axis=1)

# get the missing columns in new dataset using the list saved
missing_cols = set(feature_columns) - set(features_df.columns)
for col in missing_cols:
    features_df[col] = 0   # add columns with value = 0

# re order the columns so they match the order used when fitting
features_df = features_df[feature_columns]

# normalize using the scaler loaded (use transform, not fit_transform)
normalized_features = scaler.transform(features_df)
normalized_df = pd.DataFrame(normalized_features, columns=feature_columns)

# add back encoded target
normalized_df['category'] = target_df.reset_index(drop=True)
# apply label encoder to target column
normalized_df['category'] = label_encoder.transform(normalized_df['category'])

# display
print("\n========== NORMALIZED DATASET ==========")
display(normalized_df)



# load model: the file written by the "Save model" cell of this notebook
with open('svm_model.pkl', 'rb') as f:
    svm_model = pickle.load(f)

print("Model has been loaded")

# drop category before predict
normalized_df = normalized_df.drop(['category'], axis=1)

# predict with normalized_df
predictions = svm_model.predict(normalized_df)

# create a df with the results
result_df = df.copy()

# predictions are already integer class codes (the model was trained on the
# label-encoded target), so only the true labels need encoding
result_df['Predicted category'] = predictions

# apply label encoder to target column
result_df['category'] = label_encoder.transform(result_df['category'])

# display predictions vs real
print("\n========== Predictions result comparison ==========")
display(result_df)

# calculate the accuracy (prediction rate)
accuracy = accuracy_score(result_df['category'], result_df['Predicted category'])
print(f"\nAccuracy (Prediction Rate): {accuracy * 100:.2f}%")




Unnamed: 0,age,abv,retail_price_usd,release_year,average_rating,award_wins,brand_Ardbeg,brand_Balvenie,brand_Buffalo Trace,brand_Glenfiddich,...,bottling_type_Cask Strength,bottling_type_Single Cask,bottling_type_Single Malt,is_limited_edition_False,is_limited_edition_True,rating_category_Excelent,rating_category_High,rating_category_Low,rating_category_Medium,category
0,0.966667,0.965,0.368149,0.857143,0.696594,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1
1,0.300000,0.930,0.036726,0.657143,0.312693,0.000000,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
2,0.600000,0.065,0.405192,0.485714,0.594427,0.142857,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
3,0.000000,0.515,0.077372,0.828571,0.362229,0.142857,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,4
4,0.000000,0.630,0.075181,0.171429,0.442724,0.142857,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.933333,0.205,0.103458,0.885714,0.603715,0.142857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,4
496,0.233333,0.385,0.192945,0.028571,0.365325,0.142857,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3
497,0.700000,0.880,0.352241,0.885714,0.616099,0.285714,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
498,0.566667,0.330,0.866748,0.742857,0.517028,0.285714,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2


Model has been loaded



Unnamed: 0,whiskey_name,brand,type,age,abv,region,cask_type,bottling_type,retail_price_usd,is_limited_edition,release_year,average_rating,award_wins,rating_category,category,Predicted category
0,Jack Daniel's 29 Rum Cask,Jack Daniel's,Scotch,29,59.3,Speyside,Rum,Single Malt,199.79,False,2020,96.8,0,Excelent,1,1
1,Glenfiddich 9 Port Cask,Glenfiddich,Scotch,9,58.6,Highlands,Port,Single Cask,42.50,False,2013,84.4,0,Low,0,0
2,Jack Daniel's 18 Rum Cask,Jack Daniel's,Bourbon,18,41.3,Kentucky,Rum,Cask Strength,217.37,True,2007,93.5,1,High,1,1
3,Lagavulin NAS Wine Cask,Lagavulin,Scotch,0,50.3,Speyside,Wine,Single Malt,61.79,False,2019,86.0,1,Medium,4,4
4,Glenfiddich NAS Port Cask,Glenfiddich,Bourbon,0,52.6,Kentucky,Port,Single Cask,60.75,False,1996,88.6,1,Medium,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Jack Daniel's 28 Bourbon Cask,Jack Daniel's,Rye,28,44.1,Canada,Bourbon,Blended,74.17,False,2021,93.8,1,High,4,4
496,Glenfiddich 7 Rum Cask,Glenfiddich,Irish,7,47.7,Dublin,Rum,Cask Strength,116.64,False,1991,86.1,1,Medium,3,3
497,Lagavulin 21 Rum Cask,Lagavulin,Bourbon,21,57.6,Tennessee,Rum,Blended,192.24,False,2021,94.2,2,High,1,1
498,Jameson 17 Bourbon Cask,Jameson,Japanese,17,46.6,Hokkaido,Bourbon,Single Cask,436.42,False,2016,91.0,2,High,2,2



Accuracy (Prediction Rate): 96.20%


## 10 tests with different datasets

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd
import random

# load processors (fitted and pickled by the preprocessing cell)
dirname = "model_files"
with open(f'{dirname}/label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)
with open(f'{dirname}/one_hot_encoder.pkl', 'rb') as f:
    OH_encoder = pickle.load(f)
with open(f'{dirname}/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open(f'{dirname}/feature_columns.pkl', 'rb') as f:
    feature_columns = pickle.load(f)

# load model
with open('svm_model.pkl', 'rb') as f:
    svm_model = pickle.load(f)

print("Model has been loaded")


n_tests = 10
# bounds for the random size of each generated test dataset
min_rows, max_rows = 500, 50000
accuracy_results = {}
for i in range(n_tests):

    # generate new dataset with a random number of rows
    nr = random.randint(min_rows, max_rows)
    df = generate_whiskey(num_rows=nr)

    # drop uneeded columns
    features_df = df.drop(['category', 'whiskey_name'], axis=1)

    # identify categorical and numerical columns
    categorical_cols = features_df.select_dtypes(include=['object', 'bool']).columns
    numeric_cols = features_df.select_dtypes(include=['int64', 'float64']).columns

    # apply One Hot to categorical columns
    OH_encoded = OH_encoder.transform(features_df[categorical_cols])  # Use transform, no fit
    OH_feature_names = OH_encoder.get_feature_names_out(categorical_cols)
    OH_df = pd.DataFrame(OH_encoded, columns=OH_feature_names)

    # keep numerical columns
    numeric_df = features_df[numeric_cols].reset_index(drop=True)

    # concat both dataframes
    features_df = pd.concat([numeric_df, OH_df], axis=1)

    # get the missing columns in new dataset using the list saved
    missing_cols = set(feature_columns) - set(features_df.columns)
    for col in missing_cols:
        features_df[col] = 0   # add columns with value = 0

    # re order the columns so they match the order used when fitting
    features_df = features_df[feature_columns]

    # normalize using the scaler loaded (use transform, not fit_transform)
    normalized_features = scaler.transform(features_df)
    normalized_df = pd.DataFrame(normalized_features, columns=feature_columns)

    # predict with normalized_df
    predictions = svm_model.predict(normalized_df)

    # create a df with the results
    result_df = df.copy()

    # predictions are already integer class codes (the model was trained on the
    # label-encoded target); passing them through label_encoder.transform would
    # fail because the encoder only knows the category names
    result_df['Predicted category'] = predictions

    # apply label encoder to target column
    result_df['category'] = label_encoder.transform(result_df['category'])

    # calculate the accuracy (prediction rate)
    accuracy = accuracy_score(result_df['category'], result_df['Predicted category'])

    # save to results dict
    accuracy_results[f"test {i+1}"] = {
        "Rows": nr,
        "Accuracy": f"{accuracy * 100:.2f}%"
    }

# show the results
results_df = pd.DataFrame.from_dict(accuracy_results, orient='index')
print("\nSummary Table:")
print(results_df)

# EXPORT MODEL TO DOWNLOADS FOLDER

In [None]:
# copy the model written by the "Save model" cell (svm_model.pkl) to the Downloads folder under its export name
%cp "svm_model.pkl" "/home/$USER/Downloads/whiskey_classificator_model.pkl"