In [1]:
%pip install gradio huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [2]:
import gradio as gr
import pandas as pd
import numpy as np
import pickle
import random
from sklearn.metrics import accuracy_score
from huggingface_hub import hf_hub_download


# --- preprocessing artifacts and model ---------------------------------
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file. Only load artifacts (local or downloaded) from sources you trust.
def _load_pickle(path):
    """Open *path* in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# processors saved next to the notebook
dirname = "model_files"
label_encoder = _load_pickle(f'{dirname}/label_encoder.pkl')
OH_encoder = _load_pickle(f'{dirname}/one_hot_encoder.pkl')
scaler = _load_pickle(f'{dirname}/scaler.pkl')
feature_columns = _load_pickle(f'{dirname}/feature_columns.pkl')

# repository and file that hold the trained classifier
model_name = "Rogudev/whiskey_classificator"
model_filename = "whiskey_classificator_model.pkl"

# fetch the model file from Hugging Face, then unpickle it from the
# path that hf_hub_download hands back
model_path = hf_hub_download(repo_id=model_name, filename=model_filename)
svm_model = _load_pickle(model_path)

# function to generate a new dataset
def generate_whiskey(num_rows=500, seed=None):
    """
    Generate a balanced DataFrame with whiskey data across all price categories.

    Rows are split as evenly as possible between the five price categories
    (the first ``num_rows % 5`` categories get one extra row) and the final
    frame is shuffled.

    Parameters:
    - num_rows: int — Number of rows to generate (default 500). Floats
      (e.g. a UI slider value such as 500.0) are truncated with int().
    - seed: int | None — When given, every random draw comes from generators
      seeded with this value, so the same seed reproduces the same dataset.
      Must be a valid NumPy seed (0 <= seed < 2**32). When None (default)
      the global ``random`` / ``np.random`` state is used, as before.

    Returns:
    - pd.DataFrame — Whiskey dataset.
    """
    # range() below rejects floats, so normalise the row count up front
    num_rows = int(num_rows)

    # random.Random / np.random.RandomState expose the same methods as the
    # module-level functions, so one code path serves both modes
    if seed is None:
        py_rng, np_rng = random, np.random
    else:
        py_rng, np_rng = random.Random(seed), np.random.RandomState(seed)

    # constants
    brands = ["Macallan", "Glenfiddich", "Yamazaki", "Lagavulin", "Jack Daniel's",
              "Buffalo Trace", "Balvenie", "Ardbeg", "Jameson", "Highland Park"]
    types = ["Scotch", "Bourbon", "Rye", "Japanese", "Irish"]
    regions = {
        "Scotch": ["Islay", "Speyside", "Highlands", "Lowlands"],
        "Bourbon": ["Kentucky", "Tennessee"],
        "Rye": ["Canada", "USA"],
        "Japanese": ["Honshu", "Hokkaido"],
        "Irish": ["Dublin", "Cork"],
    }
    cask_types = ["Sherry", "Bourbon", "Port", "Wine", "Rum"]
    bottling_types = ["Single Malt", "Blended", "Single Cask", "Cask Strength"]

    # age 0 means "no age statement" (10% of rows); ages 3..30 share the rest
    age_choices = [0, *range(3, 31)]
    age_weights = [0.1] + [0.9 / 28] * 28

    # category definitions (linked to price): name -> (min, max) price in USD
    category_definitions = {
        "Basic": (25, 49),
        "Standard": (50, 88),
        "Premium": (89, 128),
        "Exclusive": (129, 278),
        "Luxury": (279, 500),
    }

    categories = list(category_definitions.keys())
    num_classes = len(categories)
    per_class = num_rows // num_classes
    remainder = num_rows % num_classes

    data = []

    for i, category in enumerate(categories):
        # spread the remainder over the first categories
        count = per_class + (1 if i < remainder else 0)
        price_min, price_max = category_definitions[category]

        for _ in range(count):
            brand = py_rng.choice(brands)
            w_type = py_rng.choice(types)
            region = py_rng.choice(regions[w_type])
            age = np_rng.choice(age_choices, p=age_weights)
            abv = round(py_rng.uniform(40, 60), 1)
            cask = py_rng.choice(cask_types)
            bottling = py_rng.choice(bottling_types)
            limited = np_rng.rand() < 0.15
            release_year = py_rng.randint(1990, 2025)
            awards = np_rng.poisson(1.5)
            # mean rating grows with age and award count
            avg_rating = round(np_rng.normal(85 + (age / 30) * 10 + awards, 3), 1)
            price = round(py_rng.uniform(price_min, price_max), 2)

            # rating category (ordinal)
            if avg_rating < 85:
                rating_category = "Low"
            elif avg_rating < 90:
                rating_category = "Medium"
            elif avg_rating < 95:
                rating_category = "High"
            else:
                # NOTE(review): spelling kept on purpose; a fitted encoder
                # presumably expects this exact label — confirm before fixing
                rating_category = "Excelent"

            whiskey_name = f"{brand} {age if age else 'NAS'} {cask} Cask"

            data.append([
                whiskey_name, brand, w_type, age, abv, region, cask,
                bottling, price, limited, release_year, avg_rating,
                awards, rating_category, category
            ])

    columns = [
        "whiskey_name", "brand", "type", "age", "abv", "region", "cask_type",
        "bottling_type", "retail_price_usd", "is_limited_edition",
        "release_year", "average_rating", "award_wins", "rating_category", "category"
    ]

    df = pd.DataFrame(data, columns=columns)
    # shuffle so the categories are not grouped together
    df = df.sample(frac=1, random_state=py_rng.randint(0, 10_000)).reset_index(drop=True)

    return df

# test model with data generation
def test_model(num_rows):
    """
    Generate a synthetic dataset, run it through the loaded preprocessing
    objects and model, and report how often the prediction matches.

    Parameters:
    - num_rows: int — Number of rows to generate. Floats (such as a slider
      value) are truncated with int().

    Returns:
    - tuple — (category legend text, result DataFrame with the encoded and
      predicted category columns, accuracy message, Gradio update that makes
      the results table visible).
    """
    # generate new dataset; coerce first because a float row count would
    # break the generator's range() loops
    df = generate_whiskey(int(num_rows))

    # drop unneeded columns (the target and the free-text name)
    features_df = df.drop(['category', 'whiskey_name'], axis=1)

    # identify categorical and numerical columns
    categorical_cols = features_df.select_dtypes(include=['object', 'bool']).columns
    numeric_cols = features_df.select_dtypes(include=['int64', 'float64']).columns

    # apply One Hot to categorical columns (transform only, never re-fit)
    OH_encoded = OH_encoder.transform(features_df[categorical_cols])
    OH_feature_names = OH_encoder.get_feature_names_out(categorical_cols)
    OH_df = pd.DataFrame(OH_encoded, columns=OH_feature_names)

    # keep numerical columns
    numeric_df = features_df[numeric_cols].reset_index(drop=True)

    # concat both dataframes
    features_df = pd.concat([numeric_df, OH_df], axis=1)

    # align with the saved column list in one step: columns missing from this
    # dataset are added filled with 0, and the saved order is enforced
    features_df = features_df.reindex(columns=feature_columns, fill_value=0)

    # normalize using the scaler loaded (use transform, not fit_transform)
    normalized_features = scaler.transform(features_df)
    normalized_df = pd.DataFrame(normalized_features, columns=feature_columns)

    # predict with normalized_df
    predictions = svm_model.predict(normalized_df)

    # create a df with the results
    result_df = df.copy()
    # add codified categories
    result_df['Codified category'] = label_encoder.transform(df['category'])
    # add the predictions next to the original data to show in Gradio
    result_df['Predicted category'] = predictions

    # calculate the accuracy (prediction rate)
    accuracy = accuracy_score(result_df['Codified category'], result_df['Predicted category'])

    # legend for the codified columns: take the code -> name mapping from the
    # encoder itself so it cannot drift from the codes shown in the table
    # NOTE(review): assumes label_encoder exposes `classes_` ordered by code
    # (as scikit-learn's LabelEncoder does) — confirm for the pickled object
    classes = getattr(label_encoder, "classes_", None)
    if classes is not None:
        category_list = "\n".join(f"{code}: {name}" for code, name in enumerate(classes))
    else:
        # mapping unknown: fall back to the plain list of category names
        category_list = "\n".join(['Basic', 'Standard', 'Premium', 'Exclusive', 'Luxury'])

    return category_list, result_df, f"Model accuracy: {accuracy * 100:.2f}%", gr.update(visible=True)



# Gradio interface using Blocks
with gr.Blocks() as demo:
    # header
    gr.Markdown("### Demo for whiskey classifier model")
    gr.Markdown("Create a synthetic dataset, process dataset, apply model and show accuracy.")

    # input: size of the synthetic dataset
    rows_slider = gr.Slider(
        label="Number of rows",
        minimum=100,
        maximum=5000,
        value=1000,
    )

    # summary outputs, side by side
    with gr.Row():
        category_output = gr.Textbox(label="Categories list")
        accuracy_output = gr.Textbox(label="Accuracy")

    # results table, hidden until the first run
    with gr.Row():
        dataset_output = gr.Dataframe(
            label="Dataset to predict without processment + codified categories + predictions",
            visible=False,
        )

    run_btn = gr.Button("Run Test")

    # test_model returns four values; the table is listed twice so it
    # receives both its data and the visibility update
    click_outputs = [
        category_output,
        dataset_output,
        accuracy_output,
        dataset_output,
    ]
    run_btn.click(fn=test_model, inputs=rows_slider, outputs=click_outputs)

demo.launch()

  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7913
* To create a public link, set `share=True` in `launch()`.


