In [1]:
# config (run before anything else)
LOG_RESULTS = True
N_SPLITS = 5
RANDOM_SEED = 42

import warnings

warnings.filterwarnings(
    "ignore",
    message=r"unknown class\(es\).*will be ignored",
    module="sklearn.preprocessing._label",
)

## Feature Engineering

### Text Features:

Experiment with `TfidfVectorizer` parameters (e.g., `ngram_range`, `min_df`, `max_df`).


In [2]:
from sklearn.svm import SVC
from utils import run_configs, print_results_table

# Path to folds file
FOLDS_PATH = "data/strong_baseline/folds_preprocessed.csv"

# Parameter grid
tfidf_configs = [
    {
        "description": "Baseline (Unigrams only)",
        "ngram_range": (1, 1),
        "min_df": 1,
        "max_df": 1.0,
    },
    {
        "description": "Unigrams + Bigrams",
        "ngram_range": (1, 2),
        "min_df": 1,
        "max_df": 1.0,
    },
    {
        "description": "Unigrams, ignore very rare terms (min_df=5)",
        "ngram_range": (1, 1),
        "min_df": 5,
        "max_df": 1.0,
    },
    {
        "description": "Unigrams, ignore very common terms (max_df=0.9)",
        "ngram_range": (1, 1),
        "min_df": 1,
        "max_df": 0.9,
    },
    {
        "description": "Unigrams + Bigrams, with term filtering",
        "ngram_range": (1, 2),
        "min_df": 5,
        "max_df": 0.9,
    },
]

results = run_configs(
    models=[(SVC, {"kernel": "linear"}, "Linear SVC")],
    folds_path=FOLDS_PATH,
    tfidf_configs=tfidf_configs,
    random_seed=RANDOM_SEED,
    log_results=LOG_RESULTS,
)

print_results_table(
    results, title="Linear SVC — TF-IDF Configs (Preprocessed)", top_n=10
)

Model: SVC
Cross-Validation (using data/strong_baseline/folds_preprocessed.csv)
  Fold 1: 0.8533
  Fold 2: 0.8533
  Fold 3: 0.8483
  Fold 4: 0.8433
  Fold 5: 0.8447
Mean accuracy: 0.8486  |  Std: 0.0047
Total runtime: 15.07 seconds
➕ Added new results for 'Linear SVC (preprocessed data) | Text | Baseline (Unigrams only)'.
✅ Results saved to results/log.xlsx
Model: SVC
Cross-Validation (using data/strong_baseline/folds_preprocessed.csv)
  Fold 1: 0.8883
  Fold 2: 0.8767
  Fold 3: 0.8767
  Fold 4: 0.8617
  Fold 5: 0.8564
Mean accuracy: 0.8720  |  Std: 0.0128
Total runtime: 45.94 seconds
➕ Added new results for 'Linear SVC (preprocessed data) | Text | Unigrams + Bigrams'.
✅ Results saved to results/log.xlsx
Model: SVC
Cross-Validation (using data/strong_baseline/folds_preprocessed.csv)
  Fold 1: 0.8633
  Fold 2: 0.8550
  Fold 3: 0.8450
  Fold 4: 0.8450
  Fold 5: 0.8464
Mean accuracy: 0.8509  |  Std: 0.0081
Total runtime: 13.40 seconds
➕ Added new results for 'Linear SVC (preprocessed data

### Numerical Features:

Use the `n_ingredients` column. We could also create new features like the number of steps, length of the description, etc. We may need to combine these with the TF-IDF features.


In [3]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.svm import SVC
from utils import run_kfold_experiment, create_folds

# config
ORIGINAL_DATA_PATH = "data/train.csv"
FOLDS_PATH = "data/feature_engineering/folds_numerical.csv"

# using best parameters identified from previous experiment for better performance
TFIDF_CFG = {
    "ngram_range": (1, 2),
    "min_df": 5,
    "max_df": 0.9,
}

# data loading and feature engineering
print(f"Loading original dataset from '{ORIGINAL_DATA_PATH}'...")
try:
    data = pd.read_csv(ORIGINAL_DATA_PATH, sep=";")
except FileNotFoundError:
    print(f"\nERROR: Original data file not found at '{ORIGINAL_DATA_PATH}'.")
    print("This script needs the raw dataset to create numerical features.")
    exit()

print("Starting feature engineering...")

# fill NaNs in key columns
data["description"] = data["description"].fillna("")
data["steps"] = data["steps"].fillna("[]")
data["n_ingredients"] = data["n_ingredients"].fillna(0)

# create 'description_len' feature
data["description_len"] = data["description"].str.len()


# create 'n_steps' feature
# evaluate 'steps' string to a list and get its length
def count_steps(step_string):
    try:
        return len(re.findall(r"\'(.*?)\'", str(step_string)))
    except (TypeError, ValueError):
        return 0


data["n_steps"] = data["steps"].apply(count_steps)
print(f"Created new features: 'description_len' and 'n_steps'.")
print("Top 5 rows with new features:")
print(data[["n_ingredients", "description_len", "n_steps"]].head())

# text preprocessing
print("\nPreprocessing text data...")
try:
    stopwords.words("english")
except LookupError:
    print("Stopwords not found. Downloading...")
    nltk.download("stopwords")
stop_words_set = set(stopwords.words("english"))

# same cleaning steps as before
data["description_processed"] = (
    data["description"]
    .str.lower()
    .str.replace(r"[^a-z\s]", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
    .apply(lambda text: " ".join(w for w in text.split() if w not in stop_words_set))
)

numerical_features = ["n_ingredients", "description_len", "n_steps"]

# make folds
print("\nCreating stratified folds via create_folds...")
create_folds(
    input_data=data,
    output_file=FOLDS_PATH,
    text_columns=[
        "description_processed"
    ],  # creates "document" column, same as description_processed, but default name, so its easier to replicate
    n_splits=N_SPLITS,
    random_seed=RANDOM_SEED,
)

# run K-Fold experiment with Linear SVC
print("\nRunning K-Fold experiment with Linear SVC on document + numeric features...\n")
_ = run_kfold_experiment(
    folds_file=FOLDS_PATH,
    model_cls=SVC,
    text_columns={"document": TFIDF_CFG},  # use the 'document' column created above
    categorical_columns=None,
    numerical_columns=numerical_features,  # ['n_ingredients','description_len','n_steps']
    model_kwargs={"kernel": "linear", "random_state": RANDOM_SEED},
    model_desc = "Linear SVC (numerical data) | Text + Numerical | Unigrams + Bigrams, with term filtering",
    log_results=LOG_RESULTS,
    random_seed=RANDOM_SEED,
)

Loading original dataset from 'data/train.csv'...
Starting feature engineering...
Created new features: 'description_len' and 'n_steps'.
Top 5 rows with new features:
   n_ingredients  description_len  n_steps
0              6               38        7
1             13               85       16
2             12               57       13
3              4               61        2
4             13              128       31

Preprocessing text data...

Creating stratified folds via create_folds...

## Chef ID Distribution Check

=== Original Dataset === (Size: 2999)
         Count  Percentage
chef_id                   
4470       806       26.88
5060       534       17.81
3288       451       15.04
8688       432       14.40
1533       404       13.47
6357       372       12.40

=== Fold 1 === (Size: 600)
         Count  Percentage
chef_id                   
4470       161       26.83
5060       107       17.83
3288        90       15.00
8688        86       14.33
1533        81       13.

### Categorical Features:

Process the `tags` field.

In [4]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.svm import SVC
from utils import run_kfold_experiment, create_folds

# config
ORIGINAL_DATA_PATH = "data/train.csv"
FOLDS_PATH = "data/feature_engineering/folds_allfeatures.csv"

# using best parameters identified from previous experiment for better performance
TFIDF_CFG = {
    "ngram_range": (1, 2),
    "min_df": 5,
    "max_df": 0.9,
}

# data loading and feature engineering
print(f"Loading original dataset from '{ORIGINAL_DATA_PATH}'...")
try:
    data = pd.read_csv(ORIGINAL_DATA_PATH, sep=";")
except FileNotFoundError:
    print(f"\nERROR: Original data file not found at '{ORIGINAL_DATA_PATH}'.")
    exit()

print("Starting feature engineering...")

# fill NaNs in key columns
data["description"] = data["description"].fillna("")
data["steps"] = data["steps"].fillna("[]")
data["tags"] = data["tags"].fillna("[]")
data["n_ingredients"] = data["n_ingredients"].fillna(0)

# create numerical features
data["description_len"] = data["description"].str.len()
data["n_steps"] = data["steps"].apply(lambda x: len(re.findall(r"\'(.*?)\'", str(x))))


# define a function to parse the tags column (kept for consistency, helper has its own too)
def parse_tags(tag_string):
    try:
        return re.findall(r"\'(.*?)\'", str(tag_string))
    except (TypeError, ValueError):
        return []


# text preprocessing
print("\nPreprocessing text data...")
try:
    stopwords.words("english")
except LookupError:
    print("Stopwords not found. Downloading...")
    nltk.download("stopwords")
stop_words_set = set(stopwords.words("english"))

data["description_processed"] = (
    data["description"]
    .str.lower()
    .str.replace(r"[^a-z\s]", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
    .apply(lambda text: " ".join([w for w in text.split() if w not in stop_words_set]))
)

numerical_features = ["n_ingredients", "description_len", "n_steps"]
categorical_feature = "tags"  # will be parsed inside run_kfold_experiment
text_feature = "description_processed"  # fed to create_folds -> 'document'

# make folds
print("\nCreating stratified folds via create_folds...")
create_folds(
    input_data=data,
    output_file=FOLDS_PATH,
    text_columns=[text_feature],  # creates 'document' from description_processed
    n_splits=N_SPLITS,
    random_seed=RANDOM_SEED,
)

# run K-Fold experiment with Linear SVC
print(
    "\nRunning K-Fold experiment with Linear SVC on document + numeric + tags features...\n"
)
_ = run_kfold_experiment(
    folds_file=FOLDS_PATH,
    model_cls=SVC,
    text_columns={"document": TFIDF_CFG},  # use the 'document' column created above
    categorical_columns=categorical_feature,  # 'tags' (helper will parse list-like strings)
    numerical_columns=numerical_features,  # ['n_ingredients','description_len','n_steps']
    model_kwargs={"kernel": "linear", "random_state": RANDOM_SEED},
    model_desc="Linear SVC (allfeatures data) | Text + Numerical + Categorical | Unigrams + Bigrams, with term filtering",
    log_results=LOG_RESULTS,
    random_seed=RANDOM_SEED,
)

Loading original dataset from 'data/train.csv'...
Starting feature engineering...

Preprocessing text data...

Creating stratified folds via create_folds...

## Chef ID Distribution Check

=== Original Dataset === (Size: 2999)
         Count  Percentage
chef_id                   
4470       806       26.88
5060       534       17.81
3288       451       15.04
8688       432       14.40
1533       404       13.47
6357       372       12.40

=== Fold 1 === (Size: 600)
         Count  Percentage
chef_id                   
4470       161       26.83
5060       107       17.83
3288        90       15.00
8688        86       14.33
1533        81       13.50
6357        75       12.50

=== Fold 2 === (Size: 600)
         Count  Percentage
chef_id                   
4470       162       27.00
5060       107       17.83
3288        90       15.00
8688        86       14.33
1533        81       13.50
6357        74       12.33

=== Fold 3 === (Size: 600)
         Count  Percentage
chef_id       

## Mixing and matching

To see what has the best results.


In [5]:
from utils import run_configs, print_results_table
import itertools

# config
FOLDS_PATH = "data/feature_engineering/folds_allfeatures.csv"

tfidf_configs = [
    {
        "description": "Unigrams + Bigrams, with term filtering",
        "ngram_range": (1, 2),
        "min_df": 5,
        "max_df": 0.9,
    }
]

numerical_features = ["n_ingredients", "description_len", "n_steps"]
categorical_feature = "tags"
# text_column_name    = "document" # def, no need to add

# all feature combinations
features = ["Text", "Numerical", "Categorical"]
feature_combinations = [
    combo
    for r in range(1, len(features) + 1)
    for combo in itertools.combinations(features, r)
]

results = run_configs(
    models=[(SVC, {"kernel": "linear"}, "Linear SVC")],
    folds_path=FOLDS_PATH,
    tfidf_configs=tfidf_configs,
    feature_combinations=feature_combinations,
    categorical_features=categorical_feature,
    numerical_features=numerical_features,
    random_seed=RANDOM_SEED,
    log_results=LOG_RESULTS,
)

print_results_table(results, title="SVC Accuracy by Feature Set", top_n=10)

Model: SVC
Cross-Validation (using data/feature_engineering/folds_allfeatures.csv)
  Fold 1: 0.6650
  Fold 2: 0.6283
  Fold 3: 0.6817
  Fold 4: 0.6550
  Fold 5: 0.6728
Mean accuracy: 0.6606  |  Std: 0.0205
Total runtime: 2.64 seconds
➕ Added new results for 'Linear SVC (allfeatures data) | Text | Unigrams + Bigrams, with term filtering'.
✅ Results saved to results/log.xlsx
Model: SVC
Cross-Validation (using data/feature_engineering/folds_allfeatures.csv)
  Fold 1: 0.3450
  Fold 2: 0.3467
  Fold 3: 0.3650
  Fold 4: 0.3400
  Fold 5: 0.3589
Mean accuracy: 0.3511  |  Std: 0.0104
Total runtime: 0.36 seconds
➕ Added new results for 'Linear SVC (allfeatures data) | Numerical'.
✅ Results saved to results/log.xlsx
Model: SVC
Cross-Validation (using data/feature_engineering/folds_allfeatures.csv)
  Fold 1: 0.8117
  Fold 2: 0.8083
  Fold 3: 0.7900
  Fold 4: 0.8050
  Fold 5: 0.8047
Mean accuracy: 0.8039  |  Std: 0.0083
Total runtime: 2.07 seconds
➕ Added new results for 'Linear SVC (allfeatures da