In [1]:
import pandas as pd
import numpy as np
import os
import gc
import warnings
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from PIL import Image
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input as effnet_preprocess
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')
sns.set(style="darkgrid")

In [2]:
df = pd.read_csv('/content/train.csv')
df_test = pd.read_csv('/content/test.csv')

# Sample 50% of the training data
df = df.sample(frac=0.4, random_state=42).reset_index(drop=True)

In [3]:
import pandas as pd
import numpy as np

def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and modify the data type to reduce memory usage."""
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
             # Check if the number of unique values is small enough to be a category
            if len(df[col].unique()) / len(df) < 0.5:
                df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {100 * (start_mem - end_mem) / start_mem:.1f}%')

    return df

df = reduce_mem_usage(df)
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 0.92 MB
Memory usage after optimization is: 0.69 MB
Decreased by 25.0%
Memory usage of dataframe is 1.72 MB
Memory usage after optimization is: 1.43 MB
Decreased by 16.7%


In [4]:
print(df.head())

   sample_id                                    catalog_content  \
0     158784  Item Name: Log Cabin Sugar Free Syrup, 24 FL O...   
1       4095  Item Name: Raspberry Ginseng Oolong Tea (50 te...   
2     172021  Item Name: Walden Farms Honey Dijon Dressing -...   
3     268276  Item Name: Vlasic Ovals Hamburger Dill Pickle ...   
4     154791  Item Name: Amoretti Premium Syrup, Grand Orang...   

                                          image_link      price  
0  https://m.media-amazon.com/images/I/71QD2OFXqD...  12.195000  
1  https://m.media-amazon.com/images/I/813OiT8mdJ...  38.540001  
2  https://m.media-amazon.com/images/I/71HGx42QmU...  17.860001  
3  https://m.media-amazon.com/images/I/71AbnhXOTA...   2.940000  
4  https://m.media-amazon.com/images/I/61c+aTE6TY...  25.990000  


In [5]:
first_row = df.head().iloc[0]

catalog_content_first_row = first_row['catalog_content']

print(catalog_content_first_row)



Item Name: Log Cabin Sugar Free Syrup, 24 FL OZ (Pack of 12)
Bullet Point 1: Contains twelve (12) 24-ounce bottles of Log Cabin Sugar Free Syrup for Pancakes and Waffles
Bullet Point 2: Indulge in thick, delicious syrup for pancakes, waffles, French toast and more
Bullet Point 3: 90% fewer calories than our original syrup and no sugar or high fructose corn syrup
Bullet Point 4: Amazing syrup that you can feel good about serving to your family and guests
Bullet Point 5: Stock up on this breakfast staple for decadent pancakes and waffles anytime
Value: 288.0
Unit: Fl Oz



In [6]:
def parse_catalog_content(text):
    parsed_data = {
        'item_name': '', 'product_description': '', 'value': np.nan,
        'unit': '', 'bullet_points': []
    }
    for line in text.strip().split('\n'):
        if line.startswith('Item Name:'):
            parsed_data['item_name'] = line.replace('Item Name:', '').strip()
        elif line.startswith('Product Description:'):
            parsed_data['product_description'] = line.replace('Product Description:', '').strip()
        elif line.startswith('Value:'):
            try:
                parsed_data['value'] = float(line.replace('Value:', '').strip())
            except (ValueError, TypeError):
                parsed_data['value'] = np.nan
        elif line.startswith('Unit:'):
            parsed_data['unit'] = line.replace('Unit:', '').strip()
        elif 'Bullet Point' in line:
            bullet = re.sub(r'Bullet Point.*?:', '', line).strip()
            parsed_data['bullet_points'].append(bullet)
    parsed_data['bullet_points'] = ' '.join(parsed_data['bullet_points'])
    return pd.Series(parsed_data)

# Apply the parsing function
parsed_df = df['catalog_content'].apply(parse_catalog_content)
df = pd.concat([df.drop('catalog_content', axis=1), parsed_df], axis=1)

parsed_df_test = df_test['catalog_content'].apply(parse_catalog_content)
df_test = pd.concat([df_test.drop('catalog_content', axis=1), parsed_df_test], axis=1)

In [7]:
df.head()

Unnamed: 0,sample_id,image_link,price,item_name,product_description,value,unit,bullet_points
0,158784,https://m.media-amazon.com/images/I/71QD2OFXqD...,12.195,"Log Cabin Sugar Free Syrup, 24 FL OZ (Pack of 12)",,288.0,Fl Oz,Contains twelve (12) 24-ounce bottles of Log C...
1,4095,https://m.media-amazon.com/images/I/813OiT8mdJ...,38.540001,"Raspberry Ginseng Oolong Tea (50 tea bags, ZIN...",Our Raspberry Ginseng Oolong Tea is a luxuriou...,100.0,Count,Our Raspberry Ginseng Oolong Tea is a luxuriou...
2,172021,https://m.media-amazon.com/images/I/71HGx42QmU...,17.860001,Walden Farms Honey Dijon Dressing - Calorie-Fr...,Nothing beats the flavor of honey and Dijon mu...,24.0,Fl Oz,Walden Farms Salad Dressings
3,268276,https://m.media-amazon.com/images/I/71AbnhXOTA...,2.94,"Vlasic Ovals Hamburger Dill Pickle Chips, Keto...",,16.0,Count,One 16 fl oz jar of Vlasic Ovals Hamburger Dil...
4,154791,https://m.media-amazon.com/images/I/61c+aTE6TY...,25.99,"Amoretti Premium Syrup, Grand Orange, 25.4 Oun...",,304.8,Fl Oz,Made with natural flavor Only 35 calories per ...


In [8]:
df_test.head()

Unnamed: 0,sample_id,image_link,item_name,product_description,value,unit,bullet_points
0,100179,https://m.media-amazon.com/images/I/71hoAn78AW...,Rani 14-Spice Eshamaya's Mango Chutney (Indian...,Mango chutney is made from diced green mangoes...,10.5,Ounce,You'll LOVE our 14-Spice Eshamaya's Mango Chut...
1,245611,https://m.media-amazon.com/images/I/61ex8NHCIj...,Natural MILK TEA Flavoring extract by HALO PAN...,Check our popular Milk Tea flavoring extract i...,2.0,Fl Oz,"Authentic Tasting, Asian-Inspired Natural flav..."
2,146263,https://m.media-amazon.com/images/I/61KCM61J8e...,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,32.0,Ounce,Honey Filled Hard Candy; 2-pound bulk pack; ap...
3,95658,https://m.media-amazon.com/images/I/51Ex6uOH7y...,Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2),,2.0,Count,
4,36806,https://m.media-amazon.com/images/I/71QYlrOMoS...,"McCormick Culinary Vanilla Extract, 32 fl oz -...",,32.0,Fl Oz,PREMIUM INGREDIENTS: McCormick Culinary Pure V...


In [9]:
import pandas as pd
from IPython.display import display
import re
import numpy as np

def create_features(df, mappers=None):
    if df is None:
        print("Input DataFrame is None, skipping feature creation.")
        return None, None

    # Extract Brand first
    df['brand'] = df['item_name'].apply(lambda x: str(x).split(' ')[0].replace('’s', '')) # Handle potential NaN

    # Combining all text for searching
    df['full_text'] = df['item_name'].fillna('') + ' ' + df['product_description'].fillna('') + ' ' + df['bullet_points'].fillna('')
    df['full_text'] = df['full_text'].str.lower()

    # Extracted Numerical and Text Length Features
    df['size_from_title'] = df['item_name'].str.extract(r'(\d+\.?\d*)\s*(oz|ounce|ounces|lb|lbs|kg|g|gram|grams)', flags=re.IGNORECASE)[0]
    df['size_from_title'] = pd.to_numeric(df['size_from_title'], errors='coerce')
    df['item_name_length'] = df['item_name'].str.len()
    df['bullet_points_count'] = df['bullet_points'].apply(lambda x: len(x.split('.')) if isinstance(x, str) and x else 0)

    # **Enhanced Text Features**
    df['item_name_word_count'] = df['item_name'].apply(lambda x: len(str(x).split()))
    df['item_name_unique_word_count'] = df['item_name'].apply(lambda x: len(set(str(x).split())))
    df['full_text_word_count'] = df['full_text'].apply(lambda x: len(str(x).split()))
    df['full_text_unique_word_count'] = df['full_text'].apply(lambda x: len(set(str(x).split())))
    df['product_description_char_count'] = df['product_description'].str.len().fillna(0)
    df['bullet_points_char_count'] = df['bullet_points'].str.len().fillna(0)

    # Others
    epsilon = 1e-6
    if 'price' in df.columns:
        df['value_per_price'] = df['value'] / (df['price'] + epsilon)
    else:
        df['value_per_price'] = np.nan # Or some other placeholder if price is not available
    df['total_value'] = df['value'] * df.get('pack_size', 1)
    df['brand_unit'] = df['brand'].astype(str) + '_' + df['unit'].astype(str)


    # Extracted and Flagged Features
    df['is_gluten_free'] = df['full_text'].str.contains('gluten-free', na=False).astype(int)
    df['is_nut_free'] = df['full_text'].str.contains('nut-free', na=False).astype(int)
    df['is_made_in_usa'] = df['full_text'].str.contains('made in usa', na=False).astype(int)
    df['is_for_dips'] = df['full_text'].str.contains('dips|spreads', na=False).astype(int)

    # Encoded Categorical Features
    if mappers is None:
        # Learning mappers and returning them (Training mode)
        print("Training mode: Learning category mappers...")
        mappers = {}
        # Conversion to category and creating the mapping dictionary
        # Handle potential NaN values before converting to category
        df['brand_cat'] = df['brand'].fillna('Unknown').astype('category')
        mappers['brand'] = dict(enumerate(df['brand_cat'].cat.categories))
        mappers['brand'] = {v: k for k, v in mappers['brand'].items()}

        df['unit_cat'] = df['unit'].fillna('Unknown').astype('category')
        mappers['unit'] = dict(enumerate(df['unit_cat'].cat.categories))
        mappers['unit'] = {v: k for k, v in mappers['unit'].items()}

        df['brand_encoded'] = df['brand'].map(mappers['brand'])
        df['unit_encoded'] = df['unit'].map(mappers['unit'])

        # Drop temporary columns
        df = df.drop(columns=['brand_cat', 'unit_cat'])

        return df, mappers
    else:
        # Testing Mode: Apply existing mappers
        print("Testing mode: Applying existing mappers...")
        # Handle categories present in test but not in train by mapping to -1 (or another indicator)
        df['brand_encoded'] = df['brand'].map(mappers['brand']).fillna(-1)
        df['unit_encoded'] = df['unit'].map(mappers['unit']).fillna(-1)


        return df, None

In [10]:
df_train_featured, learned_mappers = create_features(df)
print("\nLearned Mappers from Training Data:")
print(learned_mappers)


# Processing the test data using the learned mappers
df_test_featured, _ = create_features(df_test, mappers=learned_mappers)


# Displaying the result for the test set
print("\nDataFrame `df_test` with Advanced Features:")
display(df_test_featured[['sample_id', 'size_from_title', 'brand', 'unit', 'brand_encoded', 'unit_encoded', 'is_gluten_free']])

Training mode: Learning category mappers...

Learned Mappers from Training Data:
{'brand': {'': 0, '"Bumble': 1, '"Menta"': 2, '"Now': 3, '"Savia': 4, "'Rich": 5, "'Whisps,": 6, '(10': 7, '(16': 8, '(2': 9, '(3': 10, '(3)': 11, '(4': 12, '(4-Pack)': 13, '(7-PACK)': 14, '(8': 15, '(Collection': 16, '(NOT': 17, '(Pack': 18, '.Mount': 19, '0854775002031-m5temp': 20, '1': 21, '1.44': 22, '1.75': 23, '1/2': 24, '1/4': 25, '10': 26, '10%': 27, '10.5oz': 28, '100': 29, '100%': 30, '100pcs': 31, '108ct': 32, '10ct.': 33, '10oz': 34, '116': 35, '12': 36, '12.75': 37, '12ct': 38, '13': 39, '131': 40, '13oz': 41, '14': 42, '152': 43, '18': 44, '18"': 45, '180': 46, '1850': 47, '1883': 48, '18ct': 49, '1LB': 50, '1lb': 51, '1oz': 52, '2': 53, '2-pack': 54, '2.1OZCatHairballControl': 55, '2.5': 56, '2.6oz': 57, '20': 58, '200+': 59, '2018': 60, '2025': 61, '20Oz': 62, '20oz': 63, '212': 64, '24': 65, '24Vegan': 66, '24x18-Inch': 67, '25': 68, '255': 69, '25OZ': 70, '25ct': 71, '270': 72, '2LB': 73,

Unnamed: 0,sample_id,size_from_title,brand,unit,brand_encoded,unit_encoded,is_gluten_free
0,100179,10.5,Rani,Ounce,5222.0,23.0,0
1,245611,2.0,Natural,Fl Oz,4398.0,12.0,1
2,146263,,Honey,Ounce,2955.0,23.0,0
3,95658,16.0,Vlasic,Count,6660.0,8.0,0
4,36806,,McCormick,Fl Oz,4038.0,12.0,0
...,...,...,...,...,...,...,...
74995,93616,6.0,Good,Ounce,2624.0,23.0,0
74996,249434,,Colombina,Ounce,1535.0,23.0,0
74997,162217,,"Kerns,",Fl Oz,-1.0,12.0,0
74998,230487,,NY,Ounce,4362.0,23.0,0


In [11]:
if 'df_train_featured' in globals() and df_train_featured is not None and \
   'df_test_featured' in globals() and df_test_featured is not None:

    # Identify categorical columns to encode
    categorical_cols = ['brand', 'unit']

    # Apply One-Hot Encoding
    print("Applying One-Hot Encoding...")
    df_train_encoded = pd.get_dummies(df_train_featured, columns=categorical_cols, dummy_na=False)
    df_test_encoded = pd.get_dummies(df_test_featured, columns=categorical_cols, dummy_na=False)

    # Align columns - this is important to ensure train and test have the same columns
    train_cols = df_train_encoded.columns
    test_cols = df_test_encoded.columns

    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test:
        df_test_encoded[c] = 0

    missing_in_train = set(test_cols) - set(train_cols)
    for c in missing_in_train:
        df_train_encoded[c] = 0

    # Ensure the order of columns is the same
    df_test_encoded = df_test_encoded[train_cols]


    print("One-Hot Encoding applied. Displaying head of encoded training data:")
    display(df_train_encoded.head())
    print("\nDisplaying head of encoded test data:")
    display(df_test_encoded.head())

    # Now, you would use df_train_encoded and df_test_encoded for combining with BERT embeddings
    # and for model training/prediction, instead of df_train_featured and df_test_featured.

else:
    print("Skipping One-Hot Encoding: Feature engineered DataFrames not available.")
    df_train_encoded, df_test_encoded = None, None

Applying One-Hot Encoding...
One-Hot Encoding applied. Displaying head of encoded training data:


Unnamed: 0,sample_id,image_link,price,item_name,product_description,value,bullet_points,full_text,size_from_title,item_name_length,...,brand_Fulfil,brand_Rasa,brand_Yakami,brand_Bohio,brand_RECYCLED,brand_Classy,brand_Salutem,brand_Scor-Pal,brand_Barr's,brand_Cote
0,158784,https://m.media-amazon.com/images/I/71QD2OFXqD...,12.195,"Log Cabin Sugar Free Syrup, 24 FL OZ (Pack of 12)",,288.0,Contains twelve (12) 24-ounce bottles of Log C...,"log cabin sugar free syrup, 24 fl oz (pack of ...",,49,...,0,0,0,0,0,0,0,0,0,0
1,4095,https://m.media-amazon.com/images/I/813OiT8mdJ...,38.540001,"Raspberry Ginseng Oolong Tea (50 tea bags, ZIN...",Our Raspberry Ginseng Oolong Tea is a luxuriou...,100.0,Our Raspberry Ginseng Oolong Tea is a luxuriou...,"raspberry ginseng oolong tea (50 tea bags, zin...",,64,...,0,0,0,0,0,0,0,0,0,0
2,172021,https://m.media-amazon.com/images/I/71HGx42QmU...,17.860001,Walden Farms Honey Dijon Dressing - Calorie-Fr...,Nothing beats the flavor of honey and Dijon mu...,24.0,Walden Farms Salad Dressings,walden farms honey dijon dressing - calorie-fr...,12.0,154,...,0,0,0,0,0,0,0,0,0,0
3,268276,https://m.media-amazon.com/images/I/71AbnhXOTA...,2.94,"Vlasic Ovals Hamburger Dill Pickle Chips, Keto...",,16.0,One 16 fl oz jar of Vlasic Ovals Hamburger Dil...,"vlasic ovals hamburger dill pickle chips, keto...",,65,...,0,0,0,0,0,0,0,0,0,0
4,154791,https://m.media-amazon.com/images/I/61c+aTE6TY...,25.99,"Amoretti Premium Syrup, Grand Orange, 25.4 Oun...",,304.8,Made with natural flavor Only 35 calories per ...,"amoretti premium syrup, grand orange, 25.4 oun...",25.4,61,...,0,0,0,0,0,0,0,0,0,0



Displaying head of encoded test data:


Unnamed: 0,sample_id,image_link,price,item_name,product_description,value,bullet_points,full_text,size_from_title,item_name_length,...,unit_ml,unit_ounce,unit_ounces,unit_oz,unit_packs,unit_per Box,unit_per Carton,unit_pound,unit_pounds,unit_units
0,100179,https://m.media-amazon.com/images/I/71hoAn78AW...,0,Rani 14-Spice Eshamaya's Mango Chutney (Indian...,Mango chutney is made from diced green mangoes...,10.5,You'll LOVE our 14-Spice Eshamaya's Mango Chut...,rani 14-spice eshamaya's mango chutney (indian...,10.5,137,...,False,False,False,False,False,0,0,False,0,0
1,245611,https://m.media-amazon.com/images/I/61ex8NHCIj...,0,Natural MILK TEA Flavoring extract by HALO PAN...,Check our popular Milk Tea flavoring extract i...,2.0,"Authentic Tasting, Asian-Inspired Natural flav...",natural milk tea flavoring extract by halo pan...,2.0,217,...,False,False,False,False,False,0,0,False,0,0
2,146263,https://m.media-amazon.com/images/I/61KCM61J8e...,0,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,32.0,Honey Filled Hard Candy; 2-pound bulk pack; ap...,honey filled hard candy - bulk pack 2 pounds -...,,67,...,False,False,False,False,False,0,0,False,0,0
3,95658,https://m.media-amazon.com/images/I/51Ex6uOH7y...,0,Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2),,2.0,,vlasic snack'mm's kosher dill 16 oz (pack of 2),16.0,47,...,False,False,False,False,False,0,0,False,0,0
4,36806,https://m.media-amazon.com/images/I/71QYlrOMoS...,0,"McCormick Culinary Vanilla Extract, 32 fl oz -...",,32.0,PREMIUM INGREDIENTS: McCormick Culinary Pure V...,"mccormick culinary vanilla extract, 32 fl oz -...",,187,...,False,False,False,False,False,0,0,False,0,0


In [12]:
import joblib

mappers_filename = 'learned_mappers.pkl'

joblib.dump(learned_mappers, mappers_filename)

print(f"Learned mappers saved successfully to {mappers_filename}")

Learned mappers saved successfully to learned_mappers.pkl


In [13]:
display(df[['sample_id', 'size_from_title', 'brand', 'unit', 'brand_encoded', 'unit_encoded', 'is_gluten_free']])

Unnamed: 0,sample_id,size_from_title,brand,unit,brand_encoded,unit_encoded,is_gluten_free
0,158784,,Log,Fl Oz,3706,12,0
1,4095,,Raspberry,Count,5230,8,0
2,172021,12.0,Walden,Fl Oz,6715,12,1
3,268276,,Vlasic,Count,6660,8,0
4,154791,25.4,Amoretti,Fl Oz,336,12,0
...,...,...,...,...,...,...,...
29995,146588,,Maple,Count,3937,8,0
29996,29972,,Atkins,Fl Oz,462,12,0
29997,79107,,1,Ounce,21,23,0
29998,176583,,Pepto,count,4900,38,0


In [14]:
df.head()

Unnamed: 0,sample_id,image_link,price,item_name,product_description,value,unit,bullet_points,brand,full_text,...,total_value,brand_unit,is_gluten_free,is_nut_free,is_made_in_usa,is_for_dips,brand_cat,unit_cat,brand_encoded,unit_encoded
0,158784,https://m.media-amazon.com/images/I/71QD2OFXqD...,12.195,"Log Cabin Sugar Free Syrup, 24 FL OZ (Pack of 12)",,288.0,Fl Oz,Contains twelve (12) 24-ounce bottles of Log C...,Log,"log cabin sugar free syrup, 24 fl oz (pack of ...",...,288.0,Log_Fl Oz,0,0,0,0,Log,Fl Oz,3706,12
1,4095,https://m.media-amazon.com/images/I/813OiT8mdJ...,38.540001,"Raspberry Ginseng Oolong Tea (50 tea bags, ZIN...",Our Raspberry Ginseng Oolong Tea is a luxuriou...,100.0,Count,Our Raspberry Ginseng Oolong Tea is a luxuriou...,Raspberry,"raspberry ginseng oolong tea (50 tea bags, zin...",...,100.0,Raspberry_Count,0,0,0,0,Raspberry,Count,5230,8
2,172021,https://m.media-amazon.com/images/I/71HGx42QmU...,17.860001,Walden Farms Honey Dijon Dressing - Calorie-Fr...,Nothing beats the flavor of honey and Dijon mu...,24.0,Fl Oz,Walden Farms Salad Dressings,Walden,walden farms honey dijon dressing - calorie-fr...,...,24.0,Walden_Fl Oz,1,0,0,0,Walden,Fl Oz,6715,12
3,268276,https://m.media-amazon.com/images/I/71AbnhXOTA...,2.94,"Vlasic Ovals Hamburger Dill Pickle Chips, Keto...",,16.0,Count,One 16 fl oz jar of Vlasic Ovals Hamburger Dil...,Vlasic,"vlasic ovals hamburger dill pickle chips, keto...",...,16.0,Vlasic_Count,0,0,0,0,Vlasic,Count,6660,8
4,154791,https://m.media-amazon.com/images/I/61c+aTE6TY...,25.99,"Amoretti Premium Syrup, Grand Orange, 25.4 Oun...",,304.8,Fl Oz,Made with natural flavor Only 35 calories per ...,Amoretti,"amoretti premium syrup, grand orange, 25.4 oun...",...,304.8,Amoretti_Fl Oz,0,0,0,0,Amoretti,Fl Oz,336,12


In [15]:
df_test.head()

Unnamed: 0,sample_id,image_link,item_name,product_description,value,unit,bullet_points,brand,full_text,size_from_title,...,bullet_points_char_count,value_per_price,total_value,brand_unit,is_gluten_free,is_nut_free,is_made_in_usa,is_for_dips,brand_encoded,unit_encoded
0,100179,https://m.media-amazon.com/images/I/71hoAn78AW...,Rani 14-Spice Eshamaya's Mango Chutney (Indian...,Mango chutney is made from diced green mangoes...,10.5,Ounce,You'll LOVE our 14-Spice Eshamaya's Mango Chut...,Rani,rani 14-spice eshamaya's mango chutney (indian...,10.5,...,423,,10.5,Rani_Ounce,0,0,0,0,5222.0,23.0
1,245611,https://m.media-amazon.com/images/I/61ex8NHCIj...,Natural MILK TEA Flavoring extract by HALO PAN...,Check our popular Milk Tea flavoring extract i...,2.0,Fl Oz,"Authentic Tasting, Asian-Inspired Natural flav...",Natural,natural milk tea flavoring extract by halo pan...,2.0,...,486,,2.0,Natural_Fl Oz,1,0,0,0,4398.0,12.0
2,146263,https://m.media-amazon.com/images/I/61KCM61J8e...,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,32.0,Ounce,Honey Filled Hard Candy; 2-pound bulk pack; ap...,Honey,honey filled hard candy - bulk pack 2 pounds -...,,...,496,,32.0,Honey_Ounce,0,0,0,0,2955.0,23.0
3,95658,https://m.media-amazon.com/images/I/51Ex6uOH7y...,Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2),,2.0,Count,,Vlasic,vlasic snack'mm's kosher dill 16 oz (pack of 2),16.0,...,0,,2.0,Vlasic_Count,0,0,0,0,6660.0,8.0
4,36806,https://m.media-amazon.com/images/I/71QYlrOMoS...,"McCormick Culinary Vanilla Extract, 32 fl oz -...",,32.0,Fl Oz,PREMIUM INGREDIENTS: McCormick Culinary Pure V...,McCormick,"mccormick culinary vanilla extract, 32 fl oz -...",,...,1187,,32.0,McCormick_Fl Oz,0,0,0,0,4038.0,12.0


In [16]:
pip install transformers torch pandas



In [17]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm.notebook import tqdm

In [18]:
# Access the 5th row (index 4)
fifth_row = df.iloc[4]

# Get the 'full_text' from the fifth row
catalog_content_fifth_row = fifth_row['full_text']

print(catalog_content_fifth_row)

amoretti premium syrup, grand orange, 25.4 ounce (pack of 12)  made with natural flavor only 35 calories per serving and 62 servings per bottle free pump included delicious syrup for latte, tea, coffee, smoothies, italian soda and desserts proudly made in southern california


In [19]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [21]:
def get_bert_embeddings(text_list, batch_size=16):
    """Generates DistilBERT embeddings for a list of texts."""
    model.eval()
    all_embeddings = []

    for i in tqdm(range(0, len(text_list), batch_size), desc="Generating Embeddings"):
        batch_texts = text_list[i:i+batch_size]

        inputs = tokenizer(
            batch_texts, return_tensors='pt', truncation=True,
            padding=True, max_length=128
        )

        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        embeddings = outputs.last_hidden_state.mean(dim=1)
        all_embeddings.append(embeddings.cpu().numpy())

    return np.vstack(all_embeddings)

In [22]:
text_data_list = df['full_text'].tolist()
train_embeddings = get_bert_embeddings(text_data_list)

Generating Embeddings:   0%|          | 0/1875 [00:00<?, ?it/s]

In [23]:
test_text_list = df_test['full_text'].tolist()
test_embeddings = get_bert_embeddings(test_text_list)

Generating Embeddings:   0%|          | 0/4688 [00:00<?, ?it/s]

In [24]:
bert_df = pd.DataFrame(train_embeddings)
bert_df.columns = [f'bert_{i}' for i in range(bert_df.shape[1])]

In [25]:
gc.collect()

3179

In [26]:
test_bert_df = pd.DataFrame(test_embeddings)
test_bert_df.columns = [f'bert_{i}' for i in range(test_bert_df.shape[1])]

In [27]:
display(bert_df.head())

Unnamed: 0,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,bert_9,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,-0.068827,0.25152,0.333489,0.241956,0.342458,-0.22665,0.003175,0.301631,-0.036324,-0.041521,...,-0.235074,-0.13143,-0.095607,-0.148491,-0.023553,-0.211931,-0.010769,0.008236,0.067531,-0.115099
1,-0.465534,-0.073495,0.386675,0.180587,0.287033,-0.045065,0.017095,0.367019,-0.209528,0.167893,...,-0.384154,-0.192729,0.063083,-0.134403,-0.25277,-0.304791,-0.335039,-0.023868,-0.068404,-0.098475
2,-0.242956,0.111134,0.234416,0.211209,0.320511,-0.133878,0.005591,0.324222,0.132345,-0.065189,...,-0.088112,-0.15393,0.030235,0.048276,-0.051814,-0.261285,-0.150516,0.036125,0.001843,0.031135
3,-0.260626,0.041268,0.240604,0.295577,0.254434,-0.063628,-0.293346,0.373086,0.09731,0.002949,...,-0.104384,-0.07574,0.138008,0.09977,-0.082139,-0.137609,-0.227918,0.177854,0.042161,-0.165652
4,-0.157259,0.032156,0.226684,0.181221,0.280515,-0.052703,0.090127,0.175041,0.118526,0.072211,...,-0.049485,-0.070778,-0.166384,-0.06913,0.028027,-0.129555,0.014717,0.042802,-0.095301,-0.1241


In [28]:
display(test_bert_df.head())

Unnamed: 0,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,bert_9,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,-0.354232,0.037641,0.096414,0.248686,0.267613,-0.035609,-0.013614,0.415421,0.172093,0.034315,...,-0.182769,-0.102982,0.065086,-0.102934,-0.16965,-0.210989,-0.281351,0.08757,0.011162,-0.024327
1,-0.156423,0.083091,0.385709,0.181833,0.367321,-0.127781,0.120819,0.32971,0.047721,-0.078231,...,-0.126531,-0.145542,0.082508,-0.124749,-0.108829,-0.345423,-0.028783,-0.029766,-0.08992,-0.051168
2,-0.32662,0.102165,0.354262,0.184696,0.316962,-0.052845,0.025626,0.312831,0.049918,-0.080893,...,-0.049287,-0.25005,0.165586,-0.107625,-0.075856,-0.188541,-0.09352,0.048513,-0.068668,0.026461
3,0.000988,0.110604,0.06912,0.088857,0.104758,-0.02174,0.07988,-0.00359,0.199636,0.124082,...,-0.150086,0.05089,-0.169518,-0.165548,0.05262,0.037278,-0.002595,0.220331,-0.045691,-0.047284
4,-0.244438,0.055966,0.24136,0.13086,0.352477,-0.145542,0.021944,0.181788,0.308604,-0.093434,...,-0.078324,-0.189521,-0.081445,-0.070213,-0.159391,-0.467907,-0.029442,-0.124541,-0.010308,-0.004068


In [29]:
feature_cols = [
    'sample_id', 'price', 'value', 'size_from_title', 'item_name_length',
    'bullet_points_count', 'is_gluten_free', 'is_nut_free', 'is_made_in_usa',
    'is_for_dips', 'unit_encoded', 'brand_encoded'
]
feature_cols_test = [
    'sample_id', 'value', 'size_from_title', 'item_name_length',
    'bullet_points_count', 'is_gluten_free', 'is_nut_free', 'is_made_in_usa',
    'is_for_dips', 'unit_encoded', 'brand_encoded'
]

# Use the sampled DataFrame for structured features
structured_features_df = df[feature_cols].copy()
structured_features_df_test = df_test[feature_cols_test].copy()

In [30]:
final_df = pd.concat([
    structured_features_df.reset_index(drop=True),
    bert_df.reset_index(drop=True)
], axis=1)
final_df_test = pd.concat([
    structured_features_df_test.reset_index(drop=True),
    test_bert_df.reset_index(drop=True)
], axis=1)

In [31]:
final_df.fillna(0, inplace=True)
final_df_test.fillna(0, inplace=True)

In [32]:
print("Successfully combined all features!")
print(f"Shape of the original DataFrame: {df.shape}")
print(f"Shape of the BERT DataFrame: {bert_df.shape}")
print(f"Shape of the final combined DataFrame: {final_df.shape}")

print("\nDisplaying the final model-ready DataFrame:")
display(final_df.head())

Successfully combined all features!
Shape of the original DataFrame: (30000, 30)
Shape of the BERT DataFrame: (30000, 768)
Shape of the final combined DataFrame: (30000, 780)

Displaying the final model-ready DataFrame:


Unnamed: 0,sample_id,price,value,size_from_title,item_name_length,bullet_points_count,is_gluten_free,is_nut_free,is_made_in_usa,is_for_dips,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,158784,12.195,288.0,0.0,49,1,0,0,0,0,...,-0.235074,-0.13143,-0.095607,-0.148491,-0.023553,-0.211931,-0.010769,0.008236,0.067531,-0.115099
1,4095,38.540001,100.0,0.0,64,3,0,0,0,0,...,-0.384154,-0.192729,0.063083,-0.134403,-0.25277,-0.304791,-0.335039,-0.023868,-0.068404,-0.098475
2,172021,17.860001,24.0,12.0,154,1,1,0,0,0,...,-0.088112,-0.15393,0.030235,0.048276,-0.051814,-0.261285,-0.150516,0.036125,0.001843,0.031135
3,268276,2.94,16.0,0.0,65,1,0,0,0,0,...,-0.104384,-0.07574,0.138008,0.09977,-0.082139,-0.137609,-0.227918,0.177854,0.042161,-0.165652
4,154791,25.99,304.8,25.4,61,1,0,0,0,0,...,-0.049485,-0.070778,-0.166384,-0.06913,0.028027,-0.129555,0.014717,0.042802,-0.095301,-0.1241


In [33]:
print(f"Shape of the original DataFrame: {df_test.shape}")
print(f"Shape of the BERT DataFrame: {test_bert_df.shape}")
print(f"Shape of the final combined DataFrame: {final_df_test.shape}")

print("\nDisplaying the final model-ready DataFrame for Test:")
display(final_df_test.head())

Shape of the original DataFrame: (75000, 27)
Shape of the BERT DataFrame: (75000, 768)
Shape of the final combined DataFrame: (75000, 779)

Displaying the final model-ready DataFrame for Test:


Unnamed: 0,sample_id,value,size_from_title,item_name_length,bullet_points_count,is_gluten_free,is_nut_free,is_made_in_usa,is_for_dips,unit_encoded,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,100179,10.5,10.5,137,5,0,0,0,0,23.0,...,-0.182769,-0.102982,0.065086,-0.102934,-0.16965,-0.210989,-0.281351,0.08757,0.011162,-0.024327
1,245611,2.0,2.0,217,3,1,0,0,0,12.0,...,-0.126531,-0.145542,0.082508,-0.124749,-0.108829,-0.345423,-0.028783,-0.029766,-0.08992,-0.051168
2,146263,32.0,0.0,67,1,0,0,0,0,23.0,...,-0.049287,-0.25005,0.165586,-0.107625,-0.075856,-0.188541,-0.09352,0.048513,-0.068668,0.026461
3,95658,2.0,16.0,47,0,0,0,0,0,8.0,...,-0.150086,0.05089,-0.169518,-0.165548,0.05262,0.037278,-0.002595,0.220331,-0.045691,-0.047284
4,36806,32.0,0.0,187,1,0,0,0,0,12.0,...,-0.078324,-0.189521,-0.081445,-0.070213,-0.159391,-0.467907,-0.029442,-0.124541,-0.010308,-0.004068


In [34]:
X_train = final_df.drop(columns=['sample_id', 'price'])
y_train = final_df['price']

X_test = final_df_test.drop(columns=['sample_id'])
X_test = X_test[X_train.columns]

y_train_log = np.log1p(y_train)

In [35]:
# X = final_df.drop(columns=['sample_id', 'price'])
# y = final_df['price']

In [36]:
# from sklearn.model_selection import train_test_split

In [37]:
# X_train, X_val, y_train, y_val = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )
# y_train_log = np.log1p(y_train)
# y_val_log = np.log1p(y_val)

In [38]:
pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [39]:
# import optuna

# # Define the KFold strategy outside the objective function
# NFOLDS = 5
# folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# def objective(trial):
#     """The function Optuna will try to minimize."""

#     # Suggest hyperparameters for Optuna to test in this trial
#     params = {
#         'objective': 'regression_l1',
#         'metric': 'mae',
#         'n_estimators': 5000, # Set high, early stopping will find the best
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
#         'num_leaves': trial.suggest_int('num_leaves', 20, 200),
#         'max_depth': trial.suggest_int('max_depth', 5, 15),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 1.0, log=True),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 1.0, log=True),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#         'n_jobs': -1,
#         'seed': 42
#     }

#     # --- Run K-Fold Cross-Validation for this set of parameters ---
#     fold_scores = []
#     from sklearn.metrics import mean_absolute_error
#     for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train_log)):
#         X_train_fold, y_train_log_fold = X_train.iloc[train_idx], y_train_log.iloc[train_idx]
#         X_valid_fold, y_valid_log_fold = X_train.iloc[valid_idx], y_train_log.iloc[valid_idx]

#         model = lgb.LGBMRegressor(**params)

#         model.fit(
#             X_train_fold, y_train_log_fold,
#             eval_set=[(X_valid_fold, y_valid_log_fold)],
#             eval_metric='mae',
#             callbacks=[
#                 lgb.early_stopping(100, verbose=False), # Keep this to stop training
#                 lgb.log_evaluation(period=200)          # Add this to print the score every 200 rounds
#             ]
#         )

#         preds = model.predict(X_valid_fold)
#         mae_score = mean_absolute_error(y_valid_log_fold, preds)
#         fold_scores.append(mae_score)

#     # Return the average score across folds for this trial
#     return np.mean(fold_scores)

# # --- Run the optimization study ---
# print("Starting hyperparameter optimization with Optuna...")
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50, timeout=600) # Run for 50 different parameter combinations

# print("\nOptimization finished.")
# print("Best trial's score (MAE):", study.best_value)
# print("Best trial's parameters:")
# best_params = study.best_params
# print(best_params)

In [40]:
%pip install optuna



# **Training The Model**

In [42]:
NFOLDS = 2
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

lgbm_sub_preds = np.zeros(X_test.shape[0])
xgb_sub_preds = np.zeros(X_test.shape[0])

lgbm_params = {
    'objective': 'regression_l1',
    'metric': 'mae',
    'n_estimators': 10000,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 40,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
}

xgb_params = {
    'objective': 'reg:absoluteerror',
    'eval_metric': 'mae',
    'eta': 0.01,
    'max_depth': 7,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'alpha': 0.1,
    'lambda': 0.1,
    'nthread': -1,
    'seed': 42,
}


print("Starting Hybrid Model K-Fold Cross-Validation...")
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train_log)):
    print(f"\n--- Fold {n_fold + 1} ---")
    X_train_fold, y_train_log_fold = X_train.iloc[train_idx], y_train_log.iloc[train_idx]
    X_valid_fold, y_valid_log_fold = X_train.iloc[valid_idx], y_train_log.iloc[valid_idx]

    # Train LightGBM Model
    print("Training LightGBM...")
    model_lgb = lgb.LGBMRegressor(**lgbm_params)
    model_lgb.fit(
        X_train_fold, y_train_log_fold,
        eval_set=[(X_valid_fold, y_valid_log_fold)],
        callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(period=500)]
    )
    lgbm_sub_preds += model_lgb.predict(X_test) / folds.n_splits

    # Train XGBoost Model
    print("Training XGBoost...")
    dtrain = xgb.DMatrix(X_train_fold, label=y_train_log_fold)
    dvalid = xgb.DMatrix(X_valid_fold, label=y_valid_log_fold)
    dtest = xgb.DMatrix(X_test)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    model_xgb = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=5000,
        evals=watchlist,
        early_stopping_rounds=100,
        verbose_eval=500
    )
    xgb_sub_preds += model_xgb.predict(dtest, iteration_range=(0, model_xgb.best_iteration + 1)) / NFOLDS

    # Clean up memory
    del model_lgb, model_xgb, dtrain, dvalid, dtest
    gc.collect()

print("\nK-Fold training for both models complete.")

Starting Hybrid Model K-Fold Cross-Validation...

--- Fold 1 ---
Training LightGBM...
[500]	valid_0's l1: 0.625945
[1000]	valid_0's l1: 0.612613
[1500]	valid_0's l1: 0.606759
[2000]	valid_0's l1: 0.603104
[2500]	valid_0's l1: 0.600613
[3000]	valid_0's l1: 0.597161
[3500]	valid_0's l1: 0.594904
[4000]	valid_0's l1: 0.593776
[4500]	valid_0's l1: 0.592966
[5000]	valid_0's l1: 0.591117
[5500]	valid_0's l1: 0.589884
[6000]	valid_0's l1: 0.589115
[6500]	valid_0's l1: 0.588642
[7000]	valid_0's l1: 0.58819
[7500]	valid_0's l1: 0.587915
[8000]	valid_0's l1: 0.587024
[8500]	valid_0's l1: 0.58639
[9000]	valid_0's l1: 0.586177
[9500]	valid_0's l1: 0.585864
[10000]	valid_0's l1: 0.585538
Training XGBoost...
[0]	train-mae:0.76198	valid-mae:0.76694
[500]	train-mae:0.44938	valid-mae:0.62724
[1000]	train-mae:0.38435	valid-mae:0.61666
[1346]	train-mae:0.36442	valid-mae:0.61534

--- Fold 2 ---
Training LightGBM...
[500]	valid_0's l1: 0.620694
[1000]	valid_0's l1: 0.60771
[1500]	valid_0's l1: 0.601842
[20

In [45]:
print("\nBlending predictions...")
blended_log_predictions = (lgbm_sub_preds * 0.5) + (xgb_sub_preds * 0.5)


final_hybrid_predictions = np.expm1(blended_log_predictions)
final_hybrid_predictions[final_hybrid_predictions < 0] = 0

submission_df = pd.DataFrame({
    'sample_id': final_df_test['sample_id'],
    'price': final_hybrid_predictions
})

submission_df.to_csv('submission_hybrid.csv', index=False)

print("\n✅ Hybrid model submission file 'submission_hybrid.csv' created successfully!")
display(submission_df.head())


Blending predictions...

✅ Hybrid model submission file 'submission_hybrid.csv' created successfully!


Unnamed: 0,sample_id,price
0,100179,14.354296
1,245611,15.182956
2,146263,22.822148
3,95658,18.688358
4,36806,25.591068


In [46]:
import joblib
joblib.dump(model_lgb, 'lgbm_model.pkl')
joblib.dump(model_xgb, 'xgb_model.pkl')
print("\n✅ Models saved successfully!")

NameError: name 'model_lgb' is not defined

In [None]:
print("Retraining LightGBM on full training data...")

final_lgbm_params = lgbm_params.copy()

final_model_lgb = lgb.LGBMRegressor(**final_lgbm_params)

# Fit on the full training data (X_train, y_train_log)
final_model_lgb.fit(X_train, y_train_log)

print("Final LightGBM model training complete.")

import joblib
model_lgb_filename = 'final_lgbm_model.pkl'
joblib.dump(final_model_lgb, model_lgb_filename)
print(f"✅ Final LightGBM model saved successfully to {model_lgb_filename}")

Retraining LightGBM on full training data...


In [None]:
# NFOLDS = 5
# folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)
# oof_preds = np.zeros(X_train.shape[0])
# sub_preds = np.zeros(X_test.shape[0])

# best_iterations = []

# lgbm_params = {
#     'objective': 'regression_l1',
#     'metric': 'mae',
#     'n_estimators': 2000,
#     'learning_rate': 0.02,
#     'num_leaves': 31,
#     'n_jobs': -1,
#     'seed': 42
# }

# print("Starting K-Fold Cross-Validation...")
# for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train_log)):
#     X_train_fold, y_train_log_fold = X_train.iloc[train_idx], y_train_log.iloc[train_idx]
#     X_valid_fold, y_valid_log_fold = X_train.iloc[valid_idx], y_train_log.iloc[valid_idx]

#     model = lgb.LGBMRegressor(**lgbm_params)

#     model.fit(
#         X_train_fold, y_train_log_fold,
#         eval_set=[(X_valid_fold, y_valid_log_fold)],
#         eval_metric='mae',
#         callbacks=[lgb.early_stopping(100, verbose=False)]
#     )

#     oof_preds[valid_idx] = model.predict(X_valid_fold)

#     best_iter = model.best_iteration_
#     best_iterations.append(best_iter)

#     print(f"Fold {n_fold+1} finished. Best iteration: {best_iter}")

#     sub_preds += model.predict(X_test) / folds.n_splits


#     del model, X_train_fold, y_train_log_fold, X_valid_fold, y_valid_log_fold
#     gc.collect()

# optimal_estimators = int(np.mean(best_iterations))
# print(f"\nOptimal number of estimators found: {optimal_estimators}")

In [None]:
# # --- Re-train the model on the ENTIRE training set ---
# print("\nRe-training final model on all data...")

# # Update parameters with the optimal n_estimators
# final_lgbm_params = lgbm_params.copy()
# final_lgbm_params['n_estimators'] = optimal_estimators

# final_model = lgb.LGBMRegressor(**final_lgbm_params)

# # Fit on the full training data (X_train, y_train_log)
# final_model.fit(X_train, y_train_log)

# print("Final model training complete.")


# # Make final predictions
# print("Making predictions on the test set...")
# log_predictions = final_model.predict(X_test)
# final_predictions = np.expm1(log_predictions)
# final_predictions[final_predictions < 0] = 0


# # Create submission file
# submission_df = pd.DataFrame({
#     'sample_id': df_test['sample_id'],
#     'price': final_predictions
# })
# submission_df.to_csv('submission_cv.csv', index=False)

# print("\nSubmission file 'submission_cv.csv' created successfully!")
# display(submission_df.head())

# # Save the final model
# import joblib
# model_filename = 'final_lgbm_model.pkl'
# joblib.dump(final_model, model_filename)
# print(f"Final model saved successfully to {model_filename}")

In [None]:
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / (denominator + 1e-8)) * 100

In [None]:
def objective(trial):
    """The function Optuna will try to minimize."""

    # Define the KFold strategy for this trial
    NFOLDS = 5
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

    # Suggest hyperparameters for Optuna to test in this trial
    params = {
        'objective': 'regression_l1',
        'metric': 'mae',
        'n_estimators': 5000, # Set high, early stopping will find the best value
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 1.0, log=True),  # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 1.0, log=True), # L2 regularization
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0), # Feature fraction
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),          # Bagging fraction
        'n_jobs': -1,
        'seed': 42,
        'boosting_type': 'gbdt',
    }

    fold_mae_scores = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y_log)):
        X_train_fold, y_train_log_fold = X.iloc[train_idx], y_log.iloc[train_idx]
        X_valid_fold, y_valid_log_fold = X.iloc[valid_idx], y_log.iloc[valid_idx]

        model = lgb.LGBMRegressor(**params)

        model.fit(
            X_train_fold, y_train_log_fold,
            eval_set=[(X_valid_fold, y_valid_log_fold)],
            eval_metric='mae',
            callbacks=[lgb.early_stopping(100, verbose=True)]
        )

        preds = model.predict(X_valid_fold)
        mae_score = mean_absolute_error(y_valid_log_fold, preds)
        fold_mae_scores.append(mae_score)

        del model, X_train_fold, y_train_log_fold, X_valid_fold, y_valid_log_fold
        gc.collect()

    average_mae = np.mean(fold_mae_scores)
    return average_mae

In [None]:
study = optuna.create_study(direction='minimize')

study.optimize(objective, n_trials=50, timeout = 3600)

# After the study is complete, print the best results
print("\n✅ Optimization finished.")
print("Best trial's score (MAE):", study.best_value)
print("Best trial's parameters:")
best_params = study.best_params
print(best_params)

In [None]:
pip install -U xgboost

In [None]:
import xgboost as xgb
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import gc

if X_train is not None and y_train_log is not None and X_test is not None:
    NFOLDS = 5
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)
    oof_preds_xgb = np.zeros(X_train.shape[0])
    sub_preds_xgb = np.zeros(X_test.shape[0])

    xgb_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'learning_rate': 0.02,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': 42,
        'nthread': -1
    }

    print("Starting K-Fold Cross-Validation for XGBoost...")
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train_log)):
        X_train_fold, y_train_log_fold = X_train.iloc[train_idx], y_train_log.iloc[train_idx]
        X_valid_fold, y_valid_log_fold = X_train.iloc[valid_idx], y_train_log.iloc[valid_idx]

        # Convert to DMatrix
        dtrain = xgb.DMatrix(X_train_fold, label=y_train_log_fold)
        dvalid = xgb.DMatrix(X_valid_fold, label=y_valid_log_fold)
        dtest = xgb.DMatrix(X_test)

        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        model_xgb = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=2000,
            evals=watchlist,
            early_stopping_rounds=100,
            verbose_eval=100
        )

        oof_preds_xgb[valid_idx] = model_xgb.predict(dvalid, iteration_range=(0, model_xgb.best_iteration + 1))

        print(f"Fold {n_fold + 1} finished. Best iteration: {model_xgb.best_iteration}")

        sub_preds_xgb += model_xgb.predict(dtest, iteration_range=(0, model_xgb.best_iteration + 1)) / NFOLDS

        del model_xgb, dtrain, dvalid, X_train_fold, y_train_log_fold, X_valid_fold, y_valid_log_fold
        gc.collect()

    print("\nXGBoost K-Fold Cross-Validation complete.")

    # Final Predictions
    final_predictions_xgb = np.expm1(sub_preds_xgb)
    final_predictions_xgb[final_predictions_xgb < 0] = 0

    if df_test is not None:
        submission_df_xgb = pd.DataFrame({
            'sample_id': df_test['sample_id'],
            'price': final_predictions_xgb
        })
        submission_df_xgb.to_csv('submission_xgb_cv.csv', index=False)
        print("\n✅ XGBoost Submission file 'submission_xgb_cv.csv' created successfully!")
        display(submission_df_xgb.head())
    else:
        print("\nSkipping XGBoost submission file creation: Test DataFrame not available.")
else:
    print("Skipping XGBoost model training and prediction: Data not prepared correctly.")


# **Deep Learning Model**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

# Define a more complex Sequential model
model_dl_complex = Sequential([
    Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='relu')  # Output layer for regression
])

# Compile the model with a potentially lower learning rate
optimizer = Adam(learning_rate=0.001)
model_dl_complex.compile(optimizer=optimizer,
                         loss='mse', # Using Mean Squared Error for regression
                         metrics=['mae']) # Using Mean Absolute Error as a metric

print("More complex Deep Learning Model defined and compiled successfully!")
model_dl_complex.summary()

In [None]:
# Train the complex deep learning model
history = model_dl_complex.fit(
    X_train, y_train_log,
    epochs=50,  # You might need to adjust the number of epochs
    batch_size=32, # You might need to adjust the batch size
    validation_split=0.2, # Using a validation split for monitoring
    verbose=1 # Set to 0 for silent training
)

print("Deep Learning Model training complete.")

In [None]:
# Make predictions on the test set
print("Making predictions on the test set using the Deep Learning model...")
log_predictions_dl = model_dl_complex.predict(X_test)
final_predictions_dl = np.expm1(log_predictions_dl).flatten()

final_predictions_dl[final_predictions_dl < 0] = 0

if df_test is not None:
    submission_df_dl = pd.DataFrame({
        'sample_id': df_test['sample_id'],
        'price': final_predictions_dl
    })
    submission_df_dl.to_csv('submission_dl.csv', index=False)
    print("\nDeep Learning Submission file 'submission_dl.csv' created successfully!")
    display(submission_df_dl.head())
else:
    print("\nSkipping Deep Learning submission file creation: Test DataFrame not available.")