In [1]:
import os
import pandas as pd
import requests
from PIL import Image , UnidentifiedImageError
from io import BytesIO
from transformers import pipeline
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split , RandomizedSearchCV
from sklearn.metrics import make_scorer
from tqdm import tqdm
from utils import download_images



In [12]:
def load_image_from_file(image_link, base_folder):
    try:
        filename = os.path.basename(image_link)
        filepath = os.path.join(base_folder, filename)
        img = Image.open(filepath)
        img.load() 
        
        return img.convert("RGB")
    except (FileNotFoundError, OSError, UnidentifiedImageError) as e:
        return None

In [13]:
def engineer_features(df):

    df_featured = df.copy()
    content_lower = df_featured['catalog_content'].str.lower()

    #Making different feature columns that may affect pricing
    df_featured['pack_of'] = content_lower.str.extract(r'(?:pack of|includes|pack of:|pack)\s*(\d+)').astype(float)
    df_featured['per_case'] = content_lower.str.extract(r'(\d+)\s*per case').astype(float)
    df_featured['count'] = content_lower.str.extract(r'(\d+)\s*(?:count|ct|boxes)').astype(float)
    df_featured['weight_oz'] = content_lower.str.extract(r'(\d+\.?\d*)\s*(?:oz|ounce)').astype(float)
    df_featured['weight_lb'] = content_lower.str.extract(r'(\d+\.?\d*)\s*(?:lb|pound)').astype(float)
    df_featured['quantity'] = df_featured[['pack_of', 'per_case', 'count']].bfill(axis=1).iloc[:, 0]
    
    #Identifying Brands
    brands = df_featured['catalog_content'].str.extract(r'Item Name:\s*([A-Z][a-zA-Z\'-]+(?:\s[A-Z][a-zA-Z\'-]+){0,2})')
    df_featured['brand'] = brands[0].str.strip()

    #Other Keywords that may affect pricing
    keywords = ['organic', 'gourmet', 'sugar free', 'gluten free', 'nut-free', 'made in usa', 'original', 'kosher']
    for keyword in keywords:
        df_featured[f'is_{keyword.replace(" ", "_")}'] = content_lower.str.contains(keyword, regex=False).astype(int)
    
    df_featured['weight_oz'] = df_featured['weight_oz'].fillna(0) + df_featured['weight_lb'].fillna(0) * 16
    num_features_to_fill = ['quantity', 'weight_oz']
    df_featured[num_features_to_fill] = df_featured[num_features_to_fill].fillna(0)
    
    df_featured['brand'] = df_featured['brand'].fillna('Unknown')
    df_featured['brand_encoded'] = LabelEncoder().fit_transform(df_featured['brand'])
    
    return df_featured.drop(columns=['weight_lb','pack_of', 'per_case', 'count'])

In [14]:
def process_data_in_batches(df, batch_size, download_folder, text_pipeline, image_pipeline):

    all_text_embeddings = []
    all_image_embeddings = []
    all_engineered_features = []
    all_labels = []
    
    feature_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64', 'int32'] and col not in ['sample_id', 'price']]
    
    
    num_batches = (len(df) // batch_size) + 1

    
    for i in tqdm(range(0, len(df), batch_size), desc="Processing training batches", total=num_batches):
        batch_df = df.iloc[i:i + batch_size]
        
        batch_images = [load_image_from_file(url, download_folder) for url in batch_df['image_link']]
        
        valid_indices = [idx for idx, img in enumerate(batch_images) if img is not None]
        if not valid_indices:
            continue

        clean_batch_df = batch_df.iloc[valid_indices]
        clean_batch_images = [batch_images[idx] for idx in valid_indices]
        
        # Extract embeddings
        text_output = text_pipeline(clean_batch_df['catalog_content'].tolist(), truncation=True, max_length=512)
        all_text_embeddings.extend([res[0][0] for res in text_output])
        
        
        image_output = image_pipeline(clean_batch_images)
        all_image_embeddings.extend([res[0][0] for res in image_output])

        
        all_engineered_features.extend(clean_batch_df[feature_cols].values)
        all_labels.extend(clean_batch_df['price'].values)

    return np.array(all_text_embeddings), np.array(all_image_embeddings), np.array(all_engineered_features), np.array(all_labels)


In [49]:
def process_test_data_in_batches(df, batch_size, download_folder, text_pipeline, image_pipeline):

    
    all_text_embeddings = []
    all_image_embeddings = []
    all_engineered_features = []
    original_indices = [] # Keep track of which rows were successfully processed
    
    # Identify the engineered feature columns to use
    feature_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64', 'int32'] and col != 'sample_id']
    
    num_batches = (len(df) // batch_size) + 1
    for i in tqdm(range(0, len(df), batch_size), desc="Processing test batches", total=num_batches):
        batch_df = df.iloc[i:i + batch_size]
        
        # Load images for the current batch
        batch_images = [load_image_from_file(url, download_folder) for url in batch_df['image_link']]
        
        # Clean data and get local indices of valid images in the batch
        valid_local_indices = [idx for idx, img in enumerate(batch_images) if img is not None]
        
        if not valid_local_indices:
            continue # Skip batch if no images could be loaded
            
        # Get the original DataFrame indices of the valid rows
        clean_batch_df = batch_df.iloc[valid_local_indices]
        original_indices.extend(clean_batch_df.index)
        
        # Prepare clean data for the pipelines
        clean_batch_images = [batch_images[idx] for idx in valid_local_indices]
        clean_batch_text = clean_batch_df['catalog_content'].tolist()
        
        # Extract features for the batch
        text_output = text_pipeline(clean_batch_text, truncation=True, max_length=512)
        all_text_embeddings.extend([res[0][0] for res in text_output])
        
        image_output = image_pipeline(clean_batch_images)
        all_image_embeddings.extend([res[0][0] for res in image_output])
        
        # Append the corresponding engineered features
        all_engineered_features.extend(clean_batch_df[feature_cols].values)
        
    return (
        np.array(all_text_embeddings), 
        np.array(all_image_embeddings), 
        np.array(all_engineered_features), 
        original_indices
    )

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.head(200)

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.890
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.120
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.970
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.340
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.490
...,...,...,...,...
195,265271,Item Name: Gourmet Milk Chocolate Covered Almo...,https://m.media-amazon.com/images/I/61y2OL95+T...,21.990
196,255173,Item Name: Realemon Juice 4 Case 1 Gallon\nVal...,https://m.media-amazon.com/images/I/51yYpfPIUg...,88.090
197,165947,"Item Name: Jelly Belly, Gourmet Jelly Beans, 3...",https://m.media-amazon.com/images/I/41tIYtq4p+...,5.490
198,185249,Item Name: Swedish Lingonberry Preserves by Fe...,https://m.media-amazon.com/images/I/51uxP6z295...,7.290


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sample_id        75000 non-null  int64  
 1   catalog_content  75000 non-null  object 
 2   image_link       75000 non-null  object 
 3   price            75000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.3+ MB


In [7]:
text_content = df['catalog_content']
image_url = df['image_link']
image_url_list = df['image_link'].tolist()
print(text_content)
print(image_url)

0        Item Name: La Victoria Green Taco Sauce Mild, ...
1        Item Name: Salerno Cookies, The Original Butte...
2        Item Name: Bear Creek Hearty Soup Bowl, Creamy...
3        Item Name: Judee’s Blue Cheese Powder 11.25 oz...
4        Item Name: kedem Sherry Cooking Wine, 12.7 Oun...
                               ...                        
74995    Item Name: ICE BREAKERS Spearmint Sugar Free M...
74996    Item Name: Davidson's Organics, Vanilla Essenc...
74997    Item Name: Jolly Rancher Hard Candy - Blue Ras...
74998    Item Name: Nescafe Dolce Gusto Capsules - CARA...
74999    Item Name: Pimenton de la Vera - Picante (2.47...
Name: catalog_content, Length: 75000, dtype: object
0        https://m.media-amazon.com/images/I/51mo8htwTH...
1        https://m.media-amazon.com/images/I/71YtriIHAA...
2        https://m.media-amazon.com/images/I/51+PFEe-w-...
3        https://m.media-amazon.com/images/I/41mu0HAToD...
4        https://m.media-amazon.com/images/I/41sA037+Qv...
    

In [15]:
df_featured= engineer_features(df)
df_featured.head(200)

Unnamed: 0,sample_id,catalog_content,image_link,price,weight_oz,quantity,brand,is_organic,is_gourmet,is_sugar_free,is_gluten_free,is_nut-free,is_made_in_usa,is_original,is_kosher,brand_encoded
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.890,12.00,6.0,La Victoria Green,0,0,0,0,0,0,0,0,20127
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.120,8.00,4.0,Salerno Cookies,0,0,0,0,0,0,1,0,32169
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.970,1.90,6.0,Bear Creek Hearty,0,0,0,0,0,0,0,0,2795
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.340,11.25,0.0,Judee,0,0,0,0,1,1,0,0,17931
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.490,12.70,12.0,Unknown,0,0,0,0,0,0,0,0,38338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,265271,Item Name: Gourmet Milk Chocolate Covered Almo...,https://m.media-amazon.com/images/I/61y2OL95+T...,21.990,16.00,0.0,Gourmet Milk Chocolate,1,1,0,0,0,0,0,1,14029
196,255173,Item Name: Realemon Juice 4 Case 1 Gallon\nVal...,https://m.media-amazon.com/images/I/51yYpfPIUg...,88.090,0.00,0.0,Realemon Juice,0,0,0,0,0,0,0,0,30535
197,165947,"Item Name: Jelly Belly, Gourmet Jelly Beans, 3...",https://m.media-amazon.com/images/I/41tIYtq4p+...,5.490,7.00,4.0,Jelly Belly,0,1,0,1,0,0,0,0,17569
198,185249,Item Name: Swedish Lingonberry Preserves by Fe...,https://m.media-amazon.com/images/I/51uxP6z295...,7.290,14.50,2.0,Swedish Lingonberry Preserves,0,0,0,0,0,0,0,0,35623


In [8]:
DOWNLOAD_FOLDER = 'product_images'
BATCH_SIZE = 100

In [9]:
df_test = pd.read_csv("test.csv")

In [10]:
text_content_test = df_test['catalog_content']
image_url_test = df_test['image_link']
image_url_list_test = df_test['image_link'].tolist()
print(text_content_test)
print(image_url_test)

0        Item Name: Rani 14-Spice Eshamaya's Mango Chut...
1        Item Name: Natural MILK TEA Flavoring extract ...
2        Item Name: Honey Filled Hard Candy - Bulk Pack...
3        Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...
4        Item Name: McCormick Culinary Vanilla Extract,...
                               ...                        
74995    Item Name: Good Seasons Zezty Italian Salad Dr...
74996    Item Name: Colombina Swirled Love Tiger Pops, ...
74997    Item Name: Kerns, Guava Nectar, 11.5 Fl Oz Can...
74998    Item Name: NY SPICE SHOP Licorice Candy - 1 Po...
74999    Item Name: Rumford Baking Powder, 10-Ounces (P...
Name: catalog_content, Length: 75000, dtype: object
0        https://m.media-amazon.com/images/I/71hoAn78AW...
1        https://m.media-amazon.com/images/I/61ex8NHCIj...
2        https://m.media-amazon.com/images/I/61KCM61J8e...
3        https://m.media-amazon.com/images/I/51Ex6uOH7y...
4        https://m.media-amazon.com/images/I/71QYlrOMoS...
    

In [11]:
DOWNLOAD_FOLDER_TEST = 'product_images_test'

In [16]:
df_test_featured = engineer_features(df_test)
df_test_featured.head(200)

Unnamed: 0,sample_id,catalog_content,image_link,weight_oz,quantity,brand,is_organic,is_gourmet,is_sugar_free,is_gluten_free,is_nut-free,is_made_in_usa,is_original,is_kosher,brand_encoded
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...,10.50,0.0,Rani,0,1,0,1,0,0,0,0,30073
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...,2.00,1.0,Natural MILK TEA,0,1,0,1,0,0,1,0,24853
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...,32.00,2.0,Honey Filled Hard,0,0,0,0,0,0,0,0,16147
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...,16.00,2.0,Vlasic Snack'mm's Kosher,0,0,0,0,0,0,0,1,38812
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...,0.00,0.0,McCormick Culinary Vanilla,0,0,0,1,0,0,0,1,22818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,295868,Item Name: Williams Country Gravy Flavored wit...,https://m.media-amazon.com/images/I/71Vu3plBWw...,2.50,0.0,Williams Country Gravy,0,0,0,0,0,0,0,0,39692
196,212510,Item Name: Generic 8. Trac.k Foods Blonde Chic...,https://m.media-amazon.com/images/I/81GXfclDxm...,15.00,12.0,Generic,0,0,0,0,0,0,0,0,13029
197,232238,"Item Name: Trolli Sour Brite Crawlers, Very Be...",https://m.media-amazon.com/images/I/91k2jgitIO...,28.80,2.0,Trolli Sour Brite,0,0,0,0,0,0,0,0,37680
198,269310,"Item Name: Purina Pro Plan Wet Dog Food, Adult...",https://m.media-amazon.com/images/I/51WhCfBPvs...,13.00,12.0,Purina Pro Plan,0,0,0,0,0,0,0,0,29627


In [None]:
download_images(image_url_list, DOWNLOAD_FOLDER)
print(f"✅ All images have been downloaded to the '{DOWNLOAD_FOLDER}' folder.")

In [None]:
download_images(image_url_list_test, DOWNLOAD_FOLDER_TEST)
print(f"✅ All images have been downloaded to the '{DOWNLOAD_FOLDER_TEST}' folder.")

In [34]:
text_feature_extractor = pipeline(
    'feature-extraction',
    model='sentence-transformers/all-distilroberta-v1',
    device=0)


config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


In [35]:
image_feature_extractor = pipeline(
    'image-feature-extraction',
    model='google/vit-base-patch16-224',
    device=0,
    use_fast=True)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Device set to use cuda:0


In [36]:
X_text, X_images, X_engineered, y = process_data_in_batches(
    df_featured,
    BATCH_SIZE,
    DOWNLOAD_FOLDER,
    text_feature_extractor,
    image_feature_extractor
)

Processing training batches:   1%|▏         | 10/751 [01:11<1:26:33,  7.01s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing training batches: 100%|█████████▉| 750/751 [1:41:27<00:08,  8.12s/it]


In [50]:
np.save('text_embeddings.npy', X_text)
np.save('image_embeddings.npy', X_images)
np.save('feature_embeddings.npy',X_engineered)
np.save('labels.npy', y)

In [37]:
np.save('text_embeddings_2.npy', X_text)
np.save('image_embeddings_2.npy', X_images)
np.save('feature_embeddings_2.npy',X_engineered)
np.save('labels_2.npy', y)

In [26]:
X_text=np.load('text_embeddings.npy')
X_images=np.load('image_embeddings.npy')
X_engineered=np.load('feature_embeddings.npy')
y=np.load('labels.npy')

In [38]:
y_log = np.log1p(y)

In [39]:
X_train_text, X_test_text, X_train_images, X_test_images, X_train_eng, X_test_eng, y_train_log, y_test_log = train_test_split(
    X_text,
    X_images,
    X_engineered,
    y_log, 
    test_size=0.2, 
    random_state=42
)


In [40]:
scaler = StandardScaler()

In [41]:
X_train_eng_scaled = scaler.fit_transform(X_train_eng)
X_test_eng_scaled = scaler.transform(X_test_eng)

In [42]:
X_train_text_eng = np.concatenate((X_train_text, X_train_eng_scaled), axis=1)
X_test_text_eng = np.concatenate((X_test_text, X_test_eng_scaled), axis=1)

X_train_image_eng = np.concatenate((X_train_images, X_train_eng_scaled), axis=1)
X_test_image_eng = np.concatenate((X_test_images, X_test_eng_scaled), axis=1)


In [69]:
model_text = xgb.XGBRegressor(
    subsample=0.7,
    n_estimators=2000, 
    max_depth=7,
    learning_rate=0.02,
    colsample_bytree=0.7,
    n_jobs=-1, 
    random_state=42,
    early_stopping_rounds=50,
    device='gpu'
)

print("Training Text + Engineered Features Model:")
model_text.fit(
    X_train_text_eng,
    y_train_log,
    eval_set=[(X_test_text_eng, y_test_log)],
    verbose=True 
)


model_image = xgb.XGBRegressor(
    subsample=0.7,
    n_estimators=2000,
    max_depth=7,
    learning_rate=0.02,
    colsample_bytree=0.7,
    n_jobs=-1, 
    random_state=123, 
    early_stopping_rounds=50,
    device='gpu'
)

print("Training Image + Engineered Features Model:")
model_image.fit(
    X_train_image_eng,
    y_train_log,
    eval_set=[(X_test_image_eng, y_test_log)],
    verbose=True
)


preds_text = model_text.predict(X_test_text_eng)
preds_image = model_image.predict(X_test_image_eng)

stacked_predictions = np.column_stack((preds_text, preds_image))


meta_model = Ridge(alpha=0.5)
print("Training Meta-Model:")
meta_model.fit(stacked_predictions, y_test_log)


train_preds_text = model_text.predict(X_train_text_eng)
train_preds_image = model_image.predict(X_train_image_eng)
stacked_train_preds = np.column_stack((train_preds_text, train_preds_image))

meta_model_final = Ridge(alpha=0.5)
meta_model_final.fit(
    np.vstack((stacked_train_preds, stacked_predictions)),
    np.hstack((y_train_log, y_test_log))
)

final_ensemble_preds = meta_model.predict(stacked_predictions)
final_score = r2_score(y_test_log, final_ensemble_preds)

print(f"Ensemble R^2 score: {final_score:.4f}")

Training Text + Engineered Features Model:
[0]	validation_0-rmse:0.95166
[1]	validation_0-rmse:0.94802
[2]	validation_0-rmse:0.94567
[3]	validation_0-rmse:0.94242
[4]	validation_0-rmse:0.93910
[5]	validation_0-rmse:0.93616
[6]	validation_0-rmse:0.93389
[7]	validation_0-rmse:0.93164
[8]	validation_0-rmse:0.92845
[9]	validation_0-rmse:0.92551
[10]	validation_0-rmse:0.92280
[11]	validation_0-rmse:0.92004
[12]	validation_0-rmse:0.91743
[13]	validation_0-rmse:0.91469
[14]	validation_0-rmse:0.91209
[15]	validation_0-rmse:0.90948
[16]	validation_0-rmse:0.90709
[17]	validation_0-rmse:0.90486
[18]	validation_0-rmse:0.90261
[19]	validation_0-rmse:0.90028
[20]	validation_0-rmse:0.89794
[21]	validation_0-rmse:0.89566
[22]	validation_0-rmse:0.89336
[23]	validation_0-rmse:0.89172
[24]	validation_0-rmse:0.88960
[25]	validation_0-rmse:0.88728
[26]	validation_0-rmse:0.88515
[27]	validation_0-rmse:0.88324
[28]	validation_0-rmse:0.88175
[29]	validation_0-rmse:0.88024
[30]	validation_0-rmse:0.87841
[31]	v

**Testing the Model**

In [50]:
X_text_test, X_images_test, X_engineered_test, valid_indices = process_test_data_in_batches(
    df_test_featured,
    BATCH_SIZE,
    DOWNLOAD_FOLDER_TEST,
    text_feature_extractor,
    image_feature_extractor
)

Processing test batches: 100%|█████████▉| 750/751 [2:33:23<00:12, 12.27s/it]  


In [51]:
np.save('text_embeddings_test_2.npy', X_text_test)
np.save('image_embeddings_test_2.npy', X_images_test)
np.save('feature_embeddings_test_2.npy',X_engineered_test)
np.save('valid_indices_2.npy', valid_indices)

In [68]:
X_text_test=np.load('text_embeddings_test_2.npy')
X_images_test=np.load('image_embeddings_test_2.npy')
X_engineered_test=np.load('feature_embeddings_test_2.npy')
valid_indices=np.load('valid_indices_2.npy')

In [62]:
X_engineered_test_scaled = scaler.transform(X_engineered_test)

In [63]:
X_test_text_eng_final = np.concatenate((X_text_test, X_engineered_test_scaled), axis=1)
X_test_image_eng_final = np.concatenate((X_images_test, X_engineered_test_scaled), axis=1)

In [64]:
log_preds_text_final = model_text.predict(X_test_text_eng_final)
log_preds_image_final = model_image.predict(X_test_image_eng_final)

In [65]:
stacked_preds_final = np.column_stack((log_preds_text_final, log_preds_image_final))
final_log_predictions = meta_model_final.predict(stacked_preds_final)

In [66]:
final_predictions = np.expm1(final_log_predictions)
final_predictions = np.maximum(0, final_predictions)

In [67]:
submission_df = pd.DataFrame({
    'sample_id': df_test.loc[valid_indices, 'sample_id'],
    'price': final_predictions
})

# Handle any samples that were dropped due to image loading errors
if len(submission_df) != len(df_test):
    full_submission_df = pd.DataFrame({'sample_id': df_test['sample_id']})
    full_submission_df = full_submission_df.merge(submission_df, on='sample_id', how='left')
    mean_price = df['price'].mean() # Use mean from original training data
    full_submission_df['price'].fillna(mean_price, inplace=True)
    submission_df = full_submission_df

submission_df.to_csv('submission.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full_submission_df['price'].fillna(mean_price, inplace=True)
  full_submission_df['price'].fillna(mean_price, inplace=True)


**Sample Testing**

In [53]:
df_sample = pd.read_csv("sample_test.csv")

In [54]:
text_content_sample = df_sample['catalog_content']
image_url_sample = df_sample['image_link']
image_url_list_sample = df_sample['image_link'].tolist()
print(text_content_sample)
print(image_url_sample)

0     Item Name: Gift Basket Village Gourmet Meat an...
1     Item Name: NPG Dried Lotus Seeds 16 Oz, Uncook...
2     Item Name: Annies Homegrown Macaroni and Chees...
3     Item Name: Bear Creek Country Kitchens Creamy ...
4     Item Name: Japanese Kelp Kombu Umami Soup Stoc...
                            ...                        
95    Item Name: Desert Essence Coconut Body Wash - ...
96    Item Name: Pacific Merchants Acaciaware 10- by...
97    Item Name: Near East Quinoa Rosemary Olive Oil...
98    Item Name: Foods Alive | Organic Zesty Italian...
99    Item Name: Jack Link's Classics Meat Sticks, O...
Name: catalog_content, Length: 100, dtype: object
0     https://m.media-amazon.com/images/I/91GB1wC6Ob...
1     https://m.media-amazon.com/images/I/81VnzF1vkv...
2     https://m.media-amazon.com/images/I/51aCDMHMnI...
3     https://m.media-amazon.com/images/I/71dzRyLGPi...
4     https://m.media-amazon.com/images/I/71Yu21cGwr...
                            ...                       

In [55]:
DOWNLOAD_FOLDER_SAMPLE = 'product_images_sample'

In [452]:
download_images(image_url_list_sample, DOWNLOAD_FOLDER_SAMPLE)
print(f"✅ All images have been downloaded to the '{DOWNLOAD_FOLDER_SAMPLE}' folder.")

100%|██████████| 100/100 [00:15<00:00,  6.43it/s]


✅ All images have been downloaded to the 'product_images_sample' folder.


In [56]:
df_sample_featured = engineer_features(df_sample)
df_sample_featured.head(200)

Unnamed: 0,sample_id,catalog_content,image_link,weight_oz,quantity,brand,is_organic,is_gourmet,is_sugar_free,is_gluten_free,is_nut-free,is_made_in_usa,is_original,is_kosher,brand_encoded
0,217392,Item Name: Gift Basket Village Gourmet Meat an...,https://m.media-amazon.com/images/I/91GB1wC6Ob...,7.0,0.0,Gift Basket Village,0,1,0,0,0,0,0,0,32
1,209156,"Item Name: NPG Dried Lotus Seeds 16 Oz, Uncook...",https://m.media-amazon.com/images/I/81VnzF1vkv...,16.0,0.0,NPG Dried Lotus,0,0,0,0,0,0,0,0,60
2,262333,Item Name: Annies Homegrown Macaroni and Chees...,https://m.media-amazon.com/images/I/51aCDMHMnI...,6.0,0.0,Annies Homegrown Macaroni,1,0,0,0,0,0,0,0,1
3,295979,Item Name: Bear Creek Country Kitchens Creamy ...,https://m.media-amazon.com/images/I/71dzRyLGPi...,10.1,0.0,Bear Creek Country,0,0,0,0,0,0,0,0,4
4,50604,Item Name: Japanese Kelp Kombu Umami Soup Stoc...,https://m.media-amazon.com/images/I/71Yu21cGwr...,14.0,10.0,Japanese Kelp Kombu,0,0,0,0,0,0,0,0,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,289489,Item Name: Desert Essence Coconut Body Wash - ...,https://m.media-amazon.com/images/I/610gc5aZ-y...,8.0,0.0,Desert Essence Coconut,1,0,0,0,0,0,0,0,20
96,297033,Item Name: Pacific Merchants Acaciaware 10- by...,https://m.media-amazon.com/images/I/81dLFRyIBB...,0.0,0.0,Pacific Merchants Acaciaware,0,0,0,0,0,0,0,0,65
97,79518,Item Name: Near East Quinoa Rosemary Olive Oil...,https://m.media-amazon.com/images/I/61VqmUxzK4...,0.0,0.0,Near East Quinoa,0,0,0,0,0,0,0,0,61
98,262905,Item Name: Foods Alive | Organic Zesty Italian...,https://m.media-amazon.com/images/I/71a7Yoov2Z...,16.0,0.0,Foods Alive,1,0,0,0,0,0,0,0,30


In [454]:
X_text_sample, X_images_sample, X_engineered_sample, valid_indices_sample = process_test_data_in_batches(
    df_sample_featured,
    BATCH_SIZE,
    DOWNLOAD_FOLDER_SAMPLE,
    text_feature_extractor,
    image_feature_extractor
)

Processing test batches:  50%|█████     | 1/2 [00:10<00:10, 10.48s/it]


In [455]:
np.save('text_embeddings_sample.npy', X_text_sample)
np.save('image_embeddings_sample.npy', X_images_sample)
np.save('feature_embeddings_sample.npy',X_engineered_sample)
np.save('valid_indices_sample.npy', valid_indices_sample)

In [57]:
X_text_sample=np.load('text_embeddings_sample.npy')
X_images_sample=np.load('image_embeddings_sample.npy')
X_engineered_sample=np.load('feature_embeddings_sample.npy')
valid_indices_sample=np.load('valid_indices_sample.npy')

In [58]:
X_engineered_sample_scaled = scaler.transform(X_engineered_sample)

In [59]:
X_combined_sample = np.concatenate((X_text_sample, X_images_sample, X_engineered_sample_scaled), axis=1)

In [60]:
log_predictions_sample = model.predict(X_combined_sample)

In [61]:
final_predictions_sample = np.expm1(log_predictions_sample)

In [62]:
final_predictions_sample = np.maximum(0, final_predictions_sample)

In [56]:
submission_df_sample = pd.DataFrame({
    'sample_id': df_sample.loc[valid_indices_sample, 'sample_id'],
    'price': final_predictions_sample
})

In [57]:
if len(submission_df_sample) != len(df_sample):
    full_submission_df_sample = pd.DataFrame({'sample_id': df_sample['sample_id']})
    full_submission_df_sample = full_submission_df_sample.merge(submission_df_sample, on='sample_id', how='left')
    # Fill any missing predictions (from failed image loads) with the mean price
    mean_price_sample = full_submission_df_sample['price'].mean()
    full_submission_df_sample['price'].fillna(mean_price_sample, inplace=True)
    submission_df_sample = full_submission_df_sample

In [58]:
submission_df_sample.to_csv('submission_sample.csv', index=False)