<a href="https://colab.research.google.com/github/ShaiikHaider/PricePredictionML/blob/main/ProductPricePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ====================================================
# SMART PRODUCT PRICING CHALLENGE - PIPELINE WITH IMAGE FEATURE CACHING
# ====================================================

# Mount Google Drive at /content/drive; DATASET_FOLDER below points inside
# this mount, so the CSVs cannot be read until this has run.
from google.colab import drive
drive.mount('/content/drive')

import os, zipfile, urllib, re
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
from functools import partial
import multiprocessing
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input

# ----------------------------
# PATHS
# ----------------------------
# The train/test CSVs are read from this folder on the mounted Drive.
DATASET_FOLDER = '/content/drive/MyDrive/student_resource/dataset'
TRAIN_CSV = os.path.join(DATASET_FOLDER, 'train.csv')
TEST_CSV  = os.path.join(DATASET_FOLDER, 'test.csv')
# Relative path: downloaded images are stored under the current working
# directory, not under DATASET_FOLDER.
IMAGE_FOLDER = 'downloaded_product_images'
os.makedirs(IMAGE_FOLDER, exist_ok=True)
os.makedirs("submission_files", exist_ok=True)

# Cache files for image features
# NOTE(review): these are relative paths as well, so the cache presumably does
# not survive a runtime reset - confirm whether it should live on Drive.
TRAIN_IMG_FEATURES_FILE = 'train_img_features.npy'
TEST_IMG_FEATURES_FILE  = 'test_img_features.npy'

# ----------------------------
# LOAD DATA
# ----------------------------
train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# ----------------------------
# OUTLIER REMOVAL
# ----------------------------
# Keep only rows whose price is at or below the upper Tukey fence
# (Q3 + 1.5 * IQR). Rows with a missing price are dropped as well, because
# a NaN comparison evaluates to False.
Q1 = train['price'].quantile(0.25)
Q3 = train['price'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR
# .copy() detaches the filtered frame from the original so that the column
# assignments made later on `train` do not raise SettingWithCopyWarning.
train = train[train['price'] <= upper_bound].copy()

# ----------------------------
# TEXT CLEANING
# ----------------------------
def clean_text(text):
    """Normalise raw catalog text before vectorisation.

    The value is coerced to ``str`` and lower-cased, every character that is
    neither alphanumeric nor whitespace is replaced by a space, and runs of
    whitespace are collapsed to single spaces (leading/trailing removed).
    """
    lowered = str(text).lower()
    pieces = []
    for ch in lowered:
        keep = ch.isalnum() or ch.isspace()
        pieces.append(ch if keep else ' ')
    tokens = ''.join(pieces).split()
    return ' '.join(tokens)

# Add the normalised-text column to both frames (train first, then test).
for _frame in (train, test):
    _frame['clean_text'] = _frame['catalog_content'].apply(clean_text)

# ----------------------------
# WEIGHT/COUNT FEATURES
# ----------------------------
# Grams per unit for every weight spelling the extractor recognises.
_GRAMS_PER_UNIT = {
    'oz': 28.35, 'ounce': 28.35, 'ounces': 28.35,
    'lb': 453.59, 'lbs': 453.59, 'pound': 453.59, 'pounds': 453.59,
    'kg': 1000.0, 'kgs': 1000.0, 'kilogram': 1000.0, 'kilograms': 1000.0,
    'g': 1.0, 'gram': 1.0, 'grams': 1.0,
}
# Unit words that denote a number of items rather than a weight.
_COUNT_UNITS = ('count', 'counts', 'pack', 'packs')
# <number><optional whitespace><unit>, where the unit must end on a word
# boundary so that "32 GB" or "6 guests" is not mistaken for grams.
# Longest spellings come first so the alternation prefers whole words.
_QUANTITY_RE = re.compile(
    r"(\d+\.?\d*)\s*("
    + "|".join(sorted([*_GRAMS_PER_UNIT, *_COUNT_UNITS], key=len, reverse=True))
    + r")\b",
    re.IGNORECASE,
)


def extract_weight_features(catalog_content):
    """Parse the first "<number> <unit>" quantity out of a catalog entry.

    Args:
        catalog_content: Raw catalog text; any value is coerced with ``str``.

    Returns:
        ``pd.Series([weight_g, weight_found, count_pack])`` where
        ``weight_g`` is the weight converted to grams (0.0 when the match is
        not a weight), ``count_pack`` is the number attached to a
        count/pack unit (0 otherwise) and ``weight_found`` is 1 when any
        quantity matched, 0 otherwise.

    NOTE(review): ``weight_found`` is also set for count/pack matches, as in
    the previous implementation - confirm that this is the intended meaning.
    """
    weight_in_grams = 0.0
    weight_found = 0
    count_pack = 0
    match = _QUANTITY_RE.search(str(catalog_content))
    if match:
        # The pattern only captures digits with an optional fraction, so the
        # float conversion cannot fail and needs no exception handler.
        value = float(match.group(1))
        unit = match.group(2).lower()
        if unit in _COUNT_UNITS:
            count_pack = value
        else:
            weight_in_grams = value * _GRAMS_PER_UNIT[unit]
        weight_found = 1
    return pd.Series([weight_in_grams, weight_found, count_pack])

# Attach the three parsed quantity columns to both frames (train, then test).
_quantity_cols = ['weight_g', 'weight_found', 'count_pack']
for _frame in (train, test):
    _frame[_quantity_cols] = _frame['catalog_content'].apply(extract_weight_features)

# ----------------------------
# DOWNLOAD IMAGES
# ----------------------------
def download_image(image_link, savefolder):
    """Download one image into ``savefolder`` unless it is already present.

    The local file name is the last path component of ``image_link``.
    Non-string links (e.g. NaN) are ignored. Download errors are reported
    with a printed warning and never raised, so one bad URL cannot abort a
    bulk download.

    Args:
        image_link: URL of the image, or a non-string placeholder.
        savefolder: Existing directory that receives the file.

    Returns:
        None.
    """
    # Imported here because the file-level ``import urllib`` does not
    # guarantee that the ``urllib.request`` submodule has been loaded.
    import urllib.request

    if not isinstance(image_link, str):
        return
    image_save_path = os.path.join(savefolder, Path(image_link).name)
    if os.path.exists(image_save_path):
        return
    # Download to a per-process temporary name and rename on success, so an
    # interrupted transfer never leaves a truncated file that a later run
    # would mistake for a finished download.
    tmp_path = f"{image_save_path}.{os.getpid()}.part"
    try:
        urllib.request.urlretrieve(image_link, tmp_path)
        os.replace(tmp_path, image_save_path)
    except Exception as ex:
        print(f'Warning: Not able to download - {image_link}\n{ex}')
        try:
            os.remove(tmp_path)
        except OSError:
            # Nothing to clean up (the temporary file was never created).
            pass
    return

def download_images(image_links, folder, num_workers=50):
    """Download every link in ``image_links`` into ``folder`` in parallel.

    Args:
        image_links: Iterable of image URLs (it is materialised once, so a
            generator is accepted as well as a list).
        folder: Destination directory handed to ``download_image``.
        num_workers: Upper bound on the number of worker processes; the pool
            is never larger than the number of links.

    Returns:
        None. Progress is reported through a tqdm bar.
    """
    image_links = list(image_links)
    if not image_links:
        # Nothing to fetch: avoid forking a pool for no work.
        return
    download_partial = partial(download_image, savefolder=folder)
    pool_size = max(1, min(num_workers, len(image_links)))
    with multiprocessing.Pool(pool_size) as pool:
        # The results are all None; the loop only drives the progress bar.
        for _ in tqdm(pool.imap(download_partial, image_links), total=len(image_links), desc="Downloading Images"):
            pass
        # Shut the workers down gracefully before the context manager
        # terminates the pool.
        pool.close()
        pool.join()

# Fetch each distinct, non-null image URL of a split into the shared folder.
print("Downloading train images...")
_train_links = train['image_link'].dropna().unique().tolist()
download_images(_train_links, IMAGE_FOLDER)

print("Downloading test images...")
_test_links = test['image_link'].dropna().unique().tolist()
download_images(_test_links, IMAGE_FOLDER)

# ----------------------------
# RESNET50 IMAGE FEATURE EXTRACTION WITH CACHING
# ----------------------------
# ResNet50 with ImageNet weights and no classification head. pooling='avg'
# requests global average pooling of the last convolutional output, so the
# model returns one feature vector per image (the zero-vector fallback in
# extract_img_feature assumes that vector has 2048 entries).
resnet_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
# Size every image is resized to before it is passed to the network.
TARGET_SIZE=(224,224)

def extract_img_feature(file_path):
    """Return the ResNet50 embedding of one image file.

    The image is converted to RGB, resized to ``TARGET_SIZE``, preprocessed
    with ``preprocess_input`` and passed through ``resnet_model``.

    Args:
        file_path: Path of the image on disk.

    Returns:
        The first (only) row of the model output, or a 2048-entry zero
        vector when the image cannot be read or processed.
    """
    try:
        # The context manager closes the underlying file handle even when
        # decoding fails part-way.
        with Image.open(file_path) as raw:
            img = raw.convert('RGB').resize(TARGET_SIZE)
        x = np.expand_dims(np.array(img), axis=0)
        x = preprocess_input(x)
        feat = resnet_model.predict(x, verbose=0)
        return feat[0]
    except Exception as ex:
        # Best-effort fallback for missing/corrupt images. ``Exception``
        # (not a bare ``except``) lets Ctrl-C still interrupt a long run.
        print(f'Warning: Not able to extract features - {file_path}\n{ex}')
        # float32 presumably matches the model's output dtype, which keeps
        # the stacked feature matrix from being upcast to float64.
        return np.zeros(2048, dtype=np.float32)

def bulk_extract_features(df, folder, cache_file):
    """Compute (or load from cache) image embeddings for a dataframe.

    One embedding is produced per non-null ``image_link``, in the order of
    ``df['image_link'].dropna()``.

    Args:
        df: Frame with an ``image_link`` column.
        folder: Directory holding the downloaded images, named after the
            last path component of each link.
        cache_file: ``.npy`` file used to persist the embeddings.

    Returns:
        Array with one row per non-null link.
    """
    links = df['image_link'].dropna().tolist()
    if os.path.exists(cache_file):
        cached = np.load(cache_file)
        # A cache written for a different set of rows would silently
        # misalign features and rows, so only accept a matching row count.
        if len(cached) == len(links):
            print(f"Loading cached image features from {cache_file}")
            return cached
        print(f"Ignoring stale cache {cache_file}: {len(cached)} rows cached, {len(links)} links expected")
    feats = []
    by_link = {}  # link -> embedding, so repeated links hit the model once
    for link in tqdm(links, desc=f"Extracting features for {cache_file}"):
        if link not in by_link:
            by_link[link] = extract_img_feature(os.path.join(folder, Path(link).name))
        feats.append(by_link[link])
    feats = np.array(feats)
    np.save(cache_file, feats)
    return feats

# One embedding per non-null image_link of each split (computed or cached).
train_img_feats = bulk_extract_features(
    df=train, folder=IMAGE_FOLDER, cache_file=TRAIN_IMG_FEATURES_FILE
)
test_img_feats = bulk_extract_features(
    df=test, folder=IMAGE_FOLDER, cache_file=TEST_IMG_FEATURES_FILE
)

# Map back to dataframe order
def map_features(df, feats):
    """Expand per-link embeddings to one row per dataframe row.

    Args:
        df: Frame with an ``image_link`` column.
        feats: Embeddings aligned with ``df['image_link'].dropna()``.

    Returns:
        Array with ``len(df)`` rows. Rows whose link is missing get a zero
        vector. When a link occurs several times, the embedding of its last
        occurrence is used for all of them.

    Raises:
        ValueError: If ``feats`` does not have exactly one row per non-null
            link (previously this was silently truncated by ``zip``).
    """
    feats = np.asarray(feats)
    links = df['image_link'].dropna().tolist()
    if len(links) != len(feats):
        raise ValueError(
            f"feature rows ({len(feats)}) do not match non-null image links ({len(links)})"
        )
    # Infer the embedding width from the data; 2048 is only the fallback for
    # an empty feature array.
    if feats.ndim == 2:
        missing = np.zeros(feats.shape[1], dtype=feats.dtype)
    else:
        missing = np.zeros(2048)
    feat_map = dict(zip(links, feats))
    # A single shared zero vector is enough: np.array copies every row.
    return np.array([feat_map.get(link, missing) for link in df['image_link']])

# Expand the per-link embeddings to one row per dataframe row.
X_img_train = map_features(train, train_img_feats)
X_img_test  = map_features(test, test_img_feats)

# PCA to reduce dimensions
# The projection is fitted on the training embeddings only and then applied
# to the test embeddings. random_state pins the randomized SVD solver that
# scikit-learn may select for inputs of this size, so repeated runs give the
# same components (42 matches the seed used for the split and the model).
pca = PCA(n_components=100, random_state=42)
X_img_train = pca.fit_transform(X_img_train)
X_img_test  = pca.transform(X_img_test)

# ----------------------------
# TEXT FEATURES
# ----------------------------
# The vocabulary (capped at 5000 terms) is learned from the training text
# only; both sparse matrices are then converted to dense arrays.
tfidf = TfidfVectorizer(max_features=5000)
_tfidf_train_sparse = tfidf.fit_transform(train['clean_text'])
_tfidf_test_sparse = tfidf.transform(test['clean_text'])
X_text_train = _tfidf_train_sparse.toarray()
X_text_test = _tfidf_test_sparse.toarray()

# ----------------------------
# COMBINE ALL FEATURES
# ----------------------------
# Column-wise concatenation: TF-IDF terms | PCA image components | quantity
# features parsed from the catalog text.
_quantity_cols = ['weight_g', 'weight_found', 'count_pack']
X_train_combined = np.hstack(
    (X_text_train, X_img_train, train[_quantity_cols].values)
)
X_test_combined = np.hstack(
    (X_text_test, X_img_test, test[_quantity_cols].values)
)
y_train = train['price'].values

# Scaling
# Statistics come from the training matrix only and are reused for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_combined)
X_test_scaled = scaler.transform(X_test_combined)

# ----------------------------
# TRAIN/VALIDATION SPLIT
# ----------------------------
# Hold out 10% of the scaled training rows for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled,
    y_train,
    test_size=0.1,
    random_state=42,
)

# ----------------------------
# TRAIN XGBOOST
# ----------------------------
_xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    random_state=42,
)
model = XGBRegressor(**_xgb_params)
model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

# Report the mean absolute error on the held-out rows.
pred_val = model.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, pred_val))

# ----------------------------
# PREDICT TEST SET
# ----------------------------
# Predict test prices and raise any prediction below 1.0 up to 1.0.
test_preds = np.maximum(model.predict(X_test_scaled), 1.0)

# ----------------------------
# SAVE SUBMISSION CSV & ZIP
# ----------------------------
# Files that make up the submission; both are also added to submission.zip
# under these same relative paths.
_csv_path = "submission_files/test_out.csv"
_doc_path = "submission_files/Documentation.txt"

# One row per test sample: its id and the predicted price.
submission = pd.DataFrame({"sample_id": test['sample_id'], "price": test_preds})
submission.to_csv(_csv_path, index=False)

_doc_text = """Smart Product Pricing Challenge
Model: TF-IDF + ResNet50 + XGBoost
Features: Cleaned text, Weight/Count, PCA on image embeddings
Evaluation: SMAPE
Developed in Colab
"""
with open(_doc_path,"w") as f:
    f.write(_doc_text)

with zipfile.ZipFile("submission.zip","w") as zipf:
    for _member in (_csv_path, _doc_path):
        zipf.write(_member)

print("\n✅ Submission ready!")
print("- submission_files/test_out.csv")
print("- submission_files/Documentation.txt")
print("- submission.zip")


Mounted at /content/drive
Train shape: (75000, 4)
Test shape: (75000, 3)
Downloading train images...


Downloading Images:  53%|█████▎    | 35308/67184 [03:04<02:24, 221.30it/s]

HTTP Error 404: Not Found

Downloading Images:  53%|█████▎    | 35366/67184 [03:04<02:06, 252.17it/s]




Downloading Images: 100%|██████████| 67184/67184 [05:39<00:00, 198.03it/s]


Downloading test images...


Downloading Images:  57%|█████▋    | 41026/72222 [03:31<01:59, 260.24it/s]

HTTP Error 404: Not Found


Downloading Images: 100%|██████████| 72222/72222 [06:34<00:00, 183.14it/s]


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


Extracting features for train_img_features.npy:  21%|██▏       | 14808/69476 [37:58<2:41:48,  5.63it/s]