In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training set (test.csv is loaded in its own cell below).
# NOTE: on_bad_lines='skip' silently drops rows the parser cannot split;
# compare the resulting row count with the expected one if that matters.
train_df = pd.read_csv(
    'train.csv',
    on_bad_lines='skip',      # Skip malformed lines (like row 42805)
    encoding='utf-8',
    engine='python'           # More forgiving than default 'c' engine
)


In [None]:
# Load the test set with the same tolerant parser settings as train.csv.
# NOTE(review): skipping malformed lines here would also drop those
# sample_ids from the final submission - confirm no rows are lost.
test_csv_options = dict(
    engine='python',        # more forgiving parser
    encoding='utf-8',       # try utf-8 first
    on_bad_lines='skip',    # skips malformed lines
)
test_df = pd.read_csv("test.csv", **test_csv_options)

In [None]:
# Preview the first five test rows (sample_id, catalog_content, image_link)
test_df.head(n=5)

Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...


In [None]:
# Preview the first five training rows (same columns as test plus `price`)
train_df.head(n=5)

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [None]:
# Report (rows, columns) for both frames, one per line
print(
    f"Test Shape:{test_df.shape}",
    f"Train Shape:{train_df.shape}",
    sep="\n",
)

Test Shape:(75000, 3)
Train Shape:(75000, 4)


In [None]:
# Per-column count of missing values.
# `.sum` must be *called*; without the parentheses the f-string renders the
# bound method's repr (the whole boolean mask) instead of the counts.
print(f"Missing Values:{test_df.isnull().sum()}")
print(f"Missing Values:{train_df.isnull().sum()}")

Missing Values:<bound method DataFrame.sum of        sample_id  catalog_content  image_link
0          False            False       False
1          False            False       False
2          False            False       False
3          False            False       False
4          False            False       False
...          ...              ...         ...
74995      False            False       False
74996      False            False       False
74997      False            False       False
74998      False            False       False
74999      False            False       False

[75000 rows x 3 columns]>
Missing Values:<bound method DataFrame.sum of        sample_id  catalog_content  image_link  price
0          False            False       False  False
1          False            False       False  False
2          False            False       False  False
3          False            False       False  False
4          False            False       False  False
...        

In [None]:
# Keep only rows with a usable target: strictly positive prices below the
# $10,000 outlier cut-off (rows with a missing price fail both comparisons
# and are dropped as well).
# `.copy()` makes train_df an independent frame, so the column assignments in
# later cells do not write into a slice (SettingWithCopyWarning).
train_df = train_df[(train_df['price'] > 0) & (train_df['price'] < 10000)].copy()

In [None]:
# Replace missing catalog text with the placeholder 'unknown' in both frames
for frame in (train_df, test_df):
    frame['catalog_content'] = frame['catalog_content'].fillna('unknown')

In [None]:
# Sanity check after filling: count the remaining nulls explicitly.
# (Comparing the column with its own fillna() result is True for every
# non-null entry and only shows a truncated preview, so it verifies nothing.)
print(f"Missing catalog_content (train): {train_df['catalog_content'].isna().sum()}")
print(f"Missing catalog_content (test): {test_df['catalog_content'].isna().sum()}")

0        True
1        True
2        True
3        True
4        True
         ... 
74995    True
74996    True
74997    True
74998    True
74999    True
Name: catalog_content, Length: 75000, dtype: bool
0        True
1        True
2        True
3        True
4        True
         ... 
74995    True
74996    True
74997    True
74998    True
74999    True
Name: catalog_content, Length: 75000, dtype: bool


In [None]:
import re

_PUNCTUATION = re.compile(r'[^\w\s]')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Normalise a catalog string for vectorisation.

    The value is coerced with ``str()`` (so ``None``/NaN become the words
    ``"none"``/``"nan"``), lower-cased, every character that is neither a
    word character nor whitespace is replaced by a space, and runs of
    whitespace are collapsed to a single space with the ends trimmed.
    """
    lowered = str(text).lower()
    without_punctuation = _PUNCTUATION.sub(' ', lowered)
    return _WHITESPACE_RUN.sub(' ', without_punctuation).strip()

# Add a normalised `clean_text` column to both frames
for frame in (train_df, test_df):
    frame['clean_text'] = frame['catalog_content'].apply(clean_text)

Handling Images

In [None]:
import requests
from PIL import Image
from io import BytesIO
import numpy as np

def load_image(url, size=(224, 224)):
    """Download an image and return it as a float array scaled to [0, 1].

    Args:
        url: HTTP(S) location of the image.
        size: ``(width, height)`` passed to ``PIL.Image.resize``.

    Returns:
        Array of shape ``(height, width, 3)``. If the request, the HTTP
        status, or the decoding fails, an all-zero array of that same shape
        is returned so callers always get a usable image.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # report 4xx/5xx as such, not as a decode error
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img = img.resize(size)
        return np.array(img) / 255.0  # normalize to [0,1]
    except Exception as e:  # not a bare except: Ctrl-C must still interrupt
        print(f"Error loading {url}: {e}")
        # PIL sizes are (width, height) but arrays are (height, width, channels)
        width, height = size
        return np.zeros((height, width, 3))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Regression target as a plain NumPy array
y_train = train_df['price'].values

# Bag-of-n-grams over the cleaned text: unigrams and bigrams, English stop
# words removed, vocabulary capped at the 10,000 top-ranked terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
)

# Learn the vocabulary on train only, then reuse it for test
X_train_text = tfidf.fit_transform(train_df['clean_text'])
X_test_text = tfidf.transform(test_df['clean_text'])

In [None]:
import torch
import torchvision.models as models
from torchvision import transforms
from PIL import Image
import numpy as np
import os

# Run on the GPU when torch can see one, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained MobileNetV2 with its last child module (the classifier) removed,
# switched to inference mode and moved to the chosen device.
# NOTE: the remaining stack is the convolutional `features` module only; the
# global average pool lives in MobileNetV2.forward, not in a child module, so
# this model emits a spatial feature map rather than a pooled vector.
backbone_layers = list(models.mobilenet_v2(pretrained=True).children())[:-1]
mobilenet = torch.nn.Sequential(*backbone_layers).eval().to(device)

# Preprocessing: fixed 224x224 input, tensor conversion, then per-channel
# normalisation with the ImageNet statistics used by torchvision's
# pretrained models.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def get_mobilenet_features(img_path):
    """Return a 1280-dimensional MobileNetV2 embedding for one image file.

    Args:
        img_path: Path of an image readable by PIL.

    Returns:
        1-D NumPy array of length 1280. If the image cannot be opened or
        processed, the error is printed and a zero vector of the same length
        is returned.
    """
    try:
        with Image.open(img_path) as handle:
            img = handle.convert('RGB')
        img_t = transform(img).unsqueeze(0).to(device)  # Add batch dim
        with torch.no_grad():
            features = mobilenet(img_t)
            # The truncated model returns the (1, 1280, H, W) feature map;
            # average over the spatial axes (what MobileNetV2.forward does
            # before its classifier) to get one 1280-vector per image.
            if features.dim() == 4:
                features = features.mean(dim=(2, 3))
        return features.cpu().numpy().flatten()  # Shape: (1280,)
    except Exception as e:
        print(f"Error loading {img_path}: {e}")
        return np.zeros(1280)  # same length as a successful embedding



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `clean_text` and the `clean_text` columns already exist from the earlier
# cleaning cell, so they are not redefined/recomputed here (the duplicate
# definition only shadowed an identical function).

# TF-IDF for text: unigrams + bigrams, English stop words removed,
# vocabulary capped at 5,000 terms. This replaces any `tfidf`,
# `X_train_text` and `X_test_text` built earlier.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
X_train_text = tfidf.fit_transform(train_df['clean_text'])
X_test_text = tfidf.transform(test_df['clean_text'])

In [None]:
import os
import requests
from PIL import Image
from io import BytesIO

def download_image(url, save_path):
    """Fetch an image over HTTP and store it as an RGB file at ``save_path``.

    Args:
        url: Location of the image.
        save_path: Destination file path; the format follows its extension.

    Returns:
        True when the image was downloaded and saved. On any failure
        (network error, HTTP error status, undecodable payload) the error is
        printed, a black 224x224 placeholder is written to ``save_path``
        and False is returned.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # surface 4xx/5xx instead of a confusing decode error
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img.save(save_path)
        return True
    except Exception as e:
        print(f"❌ Failed to download {url} → {e}")
        # Save blank image so feature extractor doesn't crash.
        # NOTE: callers that skip existing files will never retry this URL.
        blank = Image.new('RGB', (224, 224), (0, 0, 0))
        blank.save(save_path)
        return False

def _download_split(frame, out_dir, label, report_every=100):
    """Download every image referenced by ``frame`` into ``out_dir``.

    Files are named ``<sample_id>.jpg``; existing files are skipped so the
    cell can be re-run to resume. Progress is reported by position in the
    frame (not by index label, which has gaps once rows have been filtered).
    """
    os.makedirs(out_dir, exist_ok=True)
    pairs = zip(frame['sample_id'], frame['image_link'])
    for count, (sample_id, url) in enumerate(pairs):
        img_path = f"{out_dir}/{sample_id}.jpg"
        if not os.path.exists(img_path):  # only download if not already there
            download_image(url, img_path)
        if count % report_every == 0:
            print(f"✅ Downloaded {count} {label} images...")


# Same procedure for both splits
_download_split(train_df, 'train_images', 'train')
_download_split(test_df, 'test_images', 'test')

✅ Downloaded 0 train images...
✅ Downloaded 100 train images...
✅ Downloaded 200 train images...
✅ Downloaded 300 train images...
✅ Downloaded 400 train images...
✅ Downloaded 500 train images...
✅ Downloaded 600 train images...
✅ Downloaded 700 train images...
✅ Downloaded 800 train images...
✅ Downloaded 900 train images...
✅ Downloaded 1000 train images...
✅ Downloaded 1100 train images...
✅ Downloaded 1200 train images...
✅ Downloaded 1300 train images...
✅ Downloaded 1400 train images...
✅ Downloaded 1500 train images...
✅ Downloaded 1600 train images...
✅ Downloaded 1700 train images...
✅ Downloaded 1800 train images...
✅ Downloaded 1900 train images...
✅ Downloaded 2000 train images...
✅ Downloaded 2100 train images...
✅ Downloaded 2200 train images...
✅ Downloaded 2300 train images...
✅ Downloaded 2400 train images...
✅ Downloaded 2500 train images...
✅ Downloaded 2600 train images...
✅ Downloaded 2700 train images...
✅ Downloaded 2800 train images...
✅ Downloaded 2900 train im

In [None]:
import torch
import torchvision.models as models
from torchvision import transforms

# Pretrained MobileNetV2 minus its final child module (the classifier), in
# inference mode. The model is left on the default (CPU) device here.
_pretrained = models.mobilenet_v2(pretrained=True)
mobilenet = torch.nn.Sequential(*tuple(_pretrained.children())[:-1]).eval()

# Preprocessing pipeline: 224x224 resize -> tensor -> per-channel normalisation
_resize = transforms.Resize((224, 224))
_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform = transforms.Compose([_resize, transforms.ToTensor(), _normalize])

def get_mobilenet_features(img_path):
    """Return a 1280-dimensional MobileNetV2 embedding for one image file.

    Args:
        img_path: Path of an image readable by PIL.

    Returns:
        1-D NumPy array of length 1280; a zero vector of that length (after
        printing the error) when the image cannot be processed.
    """
    try:
        with Image.open(img_path) as handle:
            img = handle.convert('RGB')
        img_t = transform(img).unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            features = mobilenet(img_t)
            # Without the classifier the model yields a (1, 1280, H, W) map.
            # Global-average-pool it, as MobileNetV2.forward does, so the
            # result really is one 1280-vector and matches the fallback.
            if features.dim() == 4:
                features = features.mean(dim=(2, 3))
        return features.squeeze().numpy()  # Shape: (1280,)
    except Exception as e:
        print(f"⚠️ Error processing {img_path}: {e}")
        return np.zeros(1280)  # fallback

# Embed every training image (stored as train_images/<sample_id>.jpg) and
# stack the per-image features into one array, in train_df row order.
# Only the training split is processed here.
X_train_img = np.array([
    get_mobilenet_features(f"train_images/{sample_id}.jpg")
    for sample_id in train_df['sample_id']
])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `clean_text` is already defined and the `clean_text` columns are already
# populated by the earlier cleaning cell; redefining the function and
# re-cleaning the text here only duplicated that work.

# Final text features: TF-IDF over unigrams and bigrams, English stop words
# removed, vocabulary capped at 5,000 terms. Fit on train, applied to test.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
X_train_text = tfidf.fit_transform(train_df['clean_text'])
X_test_text = tfidf.transform(test_df['clean_text'])

In [None]:
# Densify the sparse TF-IDF matrices (only viable when they fit in memory)
X_test_text_dense = X_test_text.toarray()
X_train_text_dense = X_train_text.toarray()

# One flat row of image features per training sample, ready for hstack
X_train_img_reshaped = X_train_img.reshape(len(X_train_img), -1)

# TODO: image features are not part of the model yet. No X_test_img has been
# extracted, so text and image blocks cannot be stacked for the test set.
# Once test features exist (extracted the same way as for train), enable:
#   X_test_img_reshaped = X_test_img.reshape(X_test_img.shape[0], -1)
#   X_train_combined = np.hstack([X_train_text_dense, X_train_img_reshaped])
#   X_test_combined = np.hstack([X_test_text_dense, X_test_img_reshaped])

# Until then the model is trained and applied on text features only
X_train_combined, X_test_combined = X_train_text_dense, X_test_text_dense



In [None]:
import numpy as np

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100``.
    Pairs where both values are zero contribute 0 instead of 0/0.

    Args:
        y_true: Actual values (list, NumPy array or pandas Series).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The SMAPE as a float between 0 and 200.
    """
    # Work on plain float arrays: accepts lists and avoids index alignment
    # between two pandas Series with different indexes.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    abs_error = np.abs(y_true - y_pred)
    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 from `out`, so no divide-by-zero warning or NaN is produced.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff) * 100


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 200 rounds of depth-6 trees at learning rate 0.1,
# histogram-based split finding, fixed seed for repeatability.
xgb_params = dict(
    tree_method='hist',
    random_state=42,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
)
model = XGBRegressor(**xgb_params)

# Fit on the combined training features against the price column
model.fit(X_train_combined, train_df['price'])


In [None]:
# In-sample check: score the model on the same rows it was fitted on.
# This is a training error, not an estimate of performance on unseen data.
y_true, y_pred = train_df['price'], model.predict(X_train_combined)

smape_score = smape(y_true, y_pred)
print(f"SMAPE: {smape_score:.2f}%")


SMAPE: 61.36%


In [None]:
# Predict on the TEST features. `y_pred` from the evaluation cell holds
# predictions for the training rows and must not be paired with test ids.
test_pred = model.predict(X_test_combined)
assert len(test_pred) == len(test_df), "one prediction is required per test row"

submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': test_pred
})
submission.to_csv('test_out.csv', index=False)

print("✅ Submission saved to test_out.csv")

✅ Submission saved to test_out.csv
