In [1]:
# Cell 1: Setup and Unzip All Images (CORRECTED)
import os
import shutil
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- Configuration ---
# Directory holding the competition CSV data.
# NOTE(review): not referenced anywhere else in this cell, and the training cell
# later points DATA_DIR at a different dataset slug — confirm which is correct.
DATA_DIR = "/kaggle/input/ml-am-ds/student_resource/dataset/"
# Directory that is searched for the image zip archives.
# NOTE(review): an earlier comment mentioned a 'zipped_batches' subfolder, but
# this path points at the dataset root — confirm where the archives really are.
IMAGES_DATASET_DIR = "/kaggle/input/amazon-product-images/"
# Destination folder that receives every extracted image.
FINAL_IMG_DIR = "/kaggle/working/all_images/"

print("Step 1 shuru: Sabhi zip files se images ko nikal kar ek jagah jama kiya ja raha hai...")

# Make sure the destination exists before anything is extracted into it.
os.makedirs(FINAL_IMG_DIR, exist_ok=True)

# --- Unzipping Logic ---
def unzip_all_batches(source_dir, dest_dir):
    """Extract every ``*.zip`` archive found directly in ``source_dir`` into ``dest_dir``.

    Args:
        source_dir: Directory that is searched (non-recursively) for ``*.zip`` files.
        dest_dir: Directory the archives are unpacked into; created if missing.

    Returns:
        int: ``0`` when no archive was found, otherwise the number of regular
        files present anywhere under ``dest_dir`` after extraction (files that
        were already there are counted too).

    Archives that fail to unpack are reported on stdout and skipped, so a
    single corrupt archive does not abort the whole run.
    """
    # Sorted so the extraction order is reproducible between runs.
    zip_files = sorted(glob.glob(os.path.join(source_dir, "*.zip")))
    print(f"'{source_dir}' se {len(zip_files)} zip files mili.")

    # Nothing to extract: tell the user and signal failure with 0.
    if not zip_files:
        print("\n🚨🚨 CRITICAL ERROR: Koi zip file nahi mili. Path dobara check karein! 🚨🚨")
        print("Agar aapki files direct '/kaggle/input/amazon-product-images/' mein hain, to upar wala path badal dein.")
        return 0

    # Guarantee the destination exists even if every archive turns out to be
    # unreadable; otherwise the final count below would raise.
    os.makedirs(dest_dir, exist_ok=True)

    for zip_file in tqdm(zip_files, desc="Unzipping batches"):
        try:
            shutil.unpack_archive(zip_file, dest_dir)
        except Exception as e:
            # Best effort: report the broken archive and continue with the rest.
            print(f"Error unzipping {zip_file}: {e}")

    # Count files recursively. Counting only top-level entries would report the
    # number of sub-folders whenever an archive unpacks into a directory.
    return sum(len(files) for _, _, files in os.walk(dest_dir))

# Unpack every archive into the final image directory.
total_images = unzip_all_batches(IMAGES_DATASET_DIR, FINAL_IMG_DIR)

# Cleanup intentionally omitted: nothing under '/kaggle/input' can be deleted,
# so the former shutil.rmtree(...) call on the input dataset was removed.

if total_images <= 0:
    print("\n❌ Step 1 FAILED. Please check your Kaggle data path.")
else:
    print(f"\n✅ Step 1 poora hua! Kul {total_images} images '{FINAL_IMG_DIR}' folder mein hain.")
    print("Ab aap bina kisi badlav ke STEP 2 aur STEP 3 run kar sakte hain.")

Step 1 shuru: Sabhi zip files se images ko nikal kar ek jagah jama kiya ja raha hai...
'/kaggle/input/amazon-product-images/zipped_batches/' se 0 zip files mili.

🚨🚨 CRITICAL ERROR: Koi zip file nahi mili. Path dobara check karein! 🚨🚨
Agar aapki files direct '/kaggle/input/amazon-product-images/' mein hain, to upar wala path badal dein.

❌ Step 1 FAILED. Please check your Kaggle data path.


In [3]:
# Cell 1/2 Combined: Direct Image Loading and Feature Extraction

import os
import shutil
import glob
import numpy as np
import torch
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm

print("🚀 Direct Image Loading aur Feature Extraction shuru...")

# --- Configuration ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Root directory that is searched for 'images_batch_*' folders.
IMAGES_ROOT_DIR = "/kaggle/input/amazon-product-images/"
EMBEDDINGS_FILE = "/kaggle/working/image_embeddings.npz"
# Number of images sent through the network per forward pass. One-at-a-time
# inference leaves the accelerator mostly idle; lower this if memory is tight.
INFERENCE_BATCH_SIZE = 64

if DEVICE.type == 'cpu':
    print("🚨 WARNING: GPU not found. Please enable GPU in Notebook Settings.")

# --- Load the pre-trained ResNet50 and drop its final child module ---
# The remaining stack is used purely as a feature extractor.
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
model = torch.nn.Sequential(*list(model.children())[:-1])
model.to(DEVICE)
model.eval()

# --- Image transformations applied to every image before inference ---
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


def _embed_pending(network, sample_ids, tensors, store):
    """Run one forward pass over ``tensors`` and save each vector in ``store``.

    ``sample_ids[i]`` is the key under which the embedding of ``tensors[i]`` is
    stored. Returns the number of embeddings written.
    """
    batch = torch.stack(tensors).to(DEVICE)
    # Flatten everything after the batch axis so each image yields one vector.
    vectors = network(batch).flatten(start_dim=1).cpu().numpy()
    for key, vector in zip(sample_ids, vectors):
        store[key] = vector
    return len(sample_ids)


# --- Feature extraction ---
image_embeddings = {}
# Sorted so the processing order is reproducible between runs.
batch_folders = sorted(glob.glob(os.path.join(IMAGES_ROOT_DIR, "images_batch_*")))
total_images_processed = 0
# (path, error) pairs for images that could not be read or transformed.
failed_images = []

if len(batch_folders) == 0:
    print(f"\n🚨🚨 CRITICAL ERROR: '{IMAGES_ROOT_DIR}' mein koi 'images_batch_X' folder nahi mila. Path galat hai. 🚨🚨")
else:
    print(f"✅ Kul {len(batch_folders)} image batches mile. Features nikalna shuru...")

    with torch.no_grad():
        for folder in tqdm(batch_folders, desc="Processing Batches"):
            pending_ids, pending_tensors = [], []

            for image_path in sorted(glob.glob(os.path.join(folder, "*.jpg"))):
                try:
                    # The file stem is parsed as an integer id; non-numeric
                    # names raise here and are recorded as failures.
                    sample_id = int(os.path.splitext(os.path.basename(image_path))[0])
                    # Context manager closes the file handle deterministically.
                    with Image.open(image_path) as raw_img:
                        img_tensor = transform(raw_img.convert('RGB'))
                except Exception as exc:
                    # Best effort: skip the image, but remember why it failed
                    # instead of discarding the error silently.
                    failed_images.append((image_path, repr(exc)))
                    continue

                pending_ids.append(str(sample_id))
                pending_tensors.append(img_tensor)

                if len(pending_tensors) >= INFERENCE_BATCH_SIZE:
                    total_images_processed += _embed_pending(
                        model, pending_ids, pending_tensors, image_embeddings
                    )
                    pending_ids, pending_tensors = [], []

            # Flush the final partial batch of this folder.
            if pending_tensors:
                total_images_processed += _embed_pending(
                    model, pending_ids, pending_tensors, image_embeddings
                )

    # Persist the embeddings, one array per sample id, as a compressed .npz.
    np.savez_compressed(EMBEDDINGS_FILE, **image_embeddings)

    # Release the model and the in-memory embeddings.
    del model, transform, image_embeddings
    torch.cuda.empty_cache()

    print("\n✅ Image Processing Poora Hua!")
    print(f"Kul images se features nikale gaye: {total_images_processed}")

    if failed_images:
        print(f"⚠️ Skipped {len(failed_images)} image(s) that could not be processed. First few:")
        for bad_path, reason in failed_images[:5]:
            print(f"   {bad_path}: {reason}")

🚀 Direct Image Loading aur Feature Extraction shuru...


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 209MB/s]


✅ Kul 16 image batches mile. Features nikalna shuru...


Processing Batches: 100%|██████████| 16/16 [1:23:26<00:00, 312.90s/it]



✅ Image Processing Poora Hua!
Kul images se features nikale gaye: 79999


In [12]:
# Cell 3: Feature Fusion, Model Training, Aur Submission (FINAL WORKING VERSION)
import lightgbm as lgb
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from tqdm import tqdm

print("Step 3 shuru: Features ko joda ja raha hai aur model train kiya ja raha hai...")
print("-" * 50)

# --- Configuration ---
DATA_DIR = "/kaggle/input/a-ds-ml/student_resource/dataset/"
EMBEDDINGS_FILE = "/kaggle/working/image_embeddings.npz"
SUBMISSION_FILE = "/kaggle/working/submission.csv"

# --- Load data ---
try:
    train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
    test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
    # The archive is written by np.savez_compressed from plain numeric arrays,
    # so pickle support is not needed. Keeping it disabled avoids executing
    # arbitrary pickled objects if the file is ever replaced.
    image_embeddings_loaded = np.load(EMBEDDINGS_FILE, allow_pickle=False)
except Exception as e:
    # Print a readable hint, then re-raise so the cell still fails loudly.
    print(f"\n🚨 CRITICAL ERROR: Data Loading mein samasya. Exception: {e}")
    raise

# --- 3.1: Text feature extraction (TF-IDF) ---
# Missing catalog text is treated as an empty document.
train_text = train_df['catalog_content'].fillna('')
test_text = test_df['catalog_content'].fillna('')

# The vocabulary is fitted on the training text only and reused for the test text.
tfidf = TfidfVectorizer(stop_words='english', max_features=50000)
X_train_text = tfidf.fit_transform(train_text)
X_test_text = tfidf.transform(test_text)

print(f"TF-IDF features ka shape: Train={X_train_text.shape}, Test={X_test_text.shape}")

# --- 3.2: Image Feature Integration ---
def get_image_features(df, embeddings_npz, feature_dim=2048):
    """Build a dense image-feature matrix aligned row-by-row with ``df``.

    Args:
        df: DataFrame with a ``sample_id`` column.
        embeddings_npz: Mapping (e.g. a loaded ``.npz`` archive) whose keys are
            sample ids convertible with ``int()`` and whose values are 1-D
            feature vectors.
        feature_dim: Length of the zero vector used for samples that have no
            stored embedding.

    Returns:
        np.ndarray of shape ``(len(df), feature_dim)``. Rows without an
        embedding are all zeros.

    Raises:
        ValueError: if the stored vectors do not all share one shape (raised by
            ``np.stack``).
    """
    default_feature = np.zeros(feature_dim, dtype=np.float32)
    # Keys are normalised to int so they can be matched against the id column.
    key_mapping = {int(k): v for k, v in embeddings_npz.items()}

    features = []
    missing = 0
    for sample_id in tqdm(df['sample_id'].values, desc="Image features ko prepare kiya ja raha hai"):
        try:
            # Normalise the id as well, so string-typed id columns still match.
            feature = key_mapping.get(int(sample_id))
        except (TypeError, ValueError):
            # An id that is not integer-like cannot have an embedding.
            feature = None
        if feature is None:
            feature = default_feature
            missing += 1
        features.append(feature)

    # Make the zero-vector fallback visible instead of applying it silently.
    if missing:
        print(f"Image features missing for {missing}/{len(features)} samples; using zero vectors.")

    # An empty frame must still give a 2-D matrix so downstream stacking works.
    if not features:
        return np.empty((0, feature_dim), dtype=np.float32)
    return np.stack(features)

# Build the image feature matrices. The archive handle is closed even when
# feature assembly fails, so the file is never left open.
try:
    X_train_img = get_image_features(train_df, image_embeddings_loaded)
    X_test_img = get_image_features(test_df, image_embeddings_loaded)
finally:
    image_embeddings_loaded.close()

print(f"Image features ka shape: Train={X_train_img.shape}, Test={X_test_img.shape}")

# --- 3.3: Feature fusion (text + image) ---
# Both matrices are requested directly in CSR format. CSR supports the row
# indexing used during cross-validation, and building both the same way keeps
# train and test in one consistent format.
X_train = hstack([X_train_text, X_train_img], format="csr")
X_test = hstack([X_test_text, X_test_img], format="csr")
y_train = train_df['price']

print(f"Final Combined Feature Matrix ka shape: Train={X_train.shape}, Test={X_test.shape}")

# --- 3.4: LightGBM model training (K-fold cross-validation) ---
NFOLDS = 5
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)
oof_preds = np.zeros(X_train.shape[0])   # out-of-fold predictions on the training rows
sub_preds = np.zeros(X_test.shape[0])    # fold-averaged predictions on the test rows

lgb_params = {
    'objective': 'regression_l1',
    'metric': 'mae',
    'n_estimators': 3000,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 31,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42
}

print("\nModel Training shuru ho rahi hai (5 folds)...")

# KFold yields positional indices. Indexing a pandas Series with them is
# label-based and only correct while the index is the default 0..n-1 range,
# so the target is indexed as a plain NumPy array instead.
y_values = np.asarray(y_train)
# Convert the test matrix to CSR once, outside the fold loop (no-op if it
# already is CSR).
X_test_csr = X_test.tocsr()

for train_idx, valid_idx in folds.split(X_train, y_values):
    X_train_fold, y_train_fold = X_train[train_idx], y_values[train_idx]
    X_valid_fold, y_valid_fold = X_train[valid_idx], y_values[valid_idx]

    lgb_model = lgb.LGBMRegressor(**lgb_params)
    # Training stops once validation MAE has not improved for 100 rounds.
    lgb_model.fit(X_train_fold, y_train_fold,
                  eval_set=[(X_valid_fold, y_valid_fold)],
                  eval_metric='mae',
                  callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])

    oof_preds[valid_idx] = lgb_model.predict(X_valid_fold)
    # Each fold contributes an equal share to the final test prediction.
    sub_preds += lgb_model.predict(X_test_csr) / folds.n_splits

print(f"\nModel MAE (Cross-Validation): {np.mean(np.abs(oof_preds - y_values)):.4f}")

# --- 3.5: Write the submission file ---
# Predictions below zero are floored at 0 before the file is written.
# NOTE(review): the output columns are named 'ID'/'Price' while the input data
# uses 'sample_id'/'price' — confirm the header the grader expects.
non_negative_prices = np.maximum(0, sub_preds)
submission_df = pd.DataFrame({'ID': test_df['sample_id'], 'Price': non_negative_prices})
submission_df.to_csv(SUBMISSION_FILE, index=False)

print(f"\n🚀✅ Step 3 Poora Hua! Final submission file '{SUBMISSION_FILE}' ban chuka hai. Congratulations!")

Step 3 shuru: Features ko joda ja raha hai aur model train kiya ja raha hai...
--------------------------------------------------
TF-IDF features ka shape: Train=(75000, 50000), Test=(75000, 50000)


Image features ko prepare kiya ja raha hai: 100%|██████████| 75000/75000 [00:00<00:00, 1741319.22it/s]
Image features ko prepare kiya ja raha hai: 100%|██████████| 75000/75000 [00:00<00:00, 2880307.65it/s]


Image features ka shape: Train=(75000, 2048), Test=(75000, 2048)
Final Combined Feature Matrix ka shape: Train=(75000, 52048), Test=(75000, 52048)

Model Training shuru ho rahi hai (5 folds)...





Model MAE (Cross-Validation): 11.7352

🚀✅ Step 3 Poora Hua! Final submission file '/kaggle/working/submission.csv' ban chuka hai. Congratulations!


In [10]:
# Show the column names of both frames, one line per frame.
for frame_label, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{frame_label} columns:", frame.columns.tolist())

Train columns: ['sample_id', 'catalog_content', 'image_link', 'price']
Test columns: ['sample_id', 'catalog_content', 'image_link']
