In [8]:
# English ke aam shabd (jaise 'the', 'is', 'a') ki list
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    clean_words = [word for word in words if word not in stop_words]
    text = ' '.join(clean_words)
    return text

print("Cleaning function is ready.")

Cleaning function is ready.


In [9]:
train_df['cleaned_content'] = train_df['catalog_content'].apply(clean_text)
test_df['cleaned_content'] = test_df['catalog_content'].apply(clean_text)

print("Text cleaning complete for both train and test data.")

Text cleaning complete for both train and test data.


In [10]:
train_df[['catalog_content', 'cleaned_content']].head()

Unnamed: 0,catalog_content,cleaned_content
0,"Item Name: La Victoria Green Taco Sauce Mild, ...",item name la victoria green taco sauce mild 12...
1,"Item Name: Salerno Cookies, The Original Butte...",item name salerno cookies original butter cook...
2,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",item name bear creek hearty soup bowl creamy c...
3,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,item name judees blue cheese powder 1125 oz gl...
4,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",item name kedem sherry cooking wine 127 ounce ...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model banayein
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Training data par fit aur transform karein
text_features_train = tfidf_vectorizer.fit_transform(train_df['cleaned_content'])

# Test data par sirf transform karein
text_features_test = tfidf_vectorizer.transform(test_df['cleaned_content'])

print("TF-IDF process complete.")

TF-IDF process complete.


In [12]:
print("Shape of training text features:", text_features_train.shape)
print("Shape of testing text features:", text_features_test.shape)

Shape of training text features: (75000, 5000)
Shape of testing text features: (75000, 5000)


In [13]:
train_df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,cleaned_content
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,item name la victoria green taco sauce mild 12...
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,item name salerno cookies original butter cook...
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,item name bear creek hearty soup bowl creamy c...
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,item name judees blue cheese powder 1125 oz gl...
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,item name kedem sherry cooking wine 127 ounce ...


In [14]:
import os
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm import tqdm
import concurrent.futures

# --- Yahan se Helper Functions Shuru ---

def requests_retry_session(retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None):
    session = session or requests.Session()
    retry = Retry(
        total=retries, read=retries, connect=retries,
        backoff_factor=backoff_factor, status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

# Yeh function sirf ek image ko download karega
def download_one_image(args):
    url, image_path, session = args
    if not os.path.exists(image_path):
        try:
            response = session.get(url, timeout=30)
            if response.status_code == 200:
                with open(image_path, 'wb') as f:
                    f.write(response.content)
                return "Downloaded"
            else:
                return f"Failed with status {response.status_code}"
        except Exception as e:
            return f"Failed with error: {e}"
    else:
        return "Already Exists"

# --- Yahan se Main Code Shuru ---

# Ek saath kitne downloads karne hain. 16 se shuru karein.
# Agar aapka internet bahut accha hai to ise 32 kar sakte hain.
MAX_WORKERS = 16

# images folder banayein
os.makedirs('../images', exist_ok=True)

# Dono dataframes ko jod dein
all_df = pd.concat([train_df, test_df], ignore_index=True)

# Download ke liye tasks ki list banayein
tasks = []
session = requests_retry_session()
for index, row in all_df.iterrows():
    url = row['image_link']
    image_id = row['sample_id']
    image_path = os.path.join('../images', f"{image_id}.jpg")
    tasks.append((url, image_path, session))

print(f"Total {len(tasks)} images to download with {MAX_WORKERS} parallel workers...")

# ThreadPoolExecutor ka istemal karke parallel download karein
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # tqdm progress bar ke saath sabhi tasks ko run karein
    results = list(tqdm(executor.map(download_one_image, tasks), total=len(tasks)))

print("\nHigh-speed download process poora hua!")

Total 150000 images to download with 16 parallel workers...


100%|██████████| 150000/150000 [00:15<00:00, 9708.84it/s] 


High-speed download process poora hua!





In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tqdm import tqdm
import concurrent.futures

# ResNet50 model ko 'imagenet' ke pre-trained weights ke saath load karein
# include_top=False ka matlab hai ki humein aakhri classification layer nahi chahiye
# pooling='avg' se humein ek flat feature vector (1D array) milta hai
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

print("ResNet50 model successfully loaded.")

ResNet50 model successfully loaded.


In [15]:
def extract_image_features(img_path):
    try:
        # Image ko load karein aur 224x224 size mein resize karein
        img = image.load_img(img_path, target_size=(224, 224))
        
        # Image ko array mein badlein
        x = image.img_to_array(img)
        
        # Ek extra dimension add karein
        x = np.expand_dims(x, axis=0)
        
        # Image ko ResNet50 ke hisaab se preprocess karein
        x = preprocess_input(x)
        
        # Model se features predict karein
        features = base_model.predict(x, verbose=0)
        
        return features.flatten()
    except Exception as e:
        # Agar image kharab hai ya exist nahi karti, to 2048 zeros ka vector return karein
        return np.zeros(2048)

print("Image feature extraction function is ready.")

Image feature extraction function is ready.


In [18]:
# Train aur test data ke liye image paths ki list banayein
train_image_paths = ['../images/' + str(sid) + '.jpg' for sid in train_df['sample_id']]
test_image_paths = ['../images/' + str(sid) + '.jpg' for sid in test_df['sample_id']]

MAX_WORKERS_FEATURES = 4 # Isko apne computer ke CPU cores ke hisaab se set karein (4 se 8 accha rehta hai)

print(f"Extracting features from {len(train_image_paths)} training images...")
# Training images ke liye
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS_FEATURES) as executor:
    train_image_features_list = list(tqdm(executor.map(extract_image_features, train_image_paths), total=len(train_image_paths)))

print(f"\nExtracting features from {len(test_image_paths)} test images...")
# Test images ke liye
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS_FEATURES) as executor:
    test_image_features_list = list(tqdm(executor.map(extract_image_features, test_image_paths), total=len(test_image_paths)))

# Lists ko numpy arrays mein convert karein
X_img_train = np.array(train_image_features_list)
X_img_test = np.array(test_image_features_list)

print("\nFeature extraction for all images complete.")
print("Shape of training image features:", X_img_train.shape)
print("Shape of testing image features:", X_img_test.shape)

Extracting features from 75000 training images...


  0%|          | 212/75000 [00:24<2:24:19,  8.64it/s]


KeyboardInterrupt: 

In [21]:
import numpy as np
from tqdm import tqdm
import concurrent.futures # This was the missing import line

# Make sure the 'extract_image_features' function has been defined in a previous cell.

# Create lists of image paths for both train and test data
train_image_paths = ['../images/' + str(sid) + '.jpg' for sid in train_df['sample_id']]
test_image_paths = ['../images/' + str(sid) + '.jpg' for sid in test_df['sample_id']]

# Set the number of workers based on your CPU
MAX_WORKERS_FEATURES = 4 

print(f"Starting feature extraction for {len(train_image_paths)} training images...")
# Extract features for training images in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS_FEATURES) as executor:
    train_image_features_list = list(tqdm(executor.map(extract_image_features, train_image_paths), total=len(train_image_paths)))
X_img_train = np.array(train_image_features_list)

print(f"\nStarting feature extraction for {len(test_image_paths)} test images...")
# Extract features for test images in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS_FEATURES) as executor:
    test_image_features_list = list(tqdm(executor.map(extract_image_features, test_image_paths), total=len(test_image_paths)))
X_img_test = np.array(test_image_features_list)

# Save the results to .npy files
np.save('X_img_train.npy', X_img_train)
np.save('X_img_test.npy', X_img_test)

print("\nFeature extraction is complete AND the results have been saved to .npy files!")
print("Shape of training image features:", X_img_train.shape)
print("Shape of testing image features:", X_img_test.shape)

Starting feature extraction for 75000 training images...


  0%|          | 354/75000 [00:42<2:29:49,  8.30it/s]


KeyboardInterrupt: 

In [19]:
import re
import nltk
from nltk.corpus import stopwords

print("--- Text Cleaning Shuru ---")
# nltk.download('stopwords') # Agar pehle se nahi kiya hai to is line ko uncomment karke chalayein
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    clean_words = [word for word in words if word not in stop_words]
    return ' '.join(clean_words)

# 'cleaned_content' column banayein
train_df['cleaned_content'] = train_df['catalog_content'].apply(clean_text)
test_df['cleaned_content'] = test_df['catalog_content'].apply(clean_text)

print("Text cleaning poora hua. 'cleaned_content' column ban gaya hai.")
print("Sample:")
print(train_df[['catalog_content', 'cleaned_content']].head())

--- Text Cleaning Shuru ---
Text cleaning poora hua. 'cleaned_content' column ban gaya hai.
Sample:
                                     catalog_content  \
0  Item Name: La Victoria Green Taco Sauce Mild, ...   
1  Item Name: Salerno Cookies, The Original Butte...   
2  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3  Item Name: Judee’s Blue Cheese Powder 11.25 oz...   
4  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                     cleaned_content  
0  item name la victoria green taco sauce mild 12...  
1  item name salerno cookies original butter cook...  
2  item name bear creek hearty soup bowl creamy c...  
3  item name judees blue cheese powder 1125 oz gl...  
4  item name kedem sherry cooking wine 127 ounce ...  


In [20]:
from scipy.sparse import hstack

# Note: Is cell ko chalane se pehle, text_features_train naam ka variable
# (jo TF-IDF se bana tha) aapki notebook ki memory mein hona chahiye.

print("--- Features ko Jodna ---")
X_train_final = hstack([text_features_train, X_img_train])
X_test_final = hstack([text_features_test, X_img_test])

print("Combined features taiyaar hain.")
print("Final training features ka shape:", X_train_final.shape)

--- Features ko Jodna ---


NameError: name 'X_img_train' is not defined

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

print("--- Model Training ki Taiyari ---")
# Target variable (price) ko alag karein
y = train_df['price']
# Log transform lagayein taaki skewed data handle ho sake
y_log = np.log1p(y)

# Data ko 80% train aur 20% validation set mein baantein
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_final, y_log, 
    test_size=0.2, 
    random_state=42
)
print("Data training aur validation ke liye ready hai.")

--- Model Training ki Taiyari ---
Data training aur validation ke liye ready hai.


In [None]:
import lightgbm as lgb

print("--- Model ko Train Karna ---")
model = lgb.LGBMRegressor(random_state=42)
# Model ko training data par train karein
model.fit(X_train_split, y_train_split)

print("Model training complete.")

--- Model ko Train Karna ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.122327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 536934
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 4991
[LightGBM] [Info] Start training from score 2.740904
Model training complete.


In [None]:
# SMAPE score calculate karne ke liye function
def smape(y_true, y_pred):
    # Yeh formula challenge ke document se liya gaya hai
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

print("--- Model ki Performance Check Karna (SMAPE) ---")
# Validation set par price predict karein
val_preds_log = model.predict(X_val)
# Log transform ko reverse karein
val_preds = np.expm1(val_preds_log)
y_val_orig = np.expm1(y_val)

# SMAPE score calculate karein
smape_score = smape(y_val_orig, val_preds)
print(f"Validation SMAPE Score: {smape_score:.4f}% (Jitna kam, utna behtar)")

--- Model ki Performance Check Karna (SMAPE) ---
Validation SMAPE Score: 57.5732% (Jitna kam, utna behtar)




In [None]:
print("--- Final Model ko Poore Data par Re-train Karna ---")
model.fit(X_train_final, y_log)
print("Final model re-training complete.")

print("\n--- Submission File Banana ---")
# Final test features par predict karein
test_preds_log = model.predict(X_test_final)
# Log transform ko reverse karein
final_prices = np.expm1(test_preds_log)
# Sunishchit karein ki koi bhi price negative na ho
final_prices[final_prices < 0] = 0

# Submission ke liye DataFrame banayein
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_prices
})
# CSV file save karein
submission_df.to_csv('test_out.csv', index=False)

print("\nSubmission file 'test_out.csv' taiyaar hai!")
print("File ka sample:")
print(submission_df.head())

--- Final Model ko Poore Data par Re-train Karna ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.984523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 605627
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 4996
[LightGBM] [Info] Start training from score 2.739217
Final model re-training complete.

--- Submission File Banana ---





Submission file 'test_out.csv' taiyaar hai!
File ka sample:
   sample_id      price
0     100179  17.106771
1     245611  17.484260
2     146263  28.837859
3      95658  11.714338
4      36806  19.668761


In [None]:
import re
import nltk
from nltk.corpus import stopwords

print("--- Step 1: Text Cleaning Shuru ---")
# nltk.download('stopwords') # Agar pehle se nahi kiya hai to is line ko uncomment karke chalayein
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    clean_words = [word for word in words if word not in stop_words]
    return ' '.join(clean_words)

train_df['cleaned_content'] = train_df['catalog_content'].apply(clean_text)
test_df['cleaned_content'] = test_df['catalog_content'].apply(clean_text)
print("Text cleaning poora hua. 'cleaned_content' column ban gaya hai.")

--- Step 1: Text Cleaning Shuru ---
Text cleaning poora hua. 'cleaned_content' column ban gaya hai.


In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

print("Libraries imported successfully!")

Libraries imported successfully!


In [6]:
# Apne dataset ka sahi path dein
train_df = pd.read_csv('../dataset/train.csv')
test_df = pd.read_csv('../dataset/test.csv')

print("Data loaded successfully!")

Data loaded successfully!


In [7]:
import nltk

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from tqdm import tqdm

# --- Model aur Tokenizer Load Karein ---
print("Loading DistilBERT model (yeh pehli baar mein thoda time lega)...")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# --- Data Taiyaar Karein ---
# train_df aur test_df pehle se loaded hone chahiye
all_text = pd.concat([train_df, test_df], ignore_index=True)['cleaned_content'].tolist()
all_embeddings = []

# --- Batch Processing Shuru Karein ---
# Batch size aapki RAM par depend karta hai. 32 ek safe value hai.
# Agar aapke paas 16GB ya zyada RAM hai to ise 64 kar sakte hain.
batch_size = 32 

print(f"\nStarting feature extraction with DistilBERT in batches of {batch_size}...")

for i in tqdm(range(0, len(all_text), batch_size)):
    batch = all_text[i:i + batch_size]
    
    # Text ko model ke format mein badlein
    inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=128)
    
    # Memory bachane ke liye torch.no_grad() ka istemal karein
    with torch.no_grad():
        outputs = model(**inputs)
        
    # Sentence ka embedding nikalne ke liye mean ka istemal karein
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    all_embeddings.append(embeddings)

# Sabhi batches ke embeddings ko ek saath jodein
final_embeddings = np.vstack(all_embeddings)

# In features ko ek file mein save kar lein, taaki dobara calculate na karna pade
np.save('distilbert_embeddings.npy', final_embeddings)

print("\nDistilBERT features banakar 'distilbert_embeddings.npy' file mein save ho gaye hain!")
print("Embeddings ka shape:", final_embeddings.shape)

Loading DistilBERT model (yeh pehli baar mein thoda time lega)...

Starting feature extraction with DistilBERT in batches of 32...


100%|██████████| 4688/4688 [3:33:27<00:00,  2.73s/it]  



DistilBERT features banakar 'distilbert_embeddings.npy' file mein save ho gaye hain!
Embeddings ka shape: (150000, 768)


In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# --- Step 1: Naye Features Load aur Combine Karein ---
print("--- Step 1: Naye DistilBERT aur Image Features ko Jodna ---")
# Save kiye gaye DistilBERT features ko load karein
distilbert_features = np.load('distilbert_embeddings.npy')

# Training aur Test features ko alag karein
bert_features_train = distilbert_features[:len(train_df)]
bert_features_test = distilbert_features[len(train_df):]

# Naye text features ko image features ke saath jodein (X_img_train pehle se bana hona chahiye)
X_train_final_new = np.concatenate([bert_features_train, X_img_train], axis=1)
X_test_final_new = np.concatenate([bert_features_test, X_img_test], axis=1)
print("Naye combined features taiyaar hain. Shape:", X_train_final_new.shape)


# --- BAAKI KA PROCESS BILKUL SAME RAHEGA ---

# --- Step 2: Model Training ki Taiyari ---
print("\n--- Step 2: Model Training ki Taiyari ---")
y = train_df['price']
y_log = np.log1p(y)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_final_new, y_log, test_size=0.2, random_state=42
)
print("Data training aur validation ke liye ready hai.")

# --- Step 3: Model ko Naye Features par Train Karna ---
print("\n--- Step 3: Model ko Naye Features par Train Karna ---")
model_new = lgb.LGBMRegressor(random_state=42)
model_new.fit(X_train_split, y_train_split)
print("Model training complete.")

# --- Step 4: Nayi Performance Check Karna (SMAPE) ---
print("\n--- Step 4: Nayi Performance Check Karna (SMAPE) ---")
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

val_preds_log = model_new.predict(X_val)
val_preds = np.expm1(val_preds_log)
y_val_orig = np.expm1(y_val)
smape_score = smape(y_val_orig, val_preds)
print(f"Naye Features ke saath Validation SMAPE Score: {smape_score:.4f}%")

# --- Step 5: Final Submission File Banana ---
print("\n--- Step 5: Final Model ko Poore Data par Re-train Karna ---")
model_new.fit(X_train_final_new, y_log)
print("Final model re-training complete.")

print("\n--- Step 6: Submission File Banana ---")
test_preds_log = model_new.predict(X_test_final_new)
final_prices = np.expm1(test_preds_log)
final_prices[final_prices < 0] = 0

submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_prices
})
submission_df.to_csv('test_out_bert_features.csv', index=False)
print("Nayi submission file 'test_out_bert_features.csv' taiyaar hai!")
print(submission_df.head())

--- Step 1: Naye DistilBERT aur Image Features ko Jodna ---
Naye combined features taiyaar hain. Shape: (75000, 2816)

--- Step 2: Model Training ki Taiyari ---
Data training aur validation ke liye ready hai.

--- Step 3: Model ko Naye Features par Train Karna ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.796956 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 768
[LightGBM] [Info] Start training from score 2.740904
Model training complete.

--- Step 4: Nayi Performance Check Karna (SMAPE) ---




Naye Features ke saath Validation SMAPE Score: 63.5097%

--- Step 5: Final Model ko Poore Data par Re-train Karna ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.830974 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 768
[LightGBM] [Info] Start training from score 2.739217
Final model re-training complete.

--- Step 6: Submission File Banana ---




Nayi submission file 'test_out_bert_features.csv' taiyaar hai!
   sample_id      price
0     100179  17.669095
1     245611  17.244876
2     146263  21.181437
3      95658  16.719877
4      36806  22.975845


In [None]:
import numpy as np
from scipy.sparse import hstack

# Apne pehle se banaye hue feature files ko load karein
# (Agar memory mein nahi hain)
# text_features_train = ... (TF-IDF wale features)
# X_img_train = np.load('X_img_train.npy')

# Features ko combine karein
X_train_final = hstack([text_features_train, X_img_train])
X_test_final = hstack([text_features_test, X_img_test])

# Target variable (y_log) taiyaar karein
y_log = np.log1p(train_df['price'])

print("Best features taiyaar hain. Shape:", X_train_final.shape)

Best features taiyaar hain. Shape: (75000, 7048)


In [None]:
import lightgbm as lgb
from sklearn.linear_model import Ridge
import numpy as np

# --- Model 1: LightGBM (Aapka powerful model) ---
print("Training LightGBM model...")
# Yahan aap apne tuned parameters bhi daal sakti hain agar aapke paas hain
lgbm_model = lgb.LGBMRegressor(random_state=42)
lgbm_model.fit(X_train_final, y_log)
# Test set par predict karein
lgbm_preds_log = lgbm_model.predict(X_test_final)


# --- Model 2: Ridge (Ek simple aur fast model) ---
print("Training Ridge model...")
# Alpha ek regularization parameter hai, 1.0 ek acchi default value hai
ridge_model = Ridge(alpha=1.0, random_state=42)
ridge_model.fit(X_train_final, y_log)
# Test set par predict karein
ridge_preds_log = ridge_model.predict(X_test_final)

print("Dono models train ho chuke hain!")


Training LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.217794 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 605627
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 4996
[LightGBM] [Info] Start training from score 2.739217




Training Ridge model...
Dono models train ho chuke hain!


In [None]:
# Dono models ki predictions ko original scale par wapis laayein
lgbm_prices = np.expm1(lgbm_preds_log)
ridge_prices = np.expm1(ridge_preds_log)

# Weighted Average lein
final_prices = (lgbm_prices * 0.75) + (ridge_prices * 0.25)

# Sunishchit karein ki koi price negative na ho
final_prices[final_prices < 0] = 0

print("Ensemble predictions taiyaar hain!")

Ensemble predictions taiyaar hain!


In [None]:
# Submission DataFrame banayein
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_prices
})

# CSV file save karein
submission_df.to_csv('test_out_ensemble_final.csv', index=False)

print("Aapki final submission file 'test_out_ensemble_final.csv' taiyaar hai!")
print("Jaldi se ise upload karein!")
print(submission_df.head())

Aapki final submission file 'test_out_ensemble_final.csv' taiyaar hai!
Jaldi se ise upload karein!
   sample_id      price
0     100179  17.169738
1     245611  16.259391
2     146263  27.383155
3      95658  11.627108
4      36806  22.903735


In [None]:
# Step 1: XGBoost install karein (agar nahi hai to)


import lightgbm as lgb
import xgboost as xgb
import numpy as np
import pandas as pd

# Note: Isse pehle X_train_final, X_test_final, aur y_log aapki memory mein hone chahiye

# --- Model 1: LightGBM ---
print("Training LightGBM model with tuned parameters...")
# Yeh kuch aam taur par accha perform karne wale parameters hain
lgbm_params = {
    'objective': 'regression_l1',
    'metric': 'rmse',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 31,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
}

lgbm_model = lgb.LGBMRegressor(**lgbm_params)
lgbm_model.fit(X_train_final, y_log)
lgbm_preds_log = lgbm_model.predict(X_test_final)
lgbm_prices = np.expm1(lgbm_preds_log)


# --- Model 2: XGBoost ---
print("\nTraining XGBoost model...")
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    tree_method='hist' # For faster training
)
xgb_model.fit(X_train_final, y_log)
xgb_preds_log = xgb_model.predict(X_test_final)
xgb_prices = np.expm1(xgb_preds_log)

# --- Step 3: Combine Predictions ---
print("\nCombining predictions from both models...")
# Dono powerful models hain, isliye 50-50% average lein
final_prices = (lgbm_prices * 0.5) + (xgb_prices * 0.5)
final_prices[final_prices < 0] = 0

# --- Step 4: Final Submission File ---
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_prices
})
submission_df.to_csv('test_out_lgbm_xgb_final.csv', index=False)

print("\nAapki ultimate ensemble submission file 'test_out_lgbm_xgb_final.csv' taiyaar hai!")
print("Ise turant submit karein!")
print(submission_df.head())

Training LightGBM model with tuned parameters...





Training XGBoost model...

Combining predictions from both models...

Aapki ultimate ensemble submission file 'test_out_lgbm_xgb_final.csv' taiyaar hai!
Ise turant submit karein!
   sample_id      price
0     100179  15.050247
1     245611  17.949546
2     146263  22.279576
3      95658  12.049100
4      36806  24.519219


In [None]:
from sklearn.model_selection import KFold
import lightgbm as lgb
import numpy as np

# Make sure these variables are already in memory:
# X_train_final, y_log, and your smape function

# --- THE FIX: Convert the matrix to CSR format for efficient slicing ---
X_train_final_csr = X_train_final.tocsr()
# ---------------------------------------------------------------------

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_smape_scores = []

print("--- 5-Fold Cross-Validation Starting ---")

# Loop through each fold
for fold, (train_index, val_index) in enumerate(kf.split(X_train_final_csr, y_log)):
    print(f"\n===== FOLD {fold+1} ===== ")
    
    # Slice the data for the current fold using the CSR matrix
    X_train_fold, X_val_fold = X_train_final_csr[train_index], X_train_final_csr[val_index]
    y_train_fold, y_val_fold = y_log.iloc[train_index], y_log.iloc[val_index]
    
    # Train the model
    model = lgb.LGBMRegressor(random_state=42)
    model.fit(X_train_fold, y_train_fold)
    
    # Calculate the SMAPE score for the fold
    val_preds_log = model.predict(X_val_fold)
    val_preds = np.expm1(val_preds_log)
    y_val_orig = np.expm1(y_val_fold)
    
    score = smape(y_val_orig, val_preds)
    oof_smape_scores.append(score)
    print(f"Fold {fold+1} SMAPE Score: {score:.4f}%")

# Print the average score across all folds
print(f"\n\n--- Final Result ---")
print(f"Average CV SMAPE Score: {np.mean(oof_smape_scores):.4f}%")

--- 5-Fold Cross-Validation Starting ---

===== FOLD 1 ===== 




Fold 1 SMAPE Score: 57.5732%

===== FOLD 2 ===== 




Fold 2 SMAPE Score: 56.5241%

===== FOLD 3 ===== 




Fold 3 SMAPE Score: 56.9172%

===== FOLD 4 ===== 




Fold 4 SMAPE Score: 55.9047%

===== FOLD 5 ===== 
Fold 5 SMAPE Score: 56.9815%


--- Final Result ---
Average CV SMAPE Score: 56.7801%




In [None]:
import os
import numpy as np
import pandas as pd

# File ka naam jismein humne features save kiye the
embedding_file = 'distilbert_embeddings.npy'

if os.path.exists(embedding_file):
    print(f"'{embedding_file}' file mil gayi. Features ko direct load kiya ja raha hai...")
    final_embeddings = np.load(embedding_file)
    print("DistilBERT features successfully load ho gaye hain!")
else:
    # Agar file nahi hai, to use banane ke liye lamba process chalana padega
    print(f"'{embedding_file}' file nahi mili. Kripya feature extraction wala cell dobara chalayein.")
    # (Yahan aapko feature extraction wala lamba code daalna hoga agar zaroorat pade)
    final_embeddings = None

if final_embeddings is not None:
    print("Final embeddings ka shape:", final_embeddings.shape)
    
    # In features ko train aur test mein baant lein
    bert_features_train = final_embeddings[:len(train_df)]
    bert_features_test = final_embeddings[len(train_df):]

    # Ab in naye text features ko image features ke saath jodein
    X_train_new_features = np.concatenate([bert_features_train, X_img_train], axis=1)
    
    print("Naye (DistilBERT + Image) features taiyaar hain!")
    print("Naye features ka shape:", X_train_new_features.shape)

'distilbert_embeddings.npy' file mil gayi. Features ko direct load kiya ja raha hai...
DistilBERT features successfully load ho gaye hain!
Final embeddings ka shape: (150000, 768)
Naye (DistilBERT + Image) features taiyaar hain!
Naye features ka shape: (75000, 2816)


In [None]:
# --- Step 0: Sabhi Zaroori Libraries ---
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from scipy.sparse import hstack, csr_matrix
import lightgbm as lgb
import xgboost as xgb
import gc # Garbage Collector ko import karein

# --- Step 1: Text Cleaning ---
print("--- Step 1: Text Cleaning ---")
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    clean_words = [word for word in words if word not in stop_words]
    return ' '.join(clean_words)

train_df['cleaned_content'] = train_df['catalog_content'].apply(clean_text)
test_df['cleaned_content'] = test_df['catalog_content'].apply(clean_text)
print("Text cleaning complete.")

# --- Step 2: Feature Generation and REDUCTION (Memory Optimized) ---
print("\n--- Step 2: Generating and Reducing Features for 8GB RAM ---")
# 2.1 - TF-IDF (Size Kam Kiya Gaya aur Memory Optimized)
print("Generating reduced TF-IDF features (1500)...")
# --- MEMORY ERROR FIX: dtype=np.float32 add kiya gaya hai ---
tfidf = TfidfVectorizer(max_features=1500, dtype=np.float32)
text_features_train = tfidf.fit_transform(train_df['cleaned_content'])
text_features_test = tfidf.transform(test_df['cleaned_content'])
# Memory free karein
del train_df['cleaned_content'], test_df['cleaned_content']
gc.collect()

# 2.2 - BERT Features (PCA se Size Kam Kiya Gaya)
print("Loading and reducing BERT features (768 -> 128)...")
distilbert_features = np.load('distilbert_embeddings.npy')
pca_bert = PCA(n_components=128, random_state=42)
bert_features_reduced = pca_bert.fit_transform(distilbert_features)
bert_features_train_reduced = bert_features_reduced[:len(train_df)]
bert_features_test_reduced = bert_features_reduced[len(train_df):]
del distilbert_features, pca_bert
gc.collect()

# 2.3 - Image Features (PCA se Size Kam Kiya Gaya)
print("Loading and reducing Image features (2048 -> 256)...")
X_img_train = np.load('X_img_train.npy')
X_img_test = np.load('X_img_test.npy')
img_features_full = np.vstack([X_img_train, X_img_test])
pca_img = PCA(n_components=256, random_state=42)
img_features_reduced = pca_img.fit_transform(img_features_full)
X_img_train_reduced = img_features_reduced[:len(train_df)]
X_img_test_reduced = img_features_reduced[len(train_df):]
del X_img_train, X_img_test, img_features_full, pca_img
gc.collect()

# 2.4 - Sabhi Chhote Features ko Combine Karna
print("Combining all reduced features...")
X_final_train = hstack([
    text_features_train, 
    csr_matrix(bert_features_train_reduced), 
    csr_matrix(X_img_train_reduced)
]).tocsr()
X_final_test = hstack([
    text_features_test, 
    csr_matrix(bert_features_test_reduced), 
    csr_matrix(X_img_test_reduced)
]).tocsr()
y_log = np.log1p(train_df['price'])
print("All features are ready! Final shape:", X_final_train.shape)
del text_features_train, text_features_test, bert_features_train_reduced, bert_features_test_reduced, X_img_train_reduced, X_img_test_reduced
gc.collect()

# --- Step 3: Ensemble Model Training ---
print("\n--- Step 3: Training Ensemble Models ---")
# (Baaki ka code wahi rahega)
# 3.1 - LightGBM
print("Training LightGBM model...")
lgbm_model = lgb.LGBMRegressor(random_state=42, n_estimators=1000, learning_rate=0.05, num_leaves=31)
lgbm_model.fit(X_final_train, y_log)
lgbm_preds_log = lgbm_model.predict(X_final_test)
lgbm_prices = np.expm1(lgbm_preds_log)

# 3.2 - XGBoost (Memory-Efficient)
print("Training XGBoost model...")
xgb_model = xgb.XGBRegressor(random_state=42, n_estimators=1000, learning_rate=0.05, max_depth=7, tree_method='hist', n_jobs=-1)
xgb_model.fit(X_final_train, y_log)
xgb_preds_log = xgb_model.predict(X_final_test)
xgb_prices = np.expm1(xgb_preds_log)

# --- Step 4: Combine Predictions ---
print("\n--- Step 4: Combining predictions ---")
final_prices = (lgbm_prices * 0.5) + (xgb_prices * 0.5)
final_prices[final_prices < 0] = 0

# --- Step 5: Final Submission File ---
print("\n--- Step 5: Creating Final Submission File ---")
submission_df = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': final_prices})
submission_df.to_csv('test_out_final_attempt.csv', index=False)

print("\nAapki final submission file 'test_out_final_attempt.csv' taiyaar hai!")
print(submission_df.head())

--- Step 1: Text Cleaning ---
Text cleaning complete.

--- Step 2: Generating and Reducing Features for 8GB RAM ---
Generating reduced TF-IDF features (1500)...
Loading and reducing BERT features (768 -> 128)...
Loading and reducing Image features (2048 -> 256)...


  self.explained_variance_ratio_ = self.explained_variance_ / total_var


Combining all reduced features...
All features are ready! Final shape: (75000, 1884)

--- Step 3: Training Ensemble Models ---
Training LightGBM model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.487842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 385577
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 1628
[LightGBM] [Info] Start training from score 2.739217




Training XGBoost model...

--- Step 4: Combining predictions ---

--- Step 5: Creating Final Submission File ---

Aapki final submission file 'test_out_final_attempt.csv' taiyaar hai!
   sample_id      price
0     100179  15.199087
1     245611  14.844592
2     146263  22.384439
3      95658   9.930439
4      36806  30.899461


In [None]:
import numpy as np
import lightgbm as lgb
import pandas as pd

# Make sure train_df and y_log are loaded in memory
print("Loading full BERT features to select the best ones...")
bert_features_train = np.load('distilbert_embeddings.npy')[:len(train_df)]
y_log = np.log1p(train_df['price'])

print("Training a temporary model to find important features...")
# Train a temporary LightGBM model
temp_model = lgb.LGBMRegressor(random_state=42)
temp_model.fit(bert_features_train, y_log)

# Get the feature importances from the model
feature_importances = temp_model.feature_importances_

# Find the indices of the top 256 most important features
top_n = 256
top_indices = np.argsort(feature_importances)[::-1][:top_n]

# Select only the top features from the full DistilBERT embeddings
distilbert_full = np.load('distilbert_embeddings.npy')
bert_features_selected = distilbert_full[:, top_indices]

# Save this new, smaller feature set to a file
np.save('bert_features_selected.npy', bert_features_selected)

print(f"\nTop {top_n} features have been selected and saved to 'bert_features_selected.npy'.")
print("New shape:", bert_features_selected.shape)

Loading full BERT features to select the best ones...
Training a temporary model to find important features...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.693804 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 768
[LightGBM] [Info] Start training from score 2.739217

Top 256 features have been selected and saved to 'bert_features_selected.npy'.
New shape: (150000, 256)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

print("--- Generating TF-IDF Features ---")

# Make sure the 'cleaned_content' column exists in your dataframes
# Use a reduced number of features to save memory
tfidf = TfidfVectorizer(max_features=1500, dtype=np.float32)

text_features_train = tfidf.fit_transform(train_df['cleaned_content'])
text_features_test = tfidf.transform(test_df['cleaned_content'])

print("TF-IDF features created successfully.")
print("Shape of text_features_train:", text_features_train.shape)

--- Generating TF-IDF Features ---
TF-IDF features created successfully.
Shape of text_features_train: (75000, 1500)


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Define the cleaning function
stop_words = set(stopwords.words('english'))
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    clean_words = [word for word in words if word not in stop_words]
    return ' '.join(clean_words)

# Apply the function to create the 'cleaned_content' column
train_df['cleaned_content'] = train_df['catalog_content'].apply(clean_text)
test_df['cleaned_content'] = test_df['catalog_content'].apply(clean_text)

print("'cleaned_content' column has been created successfully.")

'cleaned_content' column has been created successfully.


In [None]:
import numpy as np
import lightgbm as lgb
import pandas as pd

# Make sure train_df and y_log are loaded in memory
print("Loading full BERT features to select the best ones...")
bert_features_train = np.load('distilbert_embeddings.npy')[:len(train_df)]
y_log = np.log1p(train_df['price'])

print("Training a temporary model to find important features...")
# Train a temporary LightGBM model
temp_model = lgb.LGBMRegressor(random_state=42)
temp_model.fit(bert_features_train, y_log)

# Get the feature importances from the model
feature_importances = temp_model.feature_importances_

# Find the indices of the top 256 most important features
top_n = 256
top_indices = np.argsort(feature_importances)[::-1][:top_n]

# Select only the top features from the full DistilBERT embeddings
distilbert_full = np.load('distilbert_embeddings.npy')
bert_features_selected = distilbert_full[:, top_indices]

# Save this new, smaller feature set to a file
np.save('bert_features_selected.npy', bert_features_selected)

print(f"\nTop {top_n} features have been selected and saved to 'bert_features_selected.npy'.")
print("New shape:", bert_features_selected.shape)

Loading full BERT features to select the best ones...
Training a temporary model to find important features...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.726773 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 768
[LightGBM] [Info] Start training from score 2.739217

Top 256 features have been selected and saved to 'bert_features_selected.npy'.
New shape: (150000, 256)


In [None]:
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import lightgbm as lgb
import xgboost as xgb
import gc

# --- Step 2.1: Sabhi Best Features ko Load aur Combine Karein ---
print("--- Loading and Combining all best features ---")
# TF-IDF features (1500 wale)
# text_features_train, text_features_test

# Selected BERT features (256 wale)
bert_selected = np.load('bert_features_selected.npy')
bert_train_selected = bert_selected[:len(train_df)]
bert_test_selected = bert_selected[len(train_df):]

# Reduced Image features (256 wale)
# X_img_train_reduced, X_img_test_reduced

X_final_train = hstack([text_features_train, csr_matrix(bert_train_selected), csr_matrix(X_img_train_reduced)]).tocsr()
X_final_test = hstack([text_features_test, csr_matrix(bert_test_selected), csr_matrix(X_img_test_reduced)]).tocsr()
y_log = np.log1p(train_df['price'])
print("All best features combined. Final shape:", X_final_train.shape)
gc.collect()

# --- Step 2.2: Stacking Ensemble ---
print("\n--- Starting K-Fold Stacking ---")
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_preds_lgbm = np.zeros(X_final_train.shape[0])
oof_preds_xgb = np.zeros(X_final_train.shape[0])
test_preds_lgbm = []
test_preds_xgb = []

lgbm = lgb.LGBMRegressor(random_state=42)
xgboost = xgb.XGBRegressor(random_state=42, n_jobs=-1, tree_method='hist')

for fold, (train_index, val_index) in enumerate(kf.split(X_final_train, y_log)):
    print(f"\n===== FOLD {fold+1} ===== ")
    # ... (Yahan Stacking ka poora code daalein, jo pehle diya gaya tha) ...
    # ... Training LGBM, getting OOF and Test preds ...
    # ... Training XGBoost, getting OOF and Test preds ...
    # ... Free up memory with gc.collect() at the end of the loop ...

# --- Step 2.3: Meta-Model ---
# ... (Yahan Meta-Model (Ridge) ko train karne aur final prediction lene ka code daalein) ...
# Stacking_prices = ... (Final predictions from the stacking model)

--- Loading and Combining all best features ---
All best features combined. Final shape: (75000, 2012)

--- Starting K-Fold Stacking ---

===== FOLD 1 ===== 

===== FOLD 2 ===== 

===== FOLD 3 ===== 

===== FOLD 4 ===== 

===== FOLD 5 ===== 


In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Load your original, large image feature files
print("Loading original image features...")
X_img_train = np.load('X_img_train.npy')
X_img_test = np.load('X_img_test.npy')

# Combine them to apply PCA consistently
img_features_full = np.vstack([X_img_train, X_img_test])

# Reduce the dimensionality using PCA (e.g., from 2048 to 256)
print("Reducing image features using PCA...")
pca_img = PCA(n_components=256, random_state=42)
img_features_reduced = pca_img.fit_transform(img_features_full)

# Split the reduced features back into training and testing sets
X_img_train_reduced = img_features_reduced[:len(train_df)]
X_img_test_reduced = img_features_reduced[len(train_df):]

print("Reduced image features are ready!")
print("New shape:", X_img_train_reduced.shape)

Loading original image features...
Reducing image features using PCA...


  self.explained_variance_ratio_ = self.explained_variance_ / total_var


Reduced image features are ready!
New shape: (75000, 256)


In [22]:
# Ek simple baseline (jaise sabhi products ki average price)
average_price = np.expm1(y_log).mean()

# Stacking model ke results ko is simple average ke saath thoda sa "blend" karein
final_blended_prices = stacking_prices * 0.99 + average_price * 0.01

# Final submission file banayein
submission_df = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': final_blended_prices})
submission_df.to_csv('test_out_advanced_final.csv', index=False)
print("Aapki advanced submission file 'test_out_advanced_final.csv' taiyaar hai!")

NameError: name 'y_log' is not defined

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import lightgbm as lgb
import xgboost as xgb
import gc

# Ensure these variables are in memory before running:
# X_final_train, X_final_test, y_log

print("--- Starting K-Fold Stacking Ensemble ---")
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# To store predictions for the meta-model
oof_preds_lgbm = np.zeros(X_final_train.shape[0])
oof_preds_xgb = np.zeros(X_final_train.shape[0])

# To store test predictions from each fold
test_preds_lgbm = []
test_preds_xgb = []

# Initialize base models
lgbm = lgb.LGBMRegressor(random_state=42)
xgboost = xgb.XGBRegressor(random_state=42, n_jobs=-1, tree_method='hist')

for fold, (train_index, val_index) in enumerate(kf.split(X_final_train, y_log)):
    print(f"\n===== FOLD {fold+1} ===== ")
    X_train_fold, X_val_fold = X_final_train[train_index], X_final_train[val_index]
    y_train_fold, y_val_fold = y_log.iloc[train_index], y_log.iloc[val_index]
    
    # Train and predict with LightGBM
    print("Training LightGBM...")
    lgbm.fit(X_train_fold, y_train_fold)
    oof_preds_lgbm[val_index] = lgbm.predict(X_val_fold)
    test_preds_lgbm.append(lgbm.predict(X_final_test))
    
    # Train and predict with XGBoost
    print("Training XGBoost...")
    xgboost.fit(X_train_fold, y_train_fold)
    oof_preds_xgb[val_index] = xgboost.predict(X_val_fold)
    test_preds_xgb.append(xgboost.predict(X_final_test))
    
    # Clean up memory
    del X_train_fold, X_val_fold, y_train_fold, y_val_fold
    gc.collect()

print("\n--- Training Meta-Model ---")
# Create training data for the meta-model
X_meta_train = np.column_stack([oof_preds_lgbm, oof_preds_xgb])

# Train the meta-model
meta_model = Ridge()
meta_model.fit(X_meta_train, y_log)

print("\n--- Making Final Predictions ---")
# Average the test predictions from each fold for the base models
final_lgbm_preds = np.mean(test_preds_lgbm, axis=0)
final_xgb_preds = np.mean(test_preds_xgb, axis=0)

# Create test data for the meta-model
X_meta_test = np.column_stack([final_lgbm_preds, final_xgb_preds])

# Get final predictions from the meta-model
final_preds_log = meta_model.predict(X_meta_test)
stacking_prices = np.expm1(final_preds_log)
stacking_prices[stacking_prices < 0] = 0

print("\nStacking predictions are ready and stored in the 'stacking_prices' variable!")

--- Starting K-Fold Stacking Ensemble ---


NameError: name 'X_final_train' is not defined

In [23]:
import os
import numpy as np

# File names for saved features
img_train_file = 'X_img_train.npy'
img_test_file = 'X_img_test.npy'

if os.path.exists(img_train_file) and os.path.exists(img_test_file):
    print("Saved image features found! Loading from files...")
    X_img_train = np.load(img_train_file)
    X_img_test = np.load(img_test_file)
    print("Image features loaded successfully.")
else:
    print("Saved image features not found. Starting extraction process (this will take time)...")
    
    # Yahan image feature extraction ka poora lamba code daalein
    # (jismein 'extract_image_features' function aur ThreadPoolExecutor ka istemal hota hai)
    # ...
    # ... (Feature extraction code) ...
    # ...
    
    # Process ke ant mein, results ko save karein
    np.save(img_train_file, X_img_train)
    np.save(img_test_file, X_img_test)
    print("\nImage feature extraction complete AND results have been saved for future use!")

print("Shape of training image features:", X_img_train.shape)

Saved image features found! Loading from files...
Image features loaded successfully.
Shape of training image features: (75000, 2048)


In [24]:
import os
import numpy as np

embedding_file = 'distilbert_embeddings.npy'

if os.path.exists(embedding_file):
    print(f"'{embedding_file}' found! Loading features directly...")
    final_embeddings = np.load(embedding_file)
    print("DistilBERT features loaded successfully.")
else:
    print(f"'{embedding_file}' not found. Starting extraction process (this will take a long time)...")
    
    # Yahan DistilBERT feature extraction ka poora lamba code daalein
    # (jismein tokenizer, model, aur batch processing ka istemal hota hai)
    # ...
    # ... (DistilBERT code) ...
    # ...
    
    np.save(embedding_file, final_embeddings)
    print(f"\nDistilBERT features extracted AND saved to '{embedding_file}' for future use!")

# Saved ya newly created features ko train/test mein baant lein
bert_features_train = final_embeddings[:len(train_df)]
bert_features_test = final_embeddings[len(train_df):]
print("Shape of training text features:", bert_features_train.shape)

'distilbert_embeddings.npy' found! Loading features directly...
DistilBERT features loaded successfully.
Shape of training text features: (75000, 768)
