In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Location of the precomputed image-feature file. It can be overridden with the
# IMAGE_FEATURE_PATH environment variable so the notebook is not tied to one
# machine; the fallback is the original local path.
image_feature_path = os.environ.get(
    "IMAGE_FEATURE_PATH",
    "/Users/sanskarparab/Downloads/Amazon ML/all_image_features.npy",
)

In [3]:
# SECURITY: allow_pickle=True makes np.load unpickle the stored object, which
# can execute arbitrary code -- only use this with a trusted feature file.
# .item() unwraps the single-element array into the Python object it holds
# (used below as a mapping through .keys() / .values()).
all_image_features = np.load(image_feature_path, allow_pickle=True).item()

In [4]:
# Read the preprocessed train and test tables (paths are relative to the
# notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_preprocessed.csv', 'test_preprocessed.csv')
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [6]:
# Split the mapping into two parallel sequences: its keys and an array built
# from its values. Both follow the mapping's iteration order, so position i in
# one corresponds to position i in the other.
image_filenames = [*all_image_features.keys()]
image_feature_vectors = np.array([*all_image_features.values()])

In [7]:
# Sanity check: report the shape of the stacked feature array.
print(f"Shape of image feature vectors: {image_feature_vectors.shape}")

Shape of image feature vectors: (140564, 2048)


In [8]:
# Sanity check: the number of filenames should match the first dimension of
# the feature array printed above.
print(f"Number of image filenames: {len(image_filenames)}")

Number of image filenames: 140564


In [9]:
# Wrap the feature array in a DataFrame whose index holds the filenames; the
# index is named 'filename' so it can be referred to by that name later.
image_features_df = pd.DataFrame(
    image_feature_vectors,
    index=pd.Index(image_filenames, name='filename'),
)

In [10]:
# Preview the first rows of the image-feature table.
image_features_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
81HBeypM9OL.jpg,0.345448,0.45605,0.988337,0.188612,1.000779,0.882697,0.28463,0.178169,0.913208,0.174903,...,0.343704,0.365218,0.503408,0.772206,0.450264,0.022612,0.786766,0.19878,0.568652,0.55172
810csPIv4ML.jpg,0.454637,2.012997,0.054166,0.197275,1.565557,0.391016,0.379122,0.0,0.899287,0.111666,...,1.13299,0.085762,0.614638,0.066585,0.42178,0.064931,0.826992,0.535516,0.021,0.137041
6101ttR-AYL.jpg,0.484811,1.458574,0.069766,0.000352,0.866891,0.236085,0.923649,0.148647,1.097531,0.172585,...,0.954126,0.139147,0.311189,0.424142,0.037661,0.162902,0.316644,0.052916,0.537284,1.053032
61NBIHn2h2L.jpg,0.07306,2.343555,0.076419,0.007601,0.797889,0.610318,0.472866,0.150804,0.255709,0.307382,...,1.641964,0.236993,0.58828,0.240443,0.023469,0.123733,0.721824,0.131618,0.071839,0.093984
71zXpZMpAXL.jpg,0.718636,0.452312,0.414025,0.102791,0.555732,1.55647,0.29505,0.192128,0.072017,0.022006,...,0.230032,0.054227,0.802946,0.030594,0.414451,0.012609,0.181495,0.134494,0.91455,0.180633


In [11]:
# (rows, columns) of the training table.
train_df.shape

(75000, 7)

In [12]:
# (rows, columns) of the test table.
test_df.shape

(75000, 4)

In [13]:
# Stack train and test rows into one frame so the image features can be
# attached to both in a single merge. ignore_index=True renumbers the rows
# 0..n-1; columns present in only one input are filled with NaN in the other.
all_df = pd.concat([train_df, test_df], ignore_index=True)

In [14]:
# (rows, columns) of the stacked train + test frame.
all_df.shape

(150000, 7)

In [15]:
# (rows, columns) of the image-feature table, for comparison with all_df.
image_features_df.shape

(140564, 2048)

In [16]:
# Count rows without a price. NOTE(review): expected to equal the number of
# test rows, presumably because test_df has no 'price' column -- confirm.
all_df['price'].isna().sum()

75000

In [17]:
# Derive the join key: the last path component of each image link.
all_df['filename'] = all_df['image_link'].map(os.path.basename)

In [18]:
# Left-join the image features onto every row by 'filename'; rows with no
# matching feature vector keep NaN in the feature columns.
final_all_df = pd.merge(all_df, image_features_df, how='left', on='filename')

In [19]:
# Summarise dtypes and memory usage of the merged frame.
final_all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Columns: 2056 entries, sample_id to 2047
dtypes: float32(2048), float64(3), int64(1), object(4)
memory usage: 1.2+ GB


In [20]:
# (rows, columns) of the merged frame; the row count should match all_df.
final_all_df.shape

(150000, 2056)

In [21]:
# Image-feature columns are the ones whose label is a number (int or float)
# rather than a string.
image_feature_cols = list(
    filter(lambda label: isinstance(label, (int, float)), final_all_df.columns)
)

In [22]:
# Every column whose label is not an int/float is treated as a non-image column.
non_image_feature_cols = list(
    filter(lambda label: not isinstance(label, (int, float)), final_all_df.columns)
)

# Show a short sample of the image-feature labels...
print("--- Image Feature Columns (First 5) ---", image_feature_cols[:5], sep="\n")

# ...and the complete list of the remaining columns.
print("\n--- Non-Image Feature Columns ---", non_image_feature_cols, sep="\n")

--- Image Feature Columns (First 5) ---
[0, 1, 2, 3, 4]

--- Non-Image Feature Columns ---
['sample_id', 'catalog_content', 'image_link', 'price', 'word_count', 'log_price', 'clean_catalog_content', 'filename']


In [23]:
# Rows that found no match in the left merge have NaN image features; replace
# those NaNs with 0 so every row has a complete numeric feature vector.
final_all_df[image_feature_cols] = final_all_df[image_feature_cols].fillna(0)

In [24]:
# Recover the original partition: a row belongs to the test set exactly when
# its sample_id appears in test_df. The membership mask is computed once and
# reused for both halves.
test_ids = test_df['sample_id'].tolist()
_in_test = final_all_df['sample_id'].isin(test_ids)

train_df_final = final_all_df[~_in_test].reset_index(drop=True)
test_df_final = final_all_df[_in_test].reset_index(drop=True)

print(f"Final training set shape: {train_df_final.shape}")
print(f"Final test set shape: {test_df_final.shape}")

Final training set shape: (75000, 2056)
Final test set shape: (75000, 2056)


In [25]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# Remove the targets and the identifier/bookkeeping columns; the text columns
# and the image-feature columns remain as model inputs.
X_all_features = train_df_final.drop(
    columns=['price', 'log_price', 'sample_id', 'image_link', 'word_count', 'filename']
)

In [27]:
# Regression target: the 'log_price' column of the training rows.
y_all_log = train_df_final['log_price']

In [28]:
from scipy.sparse import hstack

In [45]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; random_state fixes the
# shuffle so the split is the same on every run.
X_train_subset, X_val, y_train_log_subset, y_val_log = train_test_split(
    X_all_features,
    y_all_log,
    test_size=0.2,
    random_state=42
)

In [46]:
# Text features: unigrams and bigrams, English stop words removed, vocabulary
# limited to 10,000 terms by max_features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, stop_words='english')

# Fit on the training subset only, so the vocabulary and IDF weights never see
# validation rows. Missing text (e.g. an empty string that came back from CSV
# as NaN) would make the vectorizer raise, so it is treated as an empty
# document; this is a no-op when the column has no missing values.
X_train_subset_text_tfidf = vectorizer.fit_transform(
    X_train_subset['clean_catalog_content'].fillna('')
)

# Validation text is only transformed, using the vectorizer fitted above.
X_val_text_tfidf = vectorizer.transform(X_val['clean_catalog_content'].fillna(''))

# --- Image features for training and validation ---
# Image-feature columns are the ones whose label is a number, not a string.
image_feature_cols = [col for col in X_train_subset.columns if isinstance(col, (int, float))]
X_train_subset_image = X_train_subset[image_feature_cols].values
X_val_image = X_val[image_feature_cols].values

print("Vectorizer and feature matrices created successfully!")

Vectorizer and feature matrices created successfully!


In [32]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-macosx_12_0_arm64.whl (2.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m0:01[0m:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-3.0.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [47]:
from xgboost import XGBRegressor

In [48]:
def smape_score(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)) * 100``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        SMAPE percentage. A pair where truth and prediction are both zero
        contributes 0 (a perfect prediction) rather than NaN.
    """
    # Work on plain arrays so pandas inputs are compared by position, never
    # re-aligned by index label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Where the denominator is 0 both values are 0, so the error is 0 as well;
    # leave the pre-filled 0 in place instead of evaluating 0/0.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(abs_error),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

In [50]:
# Import XGBoost
from xgboost import XGBRegressor
import numpy as np

# Inputs prepared by earlier cells:
#   X_train_subset_text_tfidf, X_val_text_tfidf  - TF-IDF text matrices
#   X_train_subset_image, X_val_image            - image feature arrays
#   y_train_log_subset, y_val_log                - targets from the log_price column

# Both single-modality models are built from the same hyperparameters.
_single_modality_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=7,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=42,
    n_jobs=-1,
)

# --- Model 1: text features only ---
print("Training XGBoost model on Text features...")
text_model = XGBRegressor(**_single_modality_params)
text_model.fit(
    X_train_subset_text_tfidf,
    y_train_log_subset,
    eval_set=[(X_val_text_tfidf, y_val_log)],
    verbose=False,
)
text_preds_val_log = text_model.predict(X_val_text_tfidf)

# --- Model 2: image features only ---
print("Training XGBoost model on Image features...")
image_model = XGBRegressor(**_single_modality_params)
image_model.fit(
    X_train_subset_image,
    y_train_log_subset,
    eval_set=[(X_val_image, y_val_log)],
    verbose=False,
)
image_preds_val_log = image_model.predict(X_val_image)

# Blend the two models with equal weights, still on the log scale.
combined_preds_val_log = (text_preds_val_log * 0.5) + (image_preds_val_log * 0.5)

# Undo the log transform with expm1, then replace any negative price with 0.01.
combined_preds_val = np.expm1(combined_preds_val_log)
combined_preds_val[combined_preds_val < 0] = 0.01

# Score on the original price scale.
smape = smape_score(np.expm1(y_val_log), combined_preds_val)
print(f"Your combined XGBoost model's SMAPE score on the validation set is: {smape:.2f}%")

Training XGBoost model on Text features...
Training XGBoost model on Image features...
Your combined XGBoost model's SMAPE score on the validation set is: 57.56%


In [52]:
# Test-set inference with the two single-modality models.

# Text: reuse the already-fitted vectorizer (transform only, no refit).
X_test_text_tfidf = vectorizer.transform(test_df_final['clean_catalog_content'])

# Image: select the same feature columns that were used for training.
X_test_image = test_df_final[image_feature_cols].values

# Per-model predictions, then an equal-weight blend (these weights can be tuned).
text_preds_test_log = text_model.predict(X_test_text_tfidf)
image_preds_test_log = image_model.predict(X_test_image)
final_predictions_log = (text_preds_test_log * 0.5) + (image_preds_test_log * 0.5)

# Undo the log transform with expm1 and replace negative prices with 0.01.
final_predictions = np.expm1(final_predictions_log)
final_predictions[final_predictions < 0] = 0.01

# Two-column submission: sample_id and the predicted price.
submission_df = test_df_final[['sample_id']].assign(price=final_predictions)
submission_df.to_csv('final_submission_final.csv', index=False)

print("✅ Final submission file created successfully as 'final_submission_final.csv'!")

✅ Final submission file created successfully as 'final_submission_final.csv'!


In [53]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from scipy.sparse import hstack, csr_matrix

# === Joint text + image model ===

# Build the joint design matrices: TF-IDF columns followed by the image
# feature columns (converted to sparse so hstack can join them).
X_train_combined, X_val_combined = (
    hstack([text_part, csr_matrix(image_part)])
    for text_part, image_part in (
        (X_train_subset_text_tfidf, X_train_subset_image),
        (X_val_text_tfidf, X_val_image),
    )
)

print("Combined training and validation features successfully!")

# --- Fit ---
print("Training final XGBoost model on combined features...")

_combined_params = dict(
    n_estimators=2500,
    learning_rate=0.02,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
combined_model = XGBRegressor(**_combined_params)

# The validation set is passed as eval_set; verbose=200 logs its metric every
# 200 boosting rounds.
combined_model.fit(
    X_train_combined,
    y_train_log_subset,
    eval_set=[(X_val_combined, y_val_log)],
    verbose=200,
)

# --- Validation score on the original price scale ---
preds_val_log = combined_model.predict(X_val_combined)
preds_val = np.expm1(preds_val_log)
preds_val[preds_val < 0] = 0.01  # replace negative prices with 0.01

smape = smape_score(np.expm1(y_val_log), preds_val)
print(f"✅ SMAPE on validation set: {smape:.2f}%")

# --- Test predictions ---
# Same column layout as in training: TF-IDF first, image features second.
X_test_combined = hstack([X_test_text_tfidf, csr_matrix(X_test_image)])

final_predictions_log = combined_model.predict(X_test_combined)
final_predictions = np.expm1(final_predictions_log)
final_predictions[final_predictions < 0] = 0.01

# --- Submission file: sample_id and predicted price ---
submission_df = test_df_final[['sample_id']].assign(price=final_predictions)

submission_df.to_csv('final_submission_final2.csv', index=False)
print("🎯 Final submission file saved as 'final_submission_final2.csv'!")

Combined training and validation features successfully!
Training final XGBoost model on combined features...
[0]	validation_0-rmse:0.94739
[200]	validation_0-rmse:0.75481
[400]	validation_0-rmse:0.73562
[600]	validation_0-rmse:0.72616
[800]	validation_0-rmse:0.71974
[1000]	validation_0-rmse:0.71509
[1200]	validation_0-rmse:0.71121
[1400]	validation_0-rmse:0.70830
[1600]	validation_0-rmse:0.70567
[1800]	validation_0-rmse:0.70364
[2000]	validation_0-rmse:0.70201
[2200]	validation_0-rmse:0.70043
[2400]	validation_0-rmse:0.69904
[2499]	validation_0-rmse:0.69856
✅ SMAPE on validation set: 53.70%
🎯 Final submission file saved as 'final_submission_final2.csv'!
