In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import os

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
drive_path = '/content/drive/My Drive'

In [4]:
all_image_features = np.load(os.path.join(drive_path, 'all_image_features.npy'), allow_pickle=True).item()

In [5]:
train_df = pd.read_csv('train_preprocessed.csv')
test_df = pd.read_csv('test_preprocessed.csv')

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from scipy.sparse import hstack

In [7]:
image_filenames = list(all_image_features.keys())
image_feature_vectors = np.array(list(all_image_features.values()))

In [8]:
print(f"Shape of image feature vectors: {image_feature_vectors.shape}")

Shape of image feature vectors: (140564, 2048)


In [9]:
print(f"Number of image filenames: {len(image_filenames)}")

Number of image filenames: 140564


In [10]:
image_features_df = pd.DataFrame(image_feature_vectors, index=image_filenames)
image_features_df.index.name = 'filename'

In [11]:
image_features_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
81HBeypM9OL.jpg,0.345448,0.45605,0.988337,0.188612,1.000779,0.882697,0.28463,0.178169,0.913208,0.174903,...,0.343704,0.365218,0.503408,0.772206,0.450264,0.022612,0.786766,0.19878,0.568652,0.55172
810csPIv4ML.jpg,0.454637,2.012997,0.054166,0.197275,1.565557,0.391016,0.379122,0.0,0.899287,0.111666,...,1.13299,0.085762,0.614638,0.066585,0.42178,0.064931,0.826992,0.535516,0.021,0.137041
6101ttR-AYL.jpg,0.484811,1.458574,0.069766,0.000352,0.866891,0.236085,0.923649,0.148647,1.097531,0.172585,...,0.954126,0.139147,0.311189,0.424142,0.037661,0.162902,0.316644,0.052916,0.537284,1.053032
61NBIHn2h2L.jpg,0.07306,2.343555,0.076419,0.007601,0.797889,0.610318,0.472866,0.150804,0.255709,0.307382,...,1.641964,0.236993,0.58828,0.240443,0.023469,0.123733,0.721824,0.131618,0.071839,0.093984
71zXpZMpAXL.jpg,0.718636,0.452312,0.414025,0.102791,0.555732,1.55647,0.29505,0.192128,0.072017,0.022006,...,0.230032,0.054227,0.802946,0.030594,0.414451,0.012609,0.181495,0.134494,0.91455,0.180633


In [12]:
train_df.shape

(75000, 7)

In [13]:
test_df.shape

(75000, 4)

In [14]:
all_df = pd.concat([train_df, test_df], ignore_index=True)

In [15]:
all_df.shape

(150000, 7)

In [16]:
image_features_df.shape

(140564, 2048)

In [17]:
all_df['price'].isna().sum()

np.int64(75000)

In [18]:
all_df['filename'] = all_df['image_link'].apply(lambda x: os.path.basename(x))

In [19]:
final_all_df = all_df.merge(image_features_df, on='filename', how='left')

In [20]:
final_all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Columns: 2056 entries, sample_id to 2047
dtypes: float32(2048), float64(3), int64(1), object(4)
memory usage: 1.2+ GB


In [21]:
final_all_df.shape

(150000, 2056)

In [22]:
image_feature_cols = [col for col in final_all_df.columns if isinstance(col, (int, float))]

In [23]:
non_image_feature_cols = [col for col in final_all_df.columns if not isinstance(col, (int, float))]

# Print the list of image feature columns (a small sample)
print("--- Image Feature Columns (First 5) ---")
print(image_feature_cols[:5])

# Print the list of non-image feature columns (all of them)
print("\n--- Non-Image Feature Columns ---")
print(non_image_feature_cols)

--- Image Feature Columns (First 5) ---
[0, 1, 2, 3, 4]

--- Non-Image Feature Columns ---
['sample_id', 'catalog_content', 'image_link', 'price', 'word_count', 'log_price', 'clean_catalog_content', 'filename']


In [24]:
final_all_df[image_feature_cols] = final_all_df[image_feature_cols].fillna(0)

In [25]:
test_ids = test_df['sample_id'].tolist()

# Split the data back into final train and test sets
train_df_final = final_all_df.loc[~final_all_df['sample_id'].isin(test_ids)].reset_index(drop=True)
test_df_final = final_all_df.loc[final_all_df['sample_id'].isin(test_ids)].reset_index(drop=True)

print(f"Final training set shape: {train_df_final.shape}")
print(f"Final test set shape: {test_df_final.shape}")

Final training set shape: (75000, 2056)
Final test set shape: (75000, 2056)


In [26]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack


In [27]:
X_train_text = train_df_final['clean_catalog_content']
X_test_text = test_df_final['clean_catalog_content']
y_train_log = train_df_final['log_price']

In [28]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, stop_words='english')
X_train_text_tfidf = vectorizer.fit_transform(X_train_text)
X_test_text_tfidf = vectorizer.transform(X_test_text)

In [29]:
image_feature_cols = [col for col in train_df_final.columns if isinstance(col, (int, float))]
X_train_image = train_df_final[image_feature_cols].values
X_test_image = test_df_final[image_feature_cols].values

In [30]:
print("Training model on Text features...")
text_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)
text_model.fit(X_train_text_tfidf, train_df_final['log_price'])
text_preds_log = text_model.predict(X_test_text_tfidf)

Training model on Text features...




In [31]:
print("Training model on Image features...")
image_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Make sure X_train_image and X_test_image are dense NumPy arrays
image_feature_cols = [col for col in train_df_final.columns if isinstance(col, (int, float))]
X_train_image = train_df_final[image_feature_cols].values
X_test_image = test_df_final[image_feature_cols].values

image_model.fit(X_train_image, train_df_final['log_price'])
print("Image model training complete!")

Training model on Image features...
Image model training complete!


In [32]:
# Assuming you have the text_preds_log and image_preds_log from the previous steps

# Make predictions with the image model on the test data
image_preds_log = image_model.predict(X_test_image)

# Combine the predictions from both models
# We'll use a simple average (50/50 split)
print("Combining predictions...")
final_predictions_log = (text_preds_log * 0.5) + (image_preds_log * 0.5)

# Inverse transform the predictions from the log scale
final_predictions = np.expm1(final_predictions_log)

# Ensure all predictions are positive, as required
final_predictions[final_predictions < 0] = 0.01



Combining predictions...


In [33]:
# Create the final submission file
submission_df = pd.DataFrame({
    'sample_id': test_df_final['sample_id'],
    'price': final_predictions
})

# Save the submission file
submission_df.to_csv('final_submission.csv', index=False)
print("Final submission file created successfully as 'final_submission.csv'!")

Final submission file created successfully as 'final_submission.csv'!


In [37]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from scipy.sparse import hstack
import os

# Assuming you have successfully created train_df_final as planned.

# Select all features from the final combined DataFrame
# We will drop the target variable and other columns not used for prediction
X_all_features = train_df_final.drop(['price', 'log_price', 'sample_id', 'image_link', 'word_count', 'filename'], axis=1)

# The target variable (log-transformed price)
y_all_log = train_df_final['log_price']

# Split the data into a training set and a validation set
X_train_subset, X_val, y_train_log_subset, y_val_log = train_test_split(
    X_all_features,
    y_all_log,
    test_size=0.2,
    random_state=42
)

print(f"New Training set size: {X_train_subset.shape[0]} rows")
print(f"Validation set size: {X_val.shape[0]} rows")

# Now, you can correctly separate the features for the models
# The 'clean_catalog_content' column exists in both X_train_subset and X_val

# Create text features for training and validation
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, stop_words='english')
X_train_subset_text_tfidf = vectorizer.fit_transform(X_train_subset['clean_catalog_content'])
X_val_text_tfidf = vectorizer.transform(X_val['clean_catalog_content'])

# Get image features for training and validation
image_feature_cols = [col for col in X_train_subset.columns if isinstance(col, (int, float))]
X_train_subset_image = X_train_subset[image_feature_cols].values
X_val_image = X_val[image_feature_cols].values

# The rest of your training and SMAPE calculation code will now work correctly.
# You can copy and paste the training and prediction code from previous messages.

New Training set size: 60000 rows
Validation set size: 15000 rows


In [38]:
# Retrain the text model on the smaller training subset
text_model.fit(X_train_subset_text_tfidf, y_train_log_subset)
text_preds_val_log = text_model.predict(X_val_text_tfidf)

# Retrain the image model on the smaller training subset
image_model.fit(X_train_subset_image, y_train_log_subset)
image_preds_val_log = image_model.predict(X_val_image)

# Combine the predictions
combined_preds_val_log = (text_preds_val_log * 0.5) + (image_preds_val_log * 0.5)

# Inverse transform to get the final prices
combined_preds_val = np.expm1(combined_preds_val_log)
combined_preds_val[combined_preds_val < 0] = 0.01

# Get the actual prices for comparison
y_val = np.expm1(y_val_log)

# Calculate SMAPE
smape = smape_score(y_val, combined_preds_val)
print(f"Your model's SMAPE score on the validation set is: {smape:.2f}%")



NameError: name 'smape_score' is not defined

In [39]:
def smape_score(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    smape = np.mean(np.abs(y_pred - y_true) / denominator) * 100
    return smape

# Calculate SMAPE
smape = smape_score(y_val, combined_preds_val)
print(f"Your model's SMAPE score on the validation set is: {smape:.2f}%")

Your model's SMAPE score on the validation set is: 56.00%
