In [2]:
# Colab-only step: mount Google Drive at /content/drive so the challenge
# archive stored under MyDrive can be read by the unzip cell below.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Make sure the path is correct!
!unzip /content/drive/MyDrive/68e8d1d70b66d_student_resource.zip -d /content/challenge

Archive:  /content/drive/MyDrive/68e8d1d70b66d_student_resource.zip
   creating: /content/challenge/student_resource/
  inflating: /content/challenge/__MACOSX/._student_resource  
  inflating: /content/challenge/student_resource/sample_code.py  
  inflating: /content/challenge/__MACOSX/student_resource/._sample_code.py  
  inflating: /content/challenge/student_resource/Documentation_template.md  
  inflating: /content/challenge/__MACOSX/student_resource/._Documentation_template.md  
  inflating: /content/challenge/student_resource/.DS_Store  
  inflating: /content/challenge/__MACOSX/student_resource/._.DS_Store  
   creating: /content/challenge/student_resource/dataset/
  inflating: /content/challenge/__MACOSX/student_resource/._dataset  
  inflating: /content/challenge/student_resource/README.md  
  inflating: /content/challenge/__MACOSX/student_resource/._README.md  
   creating: /content/challenge/student_resource/src/
  inflating: /content/challenge/__MACOSX/student_resource/._src 

In [None]:
# ==============================================================================
# 1. SETUP AND IMPORTS
# ==============================================================================
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, concatenate, Dropout
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.feature_extraction.text import TfidfVectorizer
from PIL import Image
import requests
from io import BytesIO
from tqdm.auto import tqdm
import scipy.sparse

# Root folder of the unzipped challenge archive; the dataset CSVs are read
# from 'dataset/' underneath it. Adjust if the archive is extracted elsewhere.
BASE_DIR = '/content/challenge/student_resource/'

# ==============================================================================
# 2. HELPER FUNCTIONS (DATA LOADING, PREPROCESSING)
# ==============================================================================

# Custom SMAPE metric for Keras evaluation
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Both tensors are cast to float32 and flattened before they are compared.
    Without the flattening, a target batch shaped (batch,) and a prediction
    batch shaped (batch, 1) would broadcast against each other into a
    (batch, batch) matrix and the metric would average over all pairs
    instead of over matching samples.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values; must hold as many elements as ``y_true``.

    Returns:
        Scalar float32 tensor: the mean over all elements of
        ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2 + 1e-8)`` times 100.
    """
    y_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])
    y_pred = tf.reshape(tf.cast(y_pred, tf.float32), [-1])
    numerator = tf.abs(y_pred - y_true)
    denominator = (tf.abs(y_true) + tf.abs(y_pred)) / 2.0
    # The small epsilon keeps the ratio finite when both values are zero.
    return tf.reduce_mean(numerator / (denominator + 1e-8)) * 100.0

# Function to download and process a single image
def download_and_process_image(url, target_size=(128, 128)):
    """Download one image and return it as a normalised RGB array.

    Args:
        url: HTTP(S) address of the image.
        target_size: ``(height, width)`` of the returned array.

    Returns:
        A float32 array of shape ``(height, width, 3)`` with values in
        [0, 1]. If anything goes wrong (network error, HTTP error status,
        undecodable image, ...) an all-zero array of the same shape and
        dtype is returned instead, so one bad link cannot abort a long run.
    """
    height, width = target_size[0], target_size[1]
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        # PIL's resize() expects (width, height), while the array produced
        # below is laid out (height, width, channels).
        img = img.resize((width, height))
        pixels = img_to_array(img).astype(np.float32, copy=False)
        return pixels / np.float32(255.0)
    except Exception:
        # Deliberate best-effort: fall back to a blank image. The placeholder
        # is float32 like the success path, so stacking many results never
        # silently promotes the whole image array to float64.
        return np.zeros((height, width, 3), dtype=np.float32)

# Item Pack Quantity (IPQ) extraction - a key feature engineering step.
def extract_ipq(text):
    """Return the pack quantity mentioned in a product description.

    The input is stringified and lower-cased, then a fixed list of regular
    expressions is tried in priority order ("pack of N", "N pack",
    "N count", "N ct", "set of N", "(N)"). The number captured by the first
    expression that matches anywhere in the text is returned.

    Args:
        text: Product description; any object is accepted and passed
            through ``str()``.

    Returns:
        The captured quantity as an int, or 1 when no expression matches.
    """
    lowered = str(text).lower()
    quantity_regexes = (
        r'pack of (\d+)',
        r'(\d+)\s*pack',
        r'(\d+)\s*count',
        r'(\d+)\s*ct',
        r'set of (\d+)',
        r'\((\d+)\)',
    )
    # Lazily evaluate the searches so that, as soon as one expression hits,
    # the lower-priority ones are never run.
    attempts = (re.search(regex, lowered) for regex in quantity_regexes)
    first_hit = next((hit for hit in attempts if hit is not None), None)
    if first_hit is None:
        return 1
    return int(first_hit.group(1))

# ==============================================================================
# 3. LOAD AND PREPROCESS DATA
# ==============================================================================
print("Loading data...")
train_df = pd.read_csv(os.path.join(BASE_DIR, 'dataset/train.csv'))
test_df = pd.read_csv(os.path.join(BASE_DIR, 'dataset/test.csv'))

# --- Text Preprocessing ---
print("Preprocessing text data...")

# An empty CSV cell is read as NaN, which TfidfVectorizer rejects as an
# invalid document. Work on cleaned string copies and leave the original
# columns untouched.
train_text = train_df['catalog_content'].fillna('').astype(str)
test_text = test_df['catalog_content'].fillna('').astype(str)

# Extract IPQ feature (kept as a raw integer column on the frames)
train_df['ipq'] = train_text.apply(extract_ipq)
test_df['ipq'] = test_text.apply(extract_ipq)

# Use TF-IDF for text vectorization. float32 halves the memory of the
# default float64 output and matches the precision the network trains in.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english', max_features=20000, min_df=5, dtype=np.float32
)
train_text_features = tfidf_vectorizer.fit_transform(train_text)
test_text_features = tfidf_vectorizer.transform(test_text)

# Combine TF-IDF features with our engineered IPQ feature.
# The raw count is unbounded (any parenthesised number is captured), whereas
# TF-IDF values lie in [0, 1]; log1p compresses it to a comparable scale so a
# single large value cannot dominate the first Dense layer.
train_ipq_reshaped = (
    np.log1p(np.asarray(train_df['ipq'], dtype=np.float64))
    .astype(np.float32)
    .reshape(-1, 1)
)
test_ipq_reshaped = (
    np.log1p(np.asarray(test_df['ipq'], dtype=np.float64))
    .astype(np.float32)
    .reshape(-1, 1)
)

# dtype is pinned so the stacked matrix stays float32 instead of being
# promoted by the extra column.
train_full_text_features = scipy.sparse.hstack(
    [train_text_features, train_ipq_reshaped], format='csr', dtype=np.float32
)
test_full_text_features = scipy.sparse.hstack(
    [test_text_features, test_ipq_reshaped], format='csr', dtype=np.float32
)

# Cheap invariants: one feature row per sample, same columns for both sets.
assert train_full_text_features.shape[0] == len(train_df)
assert test_full_text_features.shape[0] == len(test_df)
assert train_full_text_features.shape[1] == test_full_text_features.shape[1]

# --- Image Preprocessing ---
# Image stage: announce it first, because this step can take a while.
print("Downloading and processing images... (This might take a while)")

# Folder used as an on-disk cache of processed image arrays, so images do
# not have to be downloaded again on every run. exist_ok makes re-running
# this cell harmless.
os.makedirs(name='/content/processed_images', exist_ok=True)

# Register DataFrame.progress_apply so row-wise work can show a progress bar.
tqdm.pandas()

# Size handed to the downloader as target_size; the CNN input shape is
# derived from the same constant.
IMG_SIZE = (128, 128)
def get_image_array(row, dataset_type):
    """Return the processed image for one sample, using the on-disk cache.

    Args:
        row: Mapping-like record providing ``'sample_id'`` and
            ``'image_link'``.
        dataset_type: Prefix used in the cache file name (the callers pass
            ``'train'`` or ``'test'``).

    Returns:
        The float32 image array, either loaded from
        ``/content/processed_images/<dataset_type>_<sample_id>.npy`` or
        freshly downloaded with ``target_size=IMG_SIZE``.
    """
    img_path = f"/content/processed_images/{dataset_type}_{row['sample_id']}.npy"
    if os.path.exists(img_path):
        try:
            # Older cache files may hold float64 data; normalise the dtype.
            return np.load(img_path).astype(np.float32, copy=False)
        except (OSError, ValueError, EOFError):
            # Unreadable cache entry (e.g. truncated by an interrupted run):
            # fall through and fetch the image again.
            pass

    img_array = download_and_process_image(row['image_link'], target_size=IMG_SIZE)
    img_array = np.asarray(img_array, dtype=np.float32)
    # An all-zero array is the downloader's failure placeholder. Do not cache
    # it, so a transient network error is retried on the next run instead of
    # being frozen into the dataset as a blank image.
    if img_array.any():
        # Write to a temporary name and rename, so an interruption can never
        # leave a half-written file under the final cache name. The ".npy"
        # suffix stops np.save from appending its own extension.
        tmp_path = f"{img_path}.tmp.npy"
        np.save(tmp_path, img_array)
        os.replace(tmp_path, img_path)
    return img_array

def _load_image_stack(df, dataset_type):
    """Load every image of ``df`` into one preallocated float32 array.

    Filling a preallocated array avoids holding a Python list of per-image
    arrays next to the stacked copy, and pins the dtype to float32 no matter
    what dtype individual cache entries carry.

    Args:
        df: DataFrame whose rows are passed one by one to ``get_image_array``.
        dataset_type: Cache-name prefix forwarded to ``get_image_array``.

    Returns:
        Array of shape ``(len(df), IMG_SIZE[0], IMG_SIZE[1], 3)``.
    """
    stack = np.empty((len(df), IMG_SIZE[0], IMG_SIZE[1], 3), dtype=np.float32)
    rows = tqdm(df.iterrows(), total=len(df), desc=f"{dataset_type} images")
    for position, (_, row) in enumerate(rows):
        stack[position] = get_image_array(row, dataset_type)
    return stack


train_images = _load_image_stack(train_df, 'train')
test_images = _load_image_stack(test_df, 'test')

# Target variable
train_prices = train_df['price'].values

# ==============================================================================
# 4. BUILD THE MULTI-INPUT MODEL
# ==============================================================================
print("Building the model...")

# --- Text Input Branch ---
# The input is declared dense: the training and prediction code feed this
# branch densified TF-IDF rows, so a sparse placeholder would not match the
# data the model actually receives.
text_input_shape = train_full_text_features.shape[1]
text_input = Input(shape=(text_input_shape,), name='text_input')
x1 = Dense(512, activation='relu')(text_input)
x1 = Dropout(0.4)(x1)
x1 = Dense(256, activation='relu')(x1)
# Kept as a standalone sub-model so the text branch can be inspected alone.
text_output = Model(inputs=text_input, outputs=x1)

# --- Image Input Branch (Simple CNN) ---
image_input_shape = (IMG_SIZE[0], IMG_SIZE[1], 3)
image_input = Input(shape=image_input_shape, name='image_input')
x2 = Conv2D(32, (3, 3), activation='relu')(image_input)
x2 = MaxPooling2D((2, 2))(x2)
x2 = Conv2D(64, (3, 3), activation='relu')(x2)
x2 = MaxPooling2D((2, 2))(x2)
x2 = Flatten()(x2)
x2 = Dense(128, activation='relu')(x2)
image_output = Model(inputs=image_input, outputs=x2)

# --- Combine Branches ---
combined = concatenate([text_output.output, image_output.output])

# --- Final Prediction Head ---
z = Dense(256, activation='relu')(combined)
z = Dropout(0.4)(z)
z = Dense(128, activation='relu')(z)
# Output layer: 1 neuron for price. softplus keeps the prediction positive
# like ReLU would, but its gradient never becomes exactly zero; a ReLU on the
# single output unit can get stuck predicting 0 once its pre-activation turns
# negative, after which no gradient reaches the rest of the network.
output = Dense(1, activation='softplus', name='price_output')(z)

# --- Create and Compile the Final Model ---
# Input order is [text, image]; the inputs can also be fed by name
# ('text_input', 'image_input').
model = Model(inputs=[text_output.input, image_output.input], outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mean_absolute_error',
              metrics=[smape])

model.summary()

# ==============================================================================
# 5. TRAIN THE MODEL
# ==============================================================================
print("Training the model...")

VALIDATION_FRACTION = 0.1
TRAIN_BATCH_SIZE = 128
SHUFFLE_SEED = 42


class PriceBatchSequence(tf.keras.utils.Sequence):
    """Serve (text, image) -> price mini-batches without densifying everything.

    Only the TF-IDF rows of the current batch are converted to a dense array.
    Converting the whole sparse matrix at once needs rows x columns values in
    memory (on the order of 12 GB for 75,000 rows x ~20,000 float64 columns).

    Args:
        text_csr: Row-indexable scipy sparse matrix of text features.
        images: Array of images, aligned row by row with ``text_csr``.
        prices: 1-D array of targets, aligned with ``text_csr``.
        row_ids: Row positions this sequence is allowed to serve.
        batch_size: Number of rows per batch (the last batch may be smaller).
        shuffle: Whether to reshuffle ``row_ids`` before every epoch.
        seed: Seed of the private shuffling generator.
    """

    def __init__(self, text_csr, images, prices, row_ids, batch_size,
                 shuffle=False, seed=SHUFFLE_SEED):
        super().__init__()
        self.text_csr = text_csr
        self.images = images
        # Targets are stored as a (rows, 1) column so they line up with the
        # model's (batch, 1) output in losses and custom metrics.
        self.prices = np.asarray(prices, dtype=np.float32).reshape(-1, 1)
        self.row_ids = np.array(row_ids)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self._rng = np.random.default_rng(seed)
        if self.shuffle:
            self._rng.shuffle(self.row_ids)

    def __len__(self):
        """Number of batches per epoch."""
        return int(np.ceil(len(self.row_ids) / self.batch_size))

    def __getitem__(self, batch_index):
        """Return ``(inputs, targets)`` for one batch, inputs keyed by name."""
        start = batch_index * self.batch_size
        rows = self.row_ids[start:start + self.batch_size]
        inputs = {
            'text_input': self.text_csr[rows].toarray().astype(np.float32),
            'image_input': np.asarray(self.images[rows], dtype=np.float32),
        }
        return inputs, self.prices[rows]

    def on_epoch_end(self):
        """Reshuffle the served rows so every epoch sees new batches."""
        if self.shuffle:
            self._rng.shuffle(self.row_ids)


# Hold out the last 10% of the rows for validation, which is the same slice
# `validation_split=0.1` would take from in-memory arrays.
n_samples = train_full_text_features.shape[0]
n_fit = int(n_samples * (1.0 - VALIDATION_FRACTION))
all_rows = np.arange(n_samples)

train_batches = PriceBatchSequence(
    train_full_text_features, train_images, train_prices,
    all_rows[:n_fit], TRAIN_BATCH_SIZE, shuffle=True,
)
val_batches = PriceBatchSequence(
    train_full_text_features, train_images, train_prices,
    all_rows[n_fit:], TRAIN_BATCH_SIZE, shuffle=False,
)

history = model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=15,
    callbacks=[
        # Stop once val_loss has not improved for 3 epochs and roll back to
        # the best weights seen.
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    ]
)

# ==============================================================================
# 6. MAKE PREDICTIONS AND GENERATE SUBMISSION FILE
# ==============================================================================
print("Making predictions on the test set...")

# Rows densified and scored per step. Predicting chunk by chunk avoids
# converting the whole sparse test matrix to one dense array.
PREDICT_CHUNK_ROWS = 256
# Lowest price ever written to the submission; anything the model predicts
# below this value is raised to it.
MIN_SUBMITTED_PRICE = 1.0

n_test_rows = test_full_text_features.shape[0]
chunk_predictions = []
for start in tqdm(range(0, n_test_rows, PREDICT_CHUNK_ROWS), desc="predict"):
    stop = min(start + PREDICT_CHUNK_ROWS, n_test_rows)
    text_chunk = test_full_text_features[start:stop].toarray().astype(np.float32)
    image_chunk = np.asarray(test_images[start:stop], dtype=np.float32)
    chunk_predictions.append(
        np.asarray(model.predict_on_batch([text_chunk, image_chunk]))
    )

if chunk_predictions:
    predictions = np.concatenate(chunk_predictions, axis=0)
else:
    # Empty test set: keep the (rows, 1) layout with zero rows.
    predictions = np.empty((0, 1), dtype=np.float32)

# Clamp predictions from below so every submitted price is a positive float
final_predictions = np.maximum(MIN_SUBMITTED_PRICE, predictions.flatten())

# One prediction per test row, otherwise the submission would be misaligned.
assert len(final_predictions) == len(test_df)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_predictions
})

# Save to CSV
submission_path = '/content/test_out.csv'
submission_df.to_csv(submission_path, index=False)

print(f"Submission file created successfully at: {submission_path}")
print("Displaying first 5 rows of the submission file:")
print(submission_df.head())

Loading data...
Preprocessing text data...
Downloading and processing images... (This might take a while)


  0%|          | 0/75000 [00:00<?, ?it/s]