In [5]:
# ===================================================================
# 1. SETUP AND IMPORTS
# ===================================================================
import pandas as pd
import numpy as np
import re
import torch
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# Activate tqdm for pandas apply: this registers the `progress_apply` method
# on Series/DataFrame that the pipeline functions in this notebook call.
tqdm.pandas()

# --- Configuration Block ---
class CFG:
    """Static configuration values for the feature-generation notebook."""

    # Input CSV locations.
    TRAIN_FILE = "/content/train.csv"
    TEST_FILE = "/content/test.csv"

    # Checkpoint name handed to SentenceTransformer when embedding text.
    EMBEDDING_MODEL = "intfloat/e5-base-v2"

    # Seed value; not referenced by the code visible in this notebook section.
    RANDOM_SEED = 42

In [2]:

# ===================================================================
# 2. ALL HELPER FUNCTIONS (UNCHANGED)
# ===================================================================
def find_ipq(text):
    """Extract the items-per-pack quantity (IPQ) from free text.

    Patterns are tried in priority order and the first one that matches wins:
      1. ``pack of N`` / ``case of N`` / ``set of N`` / ``bundle of N`` / ``pk N``
      2. ``N per case``
      3. ``N`` followed (optionally via ``-``) by a pack word such as
         ``pack``, ``count``, ``ct``, ``pcs``, ``cans`` ...
      4. multiplier notation ``x N`` / ``(x N)``

    Args:
        text: Any value; it is converted with ``str()`` before matching.

    Returns:
        int: The first quantity found, or 1 when no pattern matches.
    """
    text = str(text)
    patterns = (
        r'(?:pack of|case of|set of|bundle of|pk)\s*(\d+)',
        r'(\d+)\s*per\s*case',
        r'(\d+)\s*[-]?\s*(?:pack|count|pk|ct|pcs|case|pouch|pouches|servings|bottles|cans|bars|rolls|units)',
        # The look-behind stops an "x" that merely ends a word from being read
        # as a multiplier ("Xbox 360", "box 12"); "2x6" and "(x 4)" still match.
        r'\(?(?<![A-Za-z])x\s*(\d+)\)?',
    )
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return int(match.group(1))
    return 1

# Characters stripped from the text before any field is extracted.
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also removes
# non-emoji characters (for example CJK text); it is kept unchanged here so
# existing feature values do not shift.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F" u"\U0001F300-\U0001F5FF" u"\U0001F680-\U0001F6FF"
                            u"\U0001F1E0-\U0001F1FF" u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def parse_full_content_final(text):
    """Split one ``catalog_content`` string into structured fields.

    The text is expected to carry labelled fields: ``Item Name:``,
    ``Bullet Point [N]:``, ``Product Description:``, ``Value:`` and ``Unit:``.

    Args:
        text: Raw catalog text; missing values (NaN/None) are treated as "".

    Returns:
        dict with keys:
          * ``item_name``   - lower-cased item name, or None if absent
          * ``description`` - lower-cased bullet points + product description
            joined by spaces, or None if both are empty
          * ``ipq``         - result of ``find_ipq`` on the whole text
          * ``value``       - float after the last ``Value:`` label, else NaN
          * ``unit``        - lower-cased text after the last ``Unit:`` label,
            else None
    """
    text = "" if pd.isna(text) else str(text)
    text = _EMOJI_PATTERN.sub(r'', text)

    # A field ends where the next *label* starts. The label colon is required so
    # that ordinary words inside a field ("Great Value Pasta", "best value")
    # no longer cut the field short.
    field_end = r'(?=\s*Bullet Point(?:\s+\d+)?:|\s*Product Description:|\s*Value:|$)'

    item_name_match = re.search(r'Item Name:\s*(.*?)' + field_end, text, re.DOTALL)
    item_name = item_name_match.group(1).strip().lower() if item_name_match else None

    bullets = re.findall(r'Bullet Point(?:\s+\d+)?:(.*?)' + field_end, text, re.DOTALL)
    bullet_points_text = ' '.join(b.strip() for b in bullets).strip()

    desc_match = re.search(r'Product Description:\s*(.*?)(?=\s*Value:|$)', text, re.DOTALL)
    product_description_text = desc_match.group(1).strip() if desc_match else ''

    description_parts = [part for part in (bullet_points_text, product_description_text) if part]
    description = ' '.join(description_parts).lower() if description_parts else None

    ipq = find_ipq(text)

    # The greedy ".*" prefix anchors on the LAST label occurrence in the text.
    # The number pattern accepts a single decimal point only, so input such as
    # "1.2.3" or "." can no longer make float() raise.
    value_match = re.search(r'.*Value:\s*(\d+(?:\.\d*)?|\.\d+)', text, re.DOTALL)
    unit_match = re.search(r'.*Unit:\s*(.*)', text, re.DOTALL)
    value = float(value_match.group(1)) if value_match else np.nan
    unit = unit_match.group(1).strip().lower() if unit_match else None

    return {'item_name': item_name, 'description': description, 'ipq': ipq, 'value': value, 'unit': unit}

# Maps a lower-cased unit spelling to (standard unit, multiplier into that unit).
# Insertion order matters: it fixes the alternation order of the regex built
# below and the first-hit order of code that iterates over this mapping.
UNIT_CONVERSION_MAP = {
    'fluid ounce(s)': ('fl oz', 1.0), 'fluid ounce': ('fl oz', 1.0), 'fl. oz.': ('fl oz', 1.0), 'fl. oz': ('fl oz', 1.0),
    'fl oz': ('fl oz', 1.0), 'floz': ('fl oz', 1.0), 'liter': ('fl oz', 33.814), 'liters': ('fl oz', 33.814),
    'ltr': ('fl oz', 33.814), 'milliliter': ('fl oz', 0.033814), 'millilitre': ('fl oz', 0.033814), 'mililitro': ('fl oz', 0.033814),
    'ml': ('fl oz', 0.033814), 'ounce': ('oz', 1.0), 'ounces': ('oz', 1.0), 'oz': ('oz', 1.0), 'pound': ('oz', 16.0),
    'pounds': ('oz', 16.0), 'lb': ('oz', 16.0), 'gram': ('oz', 0.035274), 'grams(gm)': ('oz', 0.035274), 'grams': ('oz', 0.035274),
    'gramm': ('oz', 0.035274), 'gr': ('oz', 0.035274), 'kg': ('oz', 35.274), 'count': ('ct', 1.0), 'ct': ('ct', 1.0),
    'each': ('ct', 1.0), 'piece': ('ct', 1.0), 'packs': ('ct', 1.0), 'pack': ('ct', 1.0), 'bottle': ('ct', 1.0),
    'bottles': ('ct', 1.0), 'bag': ('ct', 1.0), 'bags': ('ct', 1.0), 'can': ('ct', 1.0), 'jar': ('ct', 1.0),
    'k-cups': ('ct', 1.0), 'per carton': ('ct', 1.0), 'pouch': ('ct', 1.0), 'per package': ('ct', 1.0), 'per box': ('ct', 1.0),
    'paper cupcake liners': ('ct', 1.0), 'capsule': ('ct', 1.0), 'carton': ('ct', 1.0), 'ziplock bags': ('ct', 1.0),
    'units': ('ct', 1.0), 'box': ('ct', 1.0), 'bucket': ('ct', 1.0), 'none': ('None', 1.0), '---': ('None', 1.0)
}

# Regex alternation of every known unit spelling, escaped for literal matching.
unit_keys = '|'.join(re.escape(k) for k in UNIT_CONVERSION_MAP.keys())

_NUMBER_GROUP = r'(\d+(?:\.\d*)?|\.\d+)'
_UNIT_GROUP = '(' + unit_keys + ')'

# Finds "<number> <unit>" (groups 1, 2) or "<unit> <number>" (groups 3, 4).
# The look-behind on the unit-first form keeps a unit spelling that is only the
# tail of a longer word from matching ("Mexican 12" must not parse as "can 12").
VALUE_IN_TITLE_REGEX = re.compile(
    _NUMBER_GROUP + r'\s*' + _UNIT_GROUP + r'\b'
    + r'|(?<![A-Za-z])' + _UNIT_GROUP + r'\s*' + _NUMBER_GROUP + r'\b',
    re.IGNORECASE,
)

def extract_and_standardize_from_title(title):
    """Pull a "<number> <unit>" (or "<unit> <number>") pair out of a title.

    Args:
        title: Item title; missing values yield the "nothing found" result.

    Returns:
        tuple: ``(value converted to the standard unit, standard unit)`` when
        ``VALUE_IN_TITLE_REGEX`` matches and the unit is in
        ``UNIT_CONVERSION_MAP``; otherwise ``(np.nan, None)``.
    """
    nothing_found = (np.nan, None)
    if pd.isna(title):
        return nothing_found

    hit = VALUE_IN_TITLE_REGEX.search(str(title))
    if hit is None:
        return nothing_found

    # Groups 1/2 are filled by the number-first form, 3/4 by the unit-first form.
    leading_number, trailing_unit, leading_unit, trailing_number = hit.groups()
    if leading_number:
        raw_number, raw_unit = leading_number, trailing_unit
    else:
        raw_number, raw_unit = trailing_number, leading_unit

    amount = float(raw_number)
    target_unit, factor = UNIT_CONVERSION_MAP.get(raw_unit.lower(), (None, 1.0))
    if not target_unit:
        return nothing_found
    return amount * factor, target_unit

def standardize_original_value(value, unit):
    """Convert a (value, unit) pair to the standard unit system.

    Lookup order: exact match of the lower-cased unit in
    ``UNIT_CONVERSION_MAP``; otherwise the first map key (in map order) that
    occurs as a substring of the unit.

    Args:
        value: Numeric quantity.
        unit: Unit label.

    Returns:
        tuple: ``(np.nan, None)`` if either input is missing;
        ``(value * factor, standard unit)`` on a match;
        ``(value, 'other')`` when no key matches.
    """
    if pd.isna(value) or pd.isna(unit):
        return np.nan, None

    normalized = str(unit).lower()
    entry = UNIT_CONVERSION_MAP.get(normalized)
    if entry is None:
        # Fall back to the first alias contained anywhere in the unit text.
        entry = next(
            (spec for alias, spec in UNIT_CONVERSION_MAP.items() if alias in normalized),
            None,
        )
    if entry is None:
        return value, 'other'

    target_unit, factor = entry
    return value * factor, target_unit

def rectify_value_and_unit(row):
    """Choose the final per-item value and unit for one row.

    A quantity found in ``item_name`` takes precedence and is used as-is.
    Otherwise the standardized content value is divided by the pack size
    (``ipq``, treated as 1 unless it is greater than 0).

    Args:
        row: Mapping/Series with ``item_name``, ``value``, ``unit`` and ``ipq``.

    Returns:
        pd.Series: two elements, ``[final value, final unit]``.
    """
    title_value, title_unit = extract_and_standardize_from_title(row['item_name'])
    content_value, content_unit = standardize_original_value(row['value'], row['unit'])

    pack_size = row['ipq']
    if not pack_size > 0:
        pack_size = 1

    if pd.notna(title_value):
        return pd.Series([title_value, title_unit])
    if pd.notna(content_value):
        return pd.Series([content_value / pack_size, content_unit])
    return pd.Series([np.nan, content_unit])

def extract_brand_by_length(text):
    """Guess a brand label from the leading word(s) of a name.

    Args:
        text: Item name.

    Returns:
        str: ``"Unknown"`` for non-strings or strings without any word; the
        first word when it is the only word or longer than two characters;
        otherwise the first two words joined by a space.
    """
    tokens = text.split() if isinstance(text, str) else []
    if not tokens:
        return "Unknown"

    head = tokens[0]
    if len(tokens) == 1 or len(head) > 2:
        return head
    return ' '.join(tokens[:2])


In [3]:
# ===================================================================
# 3. PIPELINE FUNCTIONS
# ===================================================================
def load_and_parse_data(file_path):
    """Read a CSV and append the fields parsed from ``catalog_content``.

    Args:
        file_path: Path of the CSV file; it must contain a
            ``catalog_content`` column.

    Returns:
        pd.DataFrame: the CSV columns followed by one column per key of the
        dict returned by ``parse_full_content_final``. When a ``price``
        column exists it is coerced to numeric (unparseable entries -> NaN).

    Raises:
        KeyError: if the CSV has no ``catalog_content`` column.
    """
    print(f"Loading data from {file_path}...")
    df = pd.read_csv(file_path)
    # Only process price if it exists
    if 'price' in df.columns:
        df["price"] = pd.to_numeric(df["price"], errors='coerce')
    print("Parsing catalog_content...")
    parsed_records = df['catalog_content'].progress_apply(parse_full_content_final)
    # Build the frame once from the list of dicts; `.apply(pd.Series)` would
    # construct a separate Series for every row, which is far slower.
    extracted_features = pd.DataFrame(parsed_records.tolist(), index=df.index)
    return pd.concat([df, extracted_features], axis=1)

def create_base_features(df, max_unit_value=1000, max_ipq=5000):
    """Derive the rectified value/unit, total value, brand and log-price columns.

    The input frame is copied first, so the caller's frame is left untouched
    and the function can be re-run on the same input.

    Args:
        df: Frame with ``item_name``, ``value``, ``unit`` and ``ipq`` columns
            (and optionally ``price``).
        max_unit_value: Rectified values above this limit are replaced by the
            raw ``value`` column.
        max_ipq: Upper cap applied to ``ipq``.

    Returns:
        pd.DataFrame: a new frame where ``value``/``unit`` hold the rectified
        quantities, ``ipq`` is capped, and ``total_value`` (``ipq * value``),
        ``brand`` and, if ``price`` exists, ``log_price`` (``log1p(price)``)
        are added.
    """
    print("Creating base features (brand, value, unit)...")
    df = df.copy()  # never write into the caller's frame

    df[['value_final', 'unit_final']] = df.progress_apply(rectify_value_and_unit, axis=1)

    # A count unit whose raw value equals the pack size describes the whole
    # pack, so the per-item value is set to 1.
    is_whole_pack_count = (df['unit_final'] == 'ct') & (df['ipq'] == df['value'])
    df['value_final'] = np.where(is_whole_pack_count, 1.0, df['value_final'])

    # Rectified values above the limit fall back to the raw value.
    df['value_final'] = np.where(df['value_final'] > max_unit_value, df['value'], df['value_final'])
    df['ipq'] = np.where(df['ipq'] > max_ipq, max_ipq, df['ipq'])

    # Missing rectified values are filled from the raw value column.
    df["value"] = df["value_final"].combine_first(df["value"])
    df["unit"] = df["unit_final"]
    df = df.drop(columns=["value_final", "unit_final"])

    df['total_value'] = df['ipq'] * df['value']
    df['brand'] = df['item_name'].progress_apply(extract_brand_by_length)

    # Only create log_price if 'price' column exists (i.e., for the train set)
    if 'price' in df.columns:
        df['log_price'] = np.log1p(df['price'])
    return df

# Loaded SentenceTransformer instances keyed by (model name, device), so that
# repeated calls reuse one model instead of reloading the checkpoint each time.
_EMBEDDING_MODEL_CACHE = {}


def generate_text_embeddings(df, text_column, prefix, batch_size=128):
    """Append sentence-embedding columns for one text column.

    Args:
        df: Input frame.
        text_column: Name of the column to embed; missing values are embedded
            as the empty string.
        prefix: Prefix of the new columns, which are named ``{prefix}_{i}``.
        batch_size: Batch size passed to ``model.encode``.

    Returns:
        pd.DataFrame: ``df`` with its index reset to 0..n-1, followed by one
        column per embedding dimension.
    """
    print(f"Generating embeddings for '{text_column}'...")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    cache_key = (CFG.EMBEDDING_MODEL, device)
    model = _EMBEDDING_MODEL_CACHE.get(cache_key)
    if model is None:
        model = SentenceTransformer(CFG.EMBEDDING_MODEL, device=device)
        _EMBEDDING_MODEL_CACHE[cache_key] = model

    # NOTE(review): the intfloat/e5 model card recommends prefixing every input
    # with "query: " or "passage: "; no prefix is added here - confirm whether
    # that is intended before changing it, since it alters the embeddings.
    texts = df[text_column].fillna('').tolist()
    embeddings = model.encode(texts, show_progress_bar=True, batch_size=batch_size)
    emb_df = pd.DataFrame(embeddings, columns=[f'{prefix}_{i}' for i in range(embeddings.shape[1])])
    return pd.concat([df.reset_index(drop=True), emb_df.reset_index(drop=True)], axis=1)

def finalize_features_for_merge(df):
    """Add integer category codes and drop the raw text/link columns.

    ``brand_cat`` and ``unit_cat`` are produced by ``pd.factorize``: codes are
    assigned in order of first appearance within *this* frame and missing
    values get -1.

    NOTE(review): because the codes are computed per frame, the same brand or
    unit can receive different codes in frames processed separately - verify
    downstream code does not assume they line up.

    Args:
        df: Frame containing ``brand`` and ``unit`` columns.

    Returns:
        pd.DataFrame: a new frame without ``catalog_content``,
        ``description`` and ``image_link`` (whichever of them exist), with
        ``brand_cat`` and ``unit_cat`` appended. The input frame is not
        modified.
    """
    print("Finalizing features for merging...")
    columns_to_drop = ['catalog_content', 'description', 'image_link']
    # Drop first: drop() returns a new frame, so the code columns below are
    # added to the result rather than to the caller's frame. These columns are
    # removed only as cleanup, so a frame lacking one of them is accepted.
    result = df.drop(columns=columns_to_drop, errors='ignore')
    result['brand_cat'] = pd.factorize(result['brand'])[0]
    result['unit_cat'] = pd.factorize(result['unit'])[0]
    print("✅ Final feature set for merging is ready.")
    return result

# ===================================================================
# 4. NEW - MASTER PROCESSING FUNCTION
# ===================================================================
def create_feature_set(file_path):
    """Run every feature-engineering stage on one CSV file.

    Stages, in order: load and parse, base features, embeddings of
    ``item_name`` (``name_emb``) and ``description`` (``desc_emb``), then
    finalization.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        The frame returned by ``finalize_features_for_merge``.
    """
    features = create_base_features(load_and_parse_data(file_path))
    for column, column_prefix in (('item_name', 'name_emb'), ('description', 'desc_emb')):
        features = generate_text_embeddings(features, text_column=column, prefix=column_prefix)
    return finalize_features_for_merge(features)
# print("✅ Successfully saved both files to S3.")

In [None]:

# ===================================================================
# 5. MAIN EXECUTION PIPELINE - FOR BOTH TRAIN AND TEST
# ===================================================================
print("--- Starting Full Feature Generation Pipeline ---")


def _build_and_report(tag, title, csv_path):
    """Run the feature pipeline for one split, printing its banner and final shape."""
    print(f"\n[Processing {tag} Data]")
    frame = create_feature_set(csv_path)
    print(f"\n--- {title} Feature Generation Complete ---")
    print(f"Final {title} DataFrame shape:", frame.shape)
    return frame


# --- Process TRAINING data ---
train_features_df = _build_and_report("TRAINING", "Training", CFG.TRAIN_FILE)

# --- Process TEST data ---
test_features_df = _build_and_report("TEST", "Test", CFG.TEST_FILE)


--- Starting Full Feature Generation Pipeline ---

[Processing TRAINING Data]
Loading data from /content/train.csv...
Parsing catalog_content...


  0%|          | 0/75000 [00:00<?, ?it/s]

Creating base features (brand, value, unit)...


  0%|          | 0/75000 [00:00<?, ?it/s]

  0%|          | 0/75000 [00:00<?, ?it/s]

Generating embeddings for 'item_name'...
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Generating embeddings for 'description'...
Using device: cuda


Batches:   0%|          | 0/586 [00:00<?, ?it/s]

In [None]:
# Preview the first rows of the training feature frame built by the pipeline cell.
train_features_df.head()

In [None]:
# # ===================================================================
# # 6. SAVE THE FINAL FEATURES
# # ===================================================================
# # --- Save TRAIN features ---
# local_train_save_path = 'train_text_and_tabular_features.csv'
# train_features_df.to_csv(local_train_save_path, index=False)
# print(f"\n✅ Successfully saved TRAIN features locally to: {local_train_save_path}")

# # --- Save TEST features ---
# local_test_save_path = 'test_text_and_tabular_features.csv'
# test_features_df.to_csv(local_test_save_path, index=False)
# print(f"✅ Successfully saved TEST features locally to: {local_test_save_path}")

# # You can add your S3 upload logic here if needed
# # s3_train_path = 's3://amazon-ml-challenge-yourname/features/train_text_and_tabular_features.csv'
# # s3_test_path = 's3://amazon-ml-challenge-yourname/features/test_text_and_tabular_features.csv'
# # train_features_df.to_csv(s3_train_path, index=False)
# # test_features_df.to_csv(s3_test_path, index=False)