In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line (directories themselves are not printed).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/abcdefg/student_resource/Documentation_template.md
/kaggle/input/abcdefg/student_resource/README.md
/kaggle/input/abcdefg/student_resource/sample_code.py
/kaggle/input/abcdefg/student_resource/dataset/sample_test.csv
/kaggle/input/abcdefg/student_resource/dataset/sample_test_out.csv
/kaggle/input/abcdefg/student_resource/dataset/train.csv
/kaggle/input/abcdefg/student_resource/dataset/test.csv
/kaggle/input/abcdefg/student_resource/src/utils.py
/kaggle/input/abcdefg/student_resource/src/example.ipynb


In [2]:
import re
from tqdm import tqdm

In [3]:
# Base directories used by the rest of the notebook:
# OUTPUT_DIR is where the processed CSVs are written,
# INPUT_DIR is where the raw train/test CSVs are read from.
OUTPUT_DIR = '/kaggle/working/'
INPUT_DIR = '/kaggle/input/abcdefg/student_resource/dataset'

In [4]:
# Raw inputs (read by process_data).
TRAIN_CSV_PATH = os.path.join(INPUT_DIR, 'train.csv')
TEST_CSV_PATH = os.path.join(INPUT_DIR, 'test.csv')
# Destinations for the feature-extracted versions of the two files above.
PROCESSED_TRAIN_PATH = os.path.join(OUTPUT_DIR, 'processed_advanced_train.csv')
PROCESSED_TEST_PATH = os.path.join(OUTPUT_DIR, 'processed_advanced_test.csv')

In [5]:
# Register tqdm's pandas integration so that `.progress_apply` (used in
# process_data) is available; the bar is labelled "Processing rows".
tqdm.pandas(desc="Processing rows")

In [6]:
# Multipliers applied to a parsed quantity, keyed by the singular, lower-case
# unit name. Mass units scale to grams and volume units to millilitres; each
# line groups the aliases that share one factor.
CONVERSIONS = {
    **dict.fromkeys(('ounce', 'oz'), 28.3495),
    **dict.fromkeys(('fl oz', 'fluid ounce'), 29.5735),
    **dict.fromkeys(('pound', 'lb'), 453.592),
    **dict.fromkeys(('gram', 'g'), 1),
    **dict.fromkeys(('milliliter', 'ml'), 1),
    **dict.fromkeys(('liter', 'l'), 1000),
}

In [7]:
def normalize_brand(brand):
    """Return a lower-cased, punctuation-free brand token.

    Parameters
    ----------
    brand : Any
        Raw brand text. Anything that is not a ``str`` (e.g. NaN/None) is
        treated as missing.

    Returns
    -------
    str
        The normalised brand, or ``"unknown"`` when the input is missing or
        nothing is left after cleaning.
    """
    if not isinstance(brand, str):
        return "unknown"
    brand = brand.lower()
    # Drop common punctuation outright (no space is inserted in its place).
    brand = re.sub(r'[,.\-&()]', '', brand)
    # Remove trailing corporate suffixes such as " inc" or " llc".
    brand = re.sub(r'\s+(inc|llc|co|ltd)\b', '', brand)
    brand = brand.strip()
    # A brand made only of stripped characters (e.g. "-" or "&") would
    # otherwise come back as "", which is inconsistent with the sentinel
    # used for missing input and is read back as NaN after a CSV round-trip.
    return brand or "unknown"

In [8]:
def _first_measurement(text_lower, pattern):
    """Return the first quantity matched by `pattern`, scaled via CONVERSIONS, or NaN."""
    match = re.search(pattern, text_lower)
    if match is None:
        return np.nan
    unit = match.group(2)
    # The patterns accept optional plurals ("ounces", "lbs") while CONVERSIONS
    # is keyed by the singular form, so drop a single trailing "s".
    if unit.endswith('s'):
        unit = unit[:-1]
    return float(match.group(1)) * CONVERSIONS.get(unit, 1)


def extract_advanced_features(text):
    """Parse one catalog-content string into structured features.

    Parameters
    ----------
    text : Any
        Raw catalog text. Non-string values (e.g. NaN for a missing cell)
        are treated as an empty string instead of raising.

    Returns
    -------
    pandas.Series
        Six positional values, in this order:
        normalised brand (first word of the item name, via normalize_brand),
        item name, pack quantity (int, default 1), weight (float or NaN),
        volume (float or NaN), and the text with field labels removed and
        whitespace collapsed. Weight and volume are the matched number
        multiplied by the CONVERSIONS factor for its unit.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them would crash.
    if not isinstance(text, str):
        text = ""
    text_lower = text.lower()

    pack_quantity = 1
    item_name = "unknown"
    brand = "unknown"

    # Capture the rest of the "Item Name:" line. Only horizontal whitespace is
    # skipped after the label, so an empty name cannot swallow the next line,
    # and the name is still found when it is the last line without a newline.
    name_match = re.search(r'item name:[^\S\n]*([^\n]*)', text, re.IGNORECASE)
    if name_match:
        candidate = name_match.group(1).strip()
        if candidate:
            item_name = candidate
            # Heuristic: the first whitespace-separated token is taken as the brand.
            brand = candidate.split()[0]

    # Patterns are tried in priority order; the first pattern that matches
    # anywhere in the text wins (not the earliest match position).
    pack_patterns = [
        r'\(pack of (\d+)\)', r'(\d+)\s*[-]?\s*pack', r'(\d+)\s*count',
        r'(\d+)\s*ct', r'value:\s*(\d+)\s*\n'
    ]
    for pattern in pack_patterns:
        match = re.search(pattern, text_lower)
        if match:
            # Every pattern has exactly one mandatory digit group.
            pack_quantity = int(match.group(1))
            break

    unit_patterns = {
        'weight': r'(\d+\.?\d*)\s*(pounds?|lbs?|ounces?|oz|grams?|g)\b',
        'volume': r'(\d+\.?\d*)\s*(fluid ounces?|fl oz|liters?|l|milliliters?|ml)\b'
    }
    weight_g = _first_measurement(text_lower, unit_patterns['weight'])
    volume_ml = _first_measurement(text_lower, unit_patterns['volume'])

    # Strip the field labels, then collapse all runs of whitespace to one space.
    cleaned = re.sub(r'(item name:|brand:|value:|unit:|bullet point \d+:)', '', text, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    return pd.Series([
        normalize_brand(brand), item_name, pack_quantity,
        weight_g, volume_ml, cleaned
    ])


In [9]:
def process_data(file_path):
    """Load a CSV and replace its `catalog_content` column with parsed features.

    Each `catalog_content` value is passed through extract_advanced_features
    (with a progress bar); the six resulting columns are named and appended
    to the remaining original columns.

    Parameters
    ----------
    file_path : str
        Path of the CSV to read; it must contain a `catalog_content` column.

    Returns
    -------
    pandas.DataFrame
        The original columns except `catalog_content`, followed by
        `brand_normalized`, `item_name`, `pack_quantity`, `weight_g`,
        `volume_ml` and `cleaned_content`.
    """
    feature_names = (
        'brand_normalized', 'item_name', 'pack_quantity',
        'weight_g', 'volume_ml', 'cleaned_content',
    )

    raw = pd.read_csv(file_path, engine='python')

    extracted = raw['catalog_content'].progress_apply(extract_advanced_features)
    extracted.columns = list(feature_names)

    remaining = raw.drop(columns=['catalog_content'])
    return pd.concat([remaining, extracted], axis=1)


In [10]:
# Extract features from the training CSV and save the result (without the
# DataFrame index) to PROCESSED_TRAIN_PATH.
train_df_processed = process_data(TRAIN_CSV_PATH)
train_df_processed.to_csv(PROCESSED_TRAIN_PATH, index=False)

Processing rows: 100%|██████████| 75000/75000 [00:27<00:00, 2749.57it/s]


In [11]:
# Preview the first rows of the processed training data as plain text.
print(train_df_processed.head())

   sample_id                                         image_link  price  \
0      33127  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89   
1     198967  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12   
2     261251  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97   
3      55858  https://m.media-amazon.com/images/I/41mu0HAToD...  30.34   
4     292686  https://m.media-amazon.com/images/I/41sA037+Qv...  66.49   

  brand_normalized                                          item_name  \
0               la  La Victoria Green Taco Sauce Mild, 12 Ounce (P...   
1          salerno  Salerno Cookies, The Original Butter Cookies, ...   
2             bear  Bear Creek Hearty Soup Bowl, Creamy Chicken wi...   
3          judee’s  Judee’s Blue Cheese Powder 11.25 oz - Gluten-F...   
4            kedem  kedem Sherry Cooking Wine, 12.7 Ounce - 12 per...   

   pack_quantity    weight_g  volume_ml  \
0              6  340.194000        NaN   
1              4  226.796000  

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [12]:
# Same pipeline for the test CSV: extract features, save the result (without
# the DataFrame index) to PROCESSED_TEST_PATH, then preview the first rows.
test_df_processed = process_data(TEST_CSV_PATH)
test_df_processed.to_csv(PROCESSED_TEST_PATH, index=False)
print(test_df_processed.head())

Processing rows: 100%|██████████| 75000/75000 [00:27<00:00, 2764.58it/s]


   sample_id                                         image_link  \
0     100179  https://m.media-amazon.com/images/I/71hoAn78AW...   
1     245611  https://m.media-amazon.com/images/I/61ex8NHCIj...   
2     146263  https://m.media-amazon.com/images/I/61KCM61J8e...   
3      95658  https://m.media-amazon.com/images/I/51Ex6uOH7y...   
4      36806  https://m.media-amazon.com/images/I/71QYlrOMoS...   

  brand_normalized                                          item_name  \
0             rani  Rani 14-Spice Eshamaya's Mango Chutney (Indian...   
1          natural  Natural MILK TEA Flavoring extract by HALO PAN...   
2            honey  Honey Filled Hard Candy - Bulk Pack 2 Pounds -...   
3           vlasic    Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2)   
4        mccormick  McCormick Culinary Vanilla Extract, 32 fl oz -...   

   pack_quantity   weight_g  volume_ml  \
0              1  297.66975        NaN   
1              1   56.69900        NaN   
2              1  907.18400     

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
