In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields a (directory path, subdirectory names, file names) triple for
# every directory under the root; the subdirectory names are not needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path of each file so the dataset layout is visible in the output.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset-2/student_resource/Documentation_template.md
/kaggle/input/dataset-2/student_resource/README.md
/kaggle/input/dataset-2/student_resource/sample_code.py
/kaggle/input/dataset-2/student_resource/dataset/sample_test.csv
/kaggle/input/dataset-2/student_resource/dataset/sample_test_out.csv
/kaggle/input/dataset-2/student_resource/dataset/train.csv
/kaggle/input/dataset-2/student_resource/dataset/test.csv
/kaggle/input/dataset-2/student_resource/src/utils.py
/kaggle/input/dataset-2/student_resource/src/example.ipynb


In [2]:
# Load the train and test splits shipped with the competition dataset.
df_train = pd.read_csv('/kaggle/input/dataset-2/student_resource/dataset/train.csv')
df_test = pd.read_csv('/kaggle/input/dataset-2/student_resource/dataset/test.csv')

In [3]:

import re
from tqdm import tqdm


In [4]:
# Register tqdm's `progress_apply` on pandas objects; the feature-building cell
# further down calls `progress_apply` and would fail without this.
tqdm.pandas()

### Extract structured features (brand, item name, pack quantity, item size, total value) from the catalog text
    

In [5]:
def extract_features(text):
    """Pull structured fields out of one free-text catalog entry.

    Parameters
    ----------
    text : object
        The catalog entry. Anything that is not a ``str`` (e.g. NaN) is
        treated as "no information".

    Returns
    -------
    pandas.Series
        Indexed by ``brand``, ``item_name``, ``pack_quantity``, ``item_size``
        and ``total_value``. Fields that cannot be found are NaN, except
        ``pack_quantity`` which defaults to 1.

    Notes
    -----
    The labelled fields (``Brand:``, ``Item Name:``, ``Value:``) are only read
    from the same line as their label. An empty or whitespace-only field
    therefore stays NaN instead of swallowing the following line.
    """
    columns = ['brand', 'item_name', 'pack_quantity', 'item_size', 'total_value']

    if not isinstance(text, str):
        return pd.Series([np.nan, np.nan, 1, np.nan, np.nan], index=columns)

    brand = np.nan
    item_name = np.nan
    pack_quantity = 1
    item_size = np.nan
    total_value = np.nan

    # `[^\S\r\n]*` skips horizontal whitespace only (unlike `\s*`, which also
    # crosses line breaks); `\S` then requires real content on the label's line.
    brand_match = re.search(r'Brand:[^\S\r\n]*(\S[^\n\r]*)', text, re.IGNORECASE)
    if brand_match:
        brand = brand_match.group(1).strip()

    name_match = re.search(r'Item Name:[^\S\r\n]*(\S[^\n\r]*)', text, re.IGNORECASE)
    if name_match:
        item_name = name_match.group(1).strip()

    # Three spellings of a pack size: "(Pack of N)", "N-pack" / "N pack", "N count".
    pack_match = re.search(r'\(Pack of (\d+)\)|\b(\d+)\s*[-]?pack\b|\b(\d+)\s*count\b', text, re.IGNORECASE)
    if pack_match:
        # Only the alternative that matched has a non-None capture group.
        quantity_str = next((s for s in pack_match.groups() if s is not None), None)
        if quantity_str:
            pack_quantity = int(quantity_str)

    # First number that is directly followed by a recognised unit.
    size_match = re.search(r'(\d+\.?\d*)\s*(Ounce|Oz|Fl Oz|g|ml)\b', text, re.IGNORECASE)
    if size_match:
        item_size = float(size_match.group(1))

    value_match = re.search(r'Value:[^\S\r\n]*(\d+\.?\d*)', text, re.IGNORECASE)
    if value_match:
        total_value = float(value_match.group(1))

    return pd.Series([brand, item_name, pack_quantity, item_size, total_value], index=columns)


### Cleans the text by removing all identified structured parts and special characters.

In [6]:
def clean_text_for_embedding(text):
    """Reduce a catalog entry to plain lowercase words for embedding.

    Non-string input yields an empty string. Otherwise the labelled fields
    (to the end of their line), pack/count phrases and bullet-point labels
    are removed, the text is lowercased, every character other than
    ``a-z``, ``0-9`` and whitespace is dropped, and whitespace runs are
    collapsed to single spaces.
    """
    if not isinstance(text, str):
        return ""

    # Applied in this order, all case-insensitively.
    structured_patterns = (
        r'Brand:.*',
        r'Item Name:.*',
        r'\(Pack of \d+\)',
        r'\b\d+\s*[-]?pack\b',
        r'\b\d+\s*count\b',
        r'Value:.*',
        r'Unit:.*',
        r'Bullet Point \d+:',
    )
    remaining = text
    for pattern in structured_patterns:
        remaining = re.sub(pattern, '', remaining, flags=re.IGNORECASE)

    # Lowercase first so the character whitelist only needs a-z.
    alnum_only = re.sub(r'[^a-z0-9\s]', '', remaining.lower())
    return ' '.join(alnum_only.split())

In [7]:
 def add_catalog_features(df):
     """Return a copy of `df` with the extracted feature columns and `cleaned_content` appended.

     Columns left behind by an earlier run of this cell are dropped first, so
     re-running the cell does not produce duplicate column labels.
     """
     derived_columns = ['brand', 'item_name', 'pack_quantity', 'item_size',
                        'total_value', 'cleaned_content']
     base = df.drop(columns=derived_columns, errors='ignore')

     # `progress_apply` is provided by the earlier `tqdm.pandas()` call.
     extracted_features = base['catalog_content'].progress_apply(extract_features)
     enriched = pd.concat([base, extracted_features], axis=1)
     enriched['cleaned_content'] = enriched['catalog_content'].progress_apply(clean_text_for_embedding)
     return enriched


 df_train = add_catalog_features(df_train)
 df_test = add_catalog_features(df_test)

100%|██████████| 75000/75000 [00:20<00:00, 3655.59it/s]
100%|██████████| 75000/75000 [00:10<00:00, 6998.38it/s]
100%|██████████| 75000/75000 [00:20<00:00, 3688.27it/s]
100%|██████████| 75000/75000 [00:10<00:00, 6884.79it/s]


In [8]:
# Write both processed frames to the Kaggle working directory without the index column.
for frame, destination in (
    (df_train, '/kaggle/working/processed_train.csv'),
    (df_test, '/kaggle/working/processed_test.csv'),
):
    frame.to_csv(destination, index=False)

In [9]:
# List the working directory to check that the processed CSV files were written.
!ls /kaggle/working

__notebook__.ipynb  processed_test.csv	processed_train.csv
