In [1]:
import pandas as pd 
# Load the train and test splits; the paths are relative to the notebook's
# working directory.
train = pd.read_csv("jl_fs/train.csv")
test = pd.read_csv("jl_fs/test.csv")

In [2]:
import re
def extract_field_keys(text):
    """
    Return the field labels found at the start of lines in ``text``.

    A field label is a run of letters, digits, spaces, tabs, apostrophes,
    hyphens and ampersands that opens a line (optionally after indentation)
    and is immediately followed by a colon, e.g. ``"Item Name"`` in
    ``"Item Name: Foo"``. Colons further along the same line are ignored.

    Parameters
    ----------
    text : str or missing value
        Raw catalog text. Anything that is not a ``str`` (``None``, ``NaN``,
        numbers, ...) yields an empty list instead of raising.

    Returns
    -------
    list of str
        Labels in order of appearance, whitespace-trimmed, duplicates kept.
    """
    if not isinstance(text, str):
        return []
    # (?m) anchors every match to a line start. Inside the label only spaces
    # and tabs are allowed as whitespace, so a label can never swallow a line
    # break and absorb the colon-free line that precedes it. The first label
    # character must be non-blank, which also rules out empty labels.
    matches = re.findall(
        r"(?m)^[ \t]*([A-Za-z0-9'\-&][A-Za-z0-9'\-& \t]*):",
        text,
    )
    return [label.strip() for label in matches]

# Collect all keys across all rows of both splits.
# The two columns are stacked with pd.concat. Writing
# `train[...] + test[...]` would instead add the Series element-wise
# (index-aligned string concatenation): row i of train gets glued to row i of
# test, and any index present in only one split becomes NaN, so many rows
# would never be scanned on their own.
all_contents = pd.concat(
    [train["catalog_content"], test["catalog_content"]],
    ignore_index=True,
).dropna()

all_keys = set()
for content in all_contents:
    all_keys.update(extract_field_keys(str(content)))

# Display the distinct keys sorted
print("Distinct field keys found:\n")
for key in sorted(all_keys):
    print("-", key)

Distinct field keys found:

- Bullet Point
- Bullet Point 1
- Bullet Point 10
- Bullet Point 11
- Bullet Point 12
- Bullet Point 13
- Bullet Point 14
- Bullet Point 15
- Bullet Point 16
- Bullet Point 17
- Bullet Point 18
- Bullet Point 19
- Bullet Point 2
- Bullet Point 20
- Bullet Point 21
- Bullet Point 22
- Bullet Point 23
- Bullet Point 24
- Bullet Point 25
- Bullet Point 26
- Bullet Point 27
- Bullet Point 28
- Bullet Point 29
- Bullet Point 3
- Bullet Point 30
- Bullet Point 4
- Bullet Point 5
- Bullet Point 6
- Bullet Point 7
- Bullet Point 8
- Bullet Point 9
- Item Name
- Item Name 1
- Item Name 2
- Item Name 3
- Product Description
- Product Description 1
- Product Description 2
- Product Description 3
- Product Description 4
- Product Description 5
- Unit
- Value


In [3]:
import re
import pandas as pd
import string

def make_simple_embedding_lines(df, typ="train"):
    """
    Reduce every ``catalog_content`` entry to one short line for embedding.

    For each row the text is processed as follows:
      - Lines starting with 'Bullet Point...' or 'Product Description...'
        are removed.
      - 'Item Name' and all numbered variants (Item Name 1, 2, ...) are
        extracted. Each captured name stops at its first comma, or at a
        following line that begins with a letter, whichever comes first.
      - The 'Value:' and 'Unit:' field lines are extracted and joined as
        '{Value} {Unit}'; each stops at the first comma or end of line.
      - The parts are joined, every punctuation character except the full
        stop '.' is removed and whitespace is collapsed. The final text
        therefore reads 'Item Name Value Unit': the joining comma and the
        ' | ' between several item names do not survive the punctuation
        stripping.

    A field that is present but empty (for example 'Value:' directly
    followed by a line break) contributes nothing to the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'sample_id' and 'catalog_content', plus 'price' unless
        ``typ == "test"``. The input frame is not modified.
    typ : str, default "train"
        "test" leaves the 'price' column out of the result; any other value
        keeps it.

    Returns
    -------
    pandas.DataFrame
        Columns [sample_id, catalog_content, price] (no price for "test"),
        with catalog_content replaced by the cleaned text.
    """

    # Translation table that deletes every punctuation character except '.'
    punctuation_to_remove = string.punctuation.replace(".", "")
    PUNCT_TRANSLATOR = str.maketrans("", "", punctuation_to_remove)

    # Patterns are compiled once here instead of being re-parsed per row.
    # [^\S\n] means "whitespace other than a line break": after a field's
    # colon only same-line whitespace may be skipped, otherwise an empty
    # field would capture the content of the next line.
    DROP_LINES_RE = re.compile(
        r'(?mi)^\s*(Bullet Point(?:\s*\d+)?|Product Description(?:\s*\d+)?):.*$'
    )
    # NOTE(review): the lookahead cuts every item name at its first comma;
    # this is kept from the original implementation - confirm it is intended.
    ITEM_NAME_RE = re.compile(
        r'Item Name(?:\s*\d*)?:[^\S\n]*(.*?)\s*(?=,|\n[A-Za-z].*?:|$)',
        flags=re.DOTALL,
    )
    # Anchored to a line start so that "Value:" / "Unit:" appearing inside
    # another field's text cannot shadow the real field line.
    VALUE_RE = re.compile(r'(?mi)^[^\S\n]*Value:[^\S\n]*([^\n,]+)')
    UNIT_RE = re.compile(r'(?mi)^[^\S\n]*Unit:[^\S\n]*([^\n,]+)')

    def clean_one(text: str) -> str:
        """Return the cleaned one-line text for a single catalog entry."""
        if not isinstance(text, str):
            return ""

        # 1) Remove Bullet Point and Product Description lines, then squeeze
        #    the blank lines they leave behind.
        text = DROP_LINES_RE.sub('', text)
        text = re.sub(r'\n+', '\n', text).strip()

        # 2) Extract all Item Name variants (Item Name, Item Name 1, 2, etc.)
        item_names = [m.strip() for m in ITEM_NAME_RE.findall(text) if m.strip()]
        item_part = " | ".join(item_names)

        # 3) Extract Value and Unit
        m_val = VALUE_RE.search(text)
        m_unit = UNIT_RE.search(text)
        val = m_val.group(1).strip() if m_val else ""
        unit = m_unit.group(1).strip() if m_unit else ""
        vu_part = f"{val} {unit}".strip()

        # 4) Combine the non-empty parts
        combined = ", ".join(part for part in (item_part, vu_part) if part)

        # 5) Remove all punctuation except '.' and normalize whitespace
        combined = combined.translate(PUNCT_TRANSLATOR)
        return re.sub(r"\s+", " ", combined).strip()

    # Work on a copy so the caller's frame keeps its raw catalog_content.
    df = df.copy()
    df["catalog_content"] = df["catalog_content"].astype(str).map(clean_one)
    if typ == "test":
        return df[["sample_id", "catalog_content"]]
    return df[["sample_id", "catalog_content", "price"]]

In [4]:
# Build the reduced text for both splits; the "test" variant omits the price
# column from the returned frame.
train_clean = make_simple_embedding_lines(df=train, typ="train")
test_clean = make_simple_embedding_lines(df=test, typ="test")

In [5]:
# Write both cleaned splits to CSV in the working directory; index=False keeps
# the DataFrame index out of the files.
train_clean.to_csv("train_cleaned.csv", index = False)
test_clean.to_csv("test_cleaned.csv", index = False)