In [1]:
# Cell 1: Setup and Load Raw Data
import pandas as pd
import numpy as np
import os

RAW_DATA_PATH = os.path.join('data', 'intern_data_ikarus.csv')
CLEANED_DATA_PATH = os.path.join('data', 'cleaned_products.csv')

print(f"Loading raw data from: {RAW_DATA_PATH}")
try:
    df = pd.read_csv(RAW_DATA_PATH)
    print(" Raw dataset loaded successfully.")
except FileNotFoundError:
    print(f"ERROR: File not found. Make sure '{RAW_DATA_PATH}' exists.")
    df = pd.DataFrame()

if not df.empty:
    print(f"Raw data has {df.shape[0]} rows and {df.shape[1]} columns.")
    display(df.head())

Loading raw data from: data/intern_data_ikarus.csv
 Raw dataset loaded successfully.
Raw data has 312 rows and 12 columns.


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",$24.99,"['Home & Kitchen', 'Storage & Organization', '...",['https://m.media-amazon.com/images/I/416WaLx1...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded
1,"subrtex Leather ding Room, Dining Chairs Set o...",subrtex,subrtex Dining chairs Set of 2,,"['Home & Kitchen', 'Furniture', 'Dining Room F...",['https://m.media-amazon.com/images/I/31SejUEW...,Subrtex Houseware INC,"18.5""D x 16""W x 35""H",,Sponge,Black,5938d217-b8c5-5d3e-b1cf-e28e340f292e
2,Plant Repotting Mat MUYETOL Waterproof Transpl...,MUYETOL,,$5.98,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/41RgefVq...,MUYETOL,"26.8""L x 26.8""W",,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8
3,"Pickleball Doormat, Welcome Doormat Absorbent ...",VEWETOL,The decorative doormat features a subtle textu...,$13.99,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/61vz1Igl...,Contrence,"24""L x 16""W",,Rubber,A5589,8fd9377b-cfa6-5f10-835c-6b8eca2816b5
4,JOIN IRON Foldable TV Trays for Eating Set of ...,JOIN IRON Store,Set of Four Folding Trays With Matching Storag...,$89.99,"['Home & Kitchen', 'Furniture', 'Game & Recrea...",['https://m.media-amazon.com/images/I/41p4d4VJ...,,"18.9""D x 14.2""W x 26""H",,Iron,Grey Set of 4,bdc9aa30-9439-50dc-8e89-213ea211d66a


In [2]:
# Cell 2: Clean the Data (MOST AGGRESSIVE & INTELLIGENT VERSION)
import re
import string

def ultimate_text_cleaner(text):
    """
    An even more aggressive, multi-stage cleaning pipeline for product data.
    """
    if not isinstance(text, str) or len(text) < 10:
        return "No description available."

    # Stage 1: Initial Lowercasing for uniform processing
    text = text.lower()

    # Stage 2: Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Stage 3: Remove model numbers, SKUs, and gibberish alphanumeric codes.
    # e.g., 'ethr8501s2', 's2s3s1s1s', 'het02bpbk'
    text = re.sub(r'\b[a-z]*[0-9]+[a-z0-9]*\b', '', text)

    # Stage 4: Remove measurement units and dimensions.
    # e.g., '39"', '24 inch', '4.2" x 4.6"'
    text = re.sub(r'\b\d+(\.\d+)?\s*(inch|cm|mm|ft|")\b', '', text)
    text = re.sub(r'\b\d+(\.\d+)?\s*x\s*\d+(\.\d+)?\b', '', text)

    # Stage 5: Handle repetitive sentence structures (as before)
    first_sentence_match = re.search(r'[^.!?]*[.!?]', text)
    if first_sentence_match:
        first_sentence = first_sentence_match.group(0).strip()
        remaining_text = text.replace(first_sentence, "").strip()
        simplified_sentence = first_sentence.replace(" ", "")
        simplified_remaining = remaining_text.replace(" ", "")
        if simplified_remaining.startswith(simplified_sentence):
            text = first_sentence

    # Stage 6: Remove all special characters, keeping only letters, spaces, and basic sentence punctuation.
    # This is more aggressive and removes characters like ',' ':' ';' etc.
    text = re.sub(f'[^{re.escape(string.ascii_letters + " .!?")}]', '', text)

    # Stage 7: Standardize whitespace and remove leading/trailing punctuation
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.strip(string.punctuation + " ") # Remove any leftover punctuation at the start/end

    # Stage 8: Quality Gate - Ensure the cleaned text is meaningful
    if len(text) < 15 or len(text.split()) < 3:
        # If the result is too short, it's likely just leftover junk.
        return "No description available."
    
    # Stage 9: Final Formatting - Capitalize the first letter of the final clean sentence.
    return text.capitalize()


if not df.empty:
    print("\n--- Starting Data Cleaning Process ---")
    
    # --- Basic Cleaning Steps ---
    df.fillna({'description': 'No description available', 'title': 'No title'}, inplace=True)
    df['price'] = pd.to_numeric(df['price'].astype(str).str.replace('$', '').str.strip(), errors='coerce')
    median_price = df['price'].median()
    df['price'].fillna(median_price, inplace=True)
    df['categories'] = df['categories'].astype(str).str.extract(r"'(.*?)'").fillna('Uncategorized')
    df.drop_duplicates(subset=['uniq_id'], keep='first', inplace=True)
    df.dropna(subset=['title', 'uniq_id', 'images'], inplace=True)
    
    # --- APPLYING ULTIMATE CLEANING PIPELINE ---
    print("\nApplying ultimate cleaning pipeline to 'title' and 'description'...")
    
    df['description'] = df['description'].apply(ultimate_text_cleaner)
    
    # Also apply a simpler cleaning to the title to remove codes and standardize spacing
    df['title'] = df['title'].apply(lambda x: re.sub(r'\b[a-zA-Z]*[0-9]+[a-zA-Z0-9]*\b', '', x))
    df['title'] = df['title'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
    df['title'] = df['title'].str.title()

    print("Ultimate cleaning complete.")
    
    print("\n--- Data Cleaning Complete ---")
    

    df.to_csv(CLEANED_DATA_PATH, index=False)
    print(f"\nFinal, super-cleaned data has been saved to: {CLEANED_DATA_PATH}")

    print("\nSample of final cleaned data:")
    display(df[['title', 'description']].head())


--- Starting Data Cleaning Process ---

Applying aggressive cleaning to 'description' column to remove repetitions...
✅ Repetitive phrases removed.

Standardizing text case for 'title' and 'description' columns...
✅ Casing standardized successfully.

--- Data Cleaning Complete ---

✅ Cleaned data has been successfully saved to: data/cleaned_products.csv

Sample of cleaned and standardized data:


Unnamed: 0,title,description
0,"Goymfk 1Pc Free Standing Shoe Rack, Multi-Laye...","Multiple shoes, coats, hats, and other items e..."
1,"Subrtex Leather Ding Room, Dining Chairs Set O...",Subrtex dining chairs set of 2
2,Plant Repotting Mat Muyetol Waterproof Transpl...,No description available
3,"Pickleball Doormat, Welcome Doormat Absorbent ...",The decorative doormat features a subtle textu...
4,Join Iron Foldable Tv Trays For Eating Set Of ...,Set of four folding trays with matching storag...
