Imports & Setup



In [1]:
import os, re, json, unicodedata, pathlib
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import WordCloud

Tokenization utilities

In [3]:
# --- Tokenizer used for text fields ---
# Module-level resources shared by normalize_text() and build_terms() below.

# English stopword set. If the lookup fails with LookupError, the corpus is
# downloaded once via nltk.download("stopwords") and the lookup is retried.
try:
    _STOP = set(stopwords.words("english"))
except LookupError:
    import nltk
    nltk.download("stopwords")
    _STOP = set(stopwords.words("english"))

# Single stemmer instance reused for every token.
_STEM = PorterStemmer()
# Runs of characters that are neither word characters nor whitespace.
# Note that \w includes the underscore, so "_" is not treated as punctuation.
_PUNCT = re.compile(r"[^\w\s]+")

def normalize_text(text: str) -> str:
    """Return a cleaned, lowercase version of ``text``.

    The text is NFKC-normalized and lowercased, every run of punctuation
    (as matched by ``_PUNCT``) is replaced by a single space, and whitespace
    runs are collapsed to one space with the ends trimmed.

    Anything that is not a ``str`` (e.g. ``None``) yields the empty string.
    """
    if isinstance(text, str):
        folded = unicodedata.normalize("NFKC", text).lower()
        spaced = _PUNCT.sub(" ", folded)
        return re.sub(r"\s+", " ", spaced).strip()
    return ""

def build_terms(text: str) -> list[str]:
    """Turn raw text into a list of index terms.

    Steps, applied per whitespace-separated word of ``normalize_text(text)``:
    drop words found in ``_STOP``, stem the rest with ``_STEM``, then discard
    stems that are a single character or consist only of digits.
    The original word order is preserved.
    """
    terms = []
    for word in normalize_text(text).split():
        if word in _STOP:
            continue
        stem = _STEM.stem(word)
        # Length/digit filtering is applied to the stem, not the raw word.
        if len(stem) <= 1 or stem.isdigit():
            continue
        terms.append(stem)
    return terms


Preprocessing functions

In [4]:
def _norm(s):
    """Normalize a categorical text field (category, sub-category, seller).

    Delegates to ``normalize_text`` so that there is a single definition of
    the cleaning rules (NFKC, lowercase, punctuation to spaces, whitespace
    collapsed). Non-string input returns the empty string.
    """
    return normalize_text(s)

def _num(x):
    if x is None: return None
    s = str(x).replace(",", "")
    m = re.findall(r"\d+(?:\.\d+)?", s)
    return float(m[0]) if m else None

def _disc(x):
    if x is None: return None
    m = re.search(r"(\d+(?:\.\d+)?)", str(x))
    return float(m.group(1)) / 100.0 if m else None

def _rating(x):
    try: return float(x)
    except Exception: return None

def _details_tokens(lst):
    """Tokenize the keys and values of a product-details structure.

    Accepts either a single dict or a list whose dict elements are used
    (non-dict elements are ignored). Any other input yields an empty list.

    For each key/value pair the key's terms are emitted first, then the
    value's terms, both produced by ``build_terms``. Keys or values that are
    ``None`` are skipped: converting them with ``str()`` would give "None",
    which would otherwise be indexed as the spurious term "none".
    """
    if isinstance(lst, dict):
        pairs = list(lst.items())
    elif isinstance(lst, list):
        pairs = [kv for d in lst if isinstance(d, dict) for kv in d.items()]
    else:
        pairs = []

    out = []
    for k, v in pairs:
        for field in (k, v):
            if field is None:
                continue
            # str() lets non-string values (numbers, bools) be tokenized too.
            out += build_terms(str(field))
    return out

def preprocess_row(doc: dict) -> dict:
    """Build one cleaned record from a raw product document.

    The result keeps several raw fields untouched (``*_raw``, ``pid``,
    ``url``), adds normalized categorical fields, token lists for title,
    description and product details, and parsed numeric fields.
    Missing keys fall back to empty values or ``None``.
    """
    raw_title = doc.get("title", "")
    raw_desc = doc.get("description", "")
    raw_details = doc.get("product_details", {})
    raw_discount = doc.get("discount")

    # Brand is only NFKC-normalized, lowercased and trimmed (no punctuation
    # stripping or tokenizing), so single-letter or special brand names are
    # kept. Missing, empty or non-string brands become "unknown".
    brand = "unknown"
    raw_brand = doc.get("brand", "")
    if isinstance(raw_brand, str):
        cleaned_brand = unicodedata.normalize("NFKC", raw_brand).lower().strip()
        if cleaned_brand != "":
            brand = cleaned_brand

    # Key order is significant: it becomes the column order of the DataFrame.
    return {
        "pid": doc.get("pid"),
        "title_raw": raw_title,
        "description_raw": raw_desc,
        "product_details_raw": raw_details,
        "discount_raw": raw_discount,
        "url": doc.get("url", ""),
        "brand": brand,
        "category": _norm(doc.get("category", "")),
        "sub_category": _norm(doc.get("sub_category", "")),
        "seller": _norm(doc.get("seller", "")),
        "title_tokens": build_terms(raw_title),
        "desc_tokens": build_terms(raw_desc),
        "details_tokens": _details_tokens(raw_details),
        "out_of_stock": bool(doc.get("out_of_stock", False)),
        "selling_price": _num(doc.get("selling_price")),
        "actual_price": _num(doc.get("actual_price")),
        "discount_frac": _disc(raw_discount),
        "average_rating": _rating(doc.get("average_rating")),
    }

def preprocess_jsonl(input_path, output_parquet):
    """Preprocess a product dump and write the result as a parquet file.

    Parameters
    ----------
    input_path : path to a UTF-8 text file containing either a single JSON
        document that iterates over product dicts (e.g. a JSON array), or
        JSON Lines (one JSON object per non-blank line).
    output_parquet : destination parquet path; missing parent directories
        are created.

    Returns
    -------
    pandas.DataFrame with one row per document, as built by
    ``preprocess_row``.

    Raises
    ------
    json.JSONDecodeError if the file is neither valid JSON nor valid JSONL
    (including an empty file).
    """
    with open(input_path, "r", encoding="utf-8") as f:
        raw = f.read()

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # Not a single JSON document: fall back to JSON Lines, as the
        # function name promises. Split on "\n" only, because
        # str.splitlines() would also break on characters such as U+2028
        # that may appear unescaped inside JSON strings.
        lines = [ln for ln in raw.split("\n") if ln.strip()]
        if not lines:
            raise  # empty input: surface the original decode error
        data = [json.loads(ln) for ln in lines]

    rows = [preprocess_row(doc) for doc in data]
    df = pd.DataFrame(rows)
    pathlib.Path(output_parquet).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_parquet, index=False)
    return df


Run preprocessing

Place the raw data files in the `data/raw/` folder. The processed output is written to `data/processed/`, which is created automatically if it does not exist.

In [10]:
# Raw product dump (input) and cleaned parquet file (output).
INP = "data/raw/fashion_products_dataset.json"
OUT = "data/processed/products_clean.parquet"

# Runs the full preprocessing and also writes OUT to disk as a side effect.
df = preprocess_jsonl(INP, OUT)
print(f"Processed {len(df)} documents with {len(df.columns)} columns")
# Last expression of the cell: displays the first three processed rows.
df.head(3)


Processed 28080 documents with 18 columns


Unnamed: 0,pid,title_raw,description_raw,product_details_raw,discount_raw,url,brand,category,sub_category,seller,title_tokens,desc_tokens,details_tokens,out_of_stock,selling_price,actual_price,discount_frac,average_rating
0,TKPFCZ9EA7H5FYZH,Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",69% off,https://www.flipkart.com/yorker-solid-men-mult...,york,clothing and accessories,bottomwear,shyam enterprises,"[solid, women, multicolor, track, pant]","[yorker, trackpant, made, rich, comb, cotton, ...","[style, code, 1005combo2, closur, elast, pocke...",False,921.0,2999.0,0.69,3.9
1,TKPFCZ9EJZV2UVRZ,Solid Men Blue Track Pants,Yorker trackpants made from 100% rich combed c...,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",66% off,https://www.flipkart.com/yorker-solid-men-blue...,york,clothing and accessories,bottomwear,shyam enterprises,"[solid, men, blue, track, pant]","[yorker, trackpant, made, rich, comb, cotton, ...","[style, code, 1005blue, closur, drawstr, elast...",False,499.0,1499.0,0.66,3.9
2,TKPFCZ9EHFCY5Z4Y,Solid Men Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",68% off,https://www.flipkart.com/yorker-solid-men-mult...,york,clothing and accessories,bottomwear,shyam enterprises,"[solid, men, multicolor, track, pant]","[yorker, trackpant, made, rich, comb, cotton, ...","[style, code, 1005combo4, closur, elast, pocke...",False,931.0,2999.0,0.68,3.9


Validate the output

In [11]:
# Sanity check: every pid listed in the validation labels must exist in the
# processed frame. `missing` holds the label rows whose pid was not found.
val = pd.read_csv("data/raw/validation_labels.csv")
missing = val[~val["pid"].isin(df["pid"])]
print("Missing PIDs:", len(missing))
# Last expression of the cell: shows (rows, columns) of the processed frame.
df.shape


Missing PIDs: 0


(28080, 18)

Exploratory Data Analysis

Generates summary statistics, token frequencies, plots, and a word cloud.
Outputs are stored in project_progress/part_1/

In [13]:
# NOTE: INP is re-pointed here from the raw JSON (preprocessing cell) to the
# processed parquet file written by that cell.
INP = "data/processed/products_clean.parquet"
OUTDIR = "project_progress/part_1"
os.makedirs(OUTDIR, exist_ok=True)

# ---------- Load data ----------
# Reloads from disk, replacing the in-memory `df` from the preprocessing cell.
df = pd.read_parquet(INP)

# ---------- Summary statistics ----------
# Values are cast to plain int/float so that json.dump can serialize them.
summary = {
    "docs": int(len(df)),
    "unique_brands": int(df["brand"].nunique()),
    "unique_categories": int(df["category"].nunique()),
    "avg_price": float(df["selling_price"].dropna().mean()),
    "avg_discount_frac": float(df["discount_frac"].dropna().mean()),
    "avg_rating": float(df["average_rating"].dropna().mean()),
    # Mean of a boolean column is the fraction of True values.
    "out_of_stock_pct": float(100 * df["out_of_stock"].mean()),
}

print(summary)
with open(os.path.join(OUTDIR, "summary.json"), "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

# ---------- Top brands ----------
# 20 most frequent brands as a two-column frame (brand, count). The
# placeholder value "unknown" assigned by preprocess_row is counted as well.
top_brands = (
    df["brand"]
    .value_counts()
    .head(20)
    .rename_axis("brand")
    .reset_index(name="count")
)
top_brands.to_csv(os.path.join(OUTDIR, "top_brands.csv"), index=False)
print(top_brands.head(10))

# ---------- Token frequencies ----------
# Corpus-wide term counts, flattened over the per-document token lists.
v_title = Counter(t for ts in df["title_tokens"] for t in ts)
v_desc = Counter(t for ts in df["desc_tokens"] for t in ts)

# Only the 50 most frequent terms of each field are exported.
pd.DataFrame(v_title.most_common(50), columns=["term", "freq"]).to_csv(
    os.path.join(OUTDIR, "top_terms_title.csv"), index=False
)
pd.DataFrame(v_desc.most_common(50), columns=["term", "freq"]).to_csv(
    os.path.join(OUTDIR, "top_terms_desc.csv"), index=False
)

# ---------- Distributions ----------
def _save_hist(series, bins, title, xlabel, filename):
    """Plot a histogram of ``series`` and save it under OUTDIR.

    Missing values are dropped before plotting. The figure is saved as
    ``filename`` inside ``OUTDIR`` and then cleared, so the next plot starts
    from an empty figure.
    """
    ax = series.dropna().plot(kind="hist", bins=bins, title=title)
    ax.set_xlabel(xlabel)
    plt.tight_layout()
    plt.savefig(os.path.join(OUTDIR, filename))
    plt.clf()


# Selling price
_save_hist(
    df["selling_price"], 40, "Selling Price Distribution", "Price",
    "fig_price_hist.png",
)

# Discount
_save_hist(
    df["discount_frac"], 40, "Discount Fraction Distribution", "Discount (0–1)",
    "fig_discount_hist.png",
)

# Rating
_save_hist(
    df["average_rating"], 20, "Average Rating Distribution", "Rating",
    "fig_rating_hist.png",
)

# ---------- Description and title length ----------
# Lengths are measured in processed tokens (after stopword removal/stemming).
df["desc_len"] = df["desc_tokens"].apply(len)
df["title_len"] = df["title_tokens"].apply(len)

_save_hist(
    df["desc_len"], 40, "Description Length Distribution", "Number of tokens",
    "fig_desc_length.png",
)

# ---------- Vocabulary statistics ----------
# Built from title tokens only (same expression as v_title earlier in this
# cell), so the reported size is the title vocabulary, not the whole corpus.
vocab = Counter(t for ts in df["title_tokens"] for t in ts)
print("Vocabulary size:", len(vocab))
print("Most frequent tokens:", vocab.most_common(20))

# ---------- Top brands and categories ----------
# Horizontal bar chart of the five most frequent brand values.
df["brand"].value_counts().head(5).plot(kind="barh", title="Top 5 Brands")
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR, "fig_top_brands.png"))
plt.clf()

# Shows at most four categories; the run recorded below reports four unique
# categories, so for that data the chart covers all of them.
df["category"].value_counts().head(4).plot(kind="barh", title="Categories")
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR, "fig_top_categories.png"))
plt.clf()

# ---------- Word cloud ----------
# The word cloud layout is randomized; a fixed random_state makes the saved
# figure reproducible across Restart & Run All.
wc = WordCloud(width=1000, height=600, random_state=42).generate(
    # Input is the stemmed title tokens joined back into one string.
    " ".join(t for ts in df["title_tokens"] for t in ts)
)
plt.imshow(wc)
plt.axis("off")
plt.title("Most Common Tokens in Product Titles")
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR, "fig_wordcloud_titles.png"))
plt.clf()

print("EDA complete. Outputs saved to:", OUTDIR)


{'docs': 28080, 'unique_brands': 322, 'unique_categories': 4, 'avg_price': 705.6350879692286, 'avg_discount_frac': 0.5025689623507805, 'avg_rating': 3.6277237693171696, 'out_of_stock_pct': 5.854700854700854}
          brand  count
0       unknown   2009
1      ecko unl    951
2  free authori    860
3          arbo    806
4          reeb    802
5            pu    798
6       true bl    793
7           keo    660
8           amp    585
9    black beat    548
Vocabulary size: 656
Most frequent tokens: [('shirt', 15838), ('women', 13146), ('men', 13060), ('neck', 11952), ('solid', 9246), ('round', 8301), ('print', 8123), ('pack', 4839), ('blue', 4378), ('fit', 3923), ('black', 3503), ('casual', 2927), ('slim', 2893), ('polo', 2668), ('collar', 2427), ('multicolor', 2373), ('white', 2175), ('sleev', 2026), ('full', 2011), ('grey', 1917)]
EDA complete. Outputs saved to: project_progress/part_1


<Figure size 640x480 with 0 Axes>

Example document

Shows an example of a document after preprocessing

In [14]:
# First row of the frame as a plain dict (column name -> value).
# NOTE(review): in the printed output, product_details_raw contains many keys
# with None values rather than only this product's own details. This looks
# like the parquet round trip merging the per-product dict keys into one
# schema; confirm before relying on this column.
example = df.iloc[0].to_dict()
for k, v in example.items():
    # Pad the field name to 20 characters so the values line up.
    print(f"{k:20}: {v}")


pid                 : TKPFCZ9EA7H5FYZH
title_raw           : Solid Women Multicolor Track Pants
description_raw     : Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India
product_details_raw : [{'': None, ' ': None, 'Alteration Required': None, 'Animal Source': None, 'Belt Loops': None, 'Bottom Fabric': None, 'Bottom Length': None, 'Bottom Type': None, 'Brand': None, 'Brand Color': None, 'Brand Fit': None, 'Bust in inch': None, 'Care Instructions': None, 'Care instructions': None, 'Character': None, 'Clasp Material': None, 'Clasp Type': None, 'Closure': None, 'Coat Type': None, 'Collar': None, 'Color': None, 'Country of Origin': None, 'Covered in Warranty': None, 'Cuff': None, 'Depth': None, 'Design': None, 'Distressed': None, 'Domestic Warranty': None, 'Dupatta Length': None, 'Fabric': None, 'Fabric Care': None, 'Fabric Details': None, 'Fabric care': 

Outcomes

The preprocessing successfully normalized text, cleaned numeric fields, and retained meaningful brand names.
The EDA revealed 28,080 products, 322 unique brands, an average price of ₹706, and common tokens such as shirt, men, women, and cotton.