In [85]:
from datasets import load_dataset
import pandas as pd
import re
from pydantic import BaseModel
from fastapi import FastAPI

# Load the ASOS e-commerce dataset through the `datasets` library; the result
# is indexed by split name in the next cell (ds["train"]).
ds = load_dataset("UniqueData/asos-e-commerce-dataset")

In [86]:
# Convert the "train" split to a pandas DataFrame and preview its first rows.
df = ds["train"].to_pandas()
df.head()

Unnamed: 0,url,name,size,category,price,color,sku,description,images
0,https://www.asos.com/stradivarius/stradivarius...,New Look trench coat in camel,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...",New Look trench coat in camel,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
1,https://www.asos.com/stradivarius/stradivarius...,New Look trench coat in camel,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...",New Look trench coat in camel,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
2,https://www.asos.com/asos-design/asos-design-l...,New Look trench coat in camel,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...",New Look trench coat in camel,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
3,https://www.asos.com/new-look/new-look-trench-...,New Look trench coat in camel,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...",New Look trench coat in camel,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
4,https://www.asos.com/stradivarius/stradivarius...,Stradivarius double breasted wool coat in grey,"XS - UK 6,S - UK 8,M - UK 10,L - UK 12,XL - UK 14",Stradivarius double breasted wool coat in grey,59.99,GREY,123650194.0,[{'Product Details': 'Coats & Jackets by Strad...,['https://images.asos-media.com/products/strad...


In [87]:
# (rows, columns) of the raw frame, before any rows are dropped.
df.shape

(30845, 9)

In [88]:
# Drop every row that has a missing value in any column; mutates df in place.
df.dropna(inplace=True)

In [89]:
# Derive a brand slug from each product URL: splitting on '/' puts the first
# path segment after the host at index 3 (e.g. ".../stradivarius/..." ->
# "stradivarius"). URLs with fewer than four parts get None.
# NOTE(review): in the preview above, row 0 has a "stradivarius" URL but a
# "New Look" product name -- confirm the URL is the right source for brand.
url_segments = (link.split('/') for link in df['url'])
extracted_brands = [
    segments[3] if len(segments) > 3 else None
    for segments in url_segments
]

df['brand'] = extracted_brands

In [90]:
# Count listings per brand, then keep only brands with more than 50 listings.
brand_counts = df['brand'].value_counts()
popular_brands = brand_counts.loc[brand_counts.gt(50)]
popular_brands

brand
asos-design              4779
topshop                  1585
asos-curve               1011
miss-selfridge            693
collusion                 657
                         ... 
calvin-klein               53
fred-perry                 53
wednesdays-girl-curve      52
pretty-lavish-curve        52
noisy-may-curve            51
Name: count, Length: 120, dtype: int64

In [91]:
# Brand slugs to keep for the reduced catalogue.
# NOTE(review): presumably the 15 most frequent brands from brand_counts; only
# the first five can be confirmed from the truncated output above.
top_15_brands = [
    "asos-design",
    "topshop",
    "asos-curve",
    "miss-selfridge",
    "collusion",
    "river-island",
    "monki",
    "bershka",
    "asos-petite",
    "asos-tall",
    "new-look",
    "stradivarius",
    "asos-edition",
    "asyou",
    "adidas-originals"
]

# Keep only rows whose brand is in the list. The explicit .copy() makes the
# result an independent frame, so the column assignments in later cells do not
# operate on a slice of the unfiltered frame (avoids SettingWithCopyWarning).
df = df[df["brand"].isin(top_15_brands)].copy()


In [92]:
# (rows, columns) after restricting the frame to the selected brands.
df.shape

(13644, 10)

In [93]:
def extract_sizes(row):
    """Split a raw size string into in-stock and out-of-stock UK sizes.

    The string is split on commas; for each piece the first ``UK <digits>``
    occurrence is taken as the size. Pieces containing the text
    ``Out of stock`` go to the out-of-stock list, all others to the in-stock
    list. Pieces without a ``UK <digits>`` match are ignored.

    Args:
        row: The raw size text (comma-separated), or a missing value.

    Returns:
        A tuple ``(in_stock, out_stock)`` of lists of ints, in the order the
        sizes appear. Both lists are empty when ``row`` is missing.
    """
    available, unavailable = [], []

    if not pd.isna(row):
        for entry in row.split(','):
            found = re.search(r"UK\s*(\d+)", entry)
            if found is None:
                continue

            # Route the size to whichever list matches its stock marker.
            bucket = unavailable if "Out of stock" in entry else available
            bucket.append(int(found.group(1)))

    return available, unavailable

# Parse each raw size string once, yielding an (in_stock, out_stock) tuple per
# row, then project the two halves into their own list-valued columns. This
# avoids constructing a pd.Series object for every row inside .apply, which is
# far slower and produces the same two columns.
size_pairs = df['size'].map(extract_sizes)
df['in_stock_size'] = size_pairs.map(lambda pair: pair[0])
df['out_stock_size'] = size_pairs.map(lambda pair: pair[1])

df[['size', 'in_stock_size', 'out_stock_size']].head()

Unnamed: 0,size,in_stock_size,out_stock_size
0,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...","[4, 6, 8, 10, 12, 16, 18]",[14]
1,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...","[4, 6, 8, 10, 12, 16, 18]",[14]
2,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...","[4, 6, 8, 10, 12, 16, 18]",[14]
3,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...","[4, 6, 8, 10, 12, 16, 18]",[14]
4,"XS - UK 6,S - UK 8,M - UK 10,L - UK 12,XL - UK 14","[6, 8, 10, 12, 14]",[]


In [94]:
# (rows, columns) after adding the two parsed size columns.
df.shape

(13644, 12)

In [95]:
# Remove the raw columns in place; brand and the parsed size lists were derived
# from url and size in earlier cells.
df.drop(columns=['url', 'name', 'size', 'category'], inplace=True)

In [96]:
# Preview the frame after dropping url, name, size and category.
df.head()

Unnamed: 0,price,color,sku,description,images,brand,in_stock_size,out_stock_size
0,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...,stradivarius,"[4, 6, 8, 10, 12, 16, 18]",[14]
1,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...,stradivarius,"[4, 6, 8, 10, 12, 16, 18]",[14]
2,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...,asos-design,"[4, 6, 8, 10, 12, 16, 18]",[14]
3,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...,new-look,"[4, 6, 8, 10, 12, 16, 18]",[14]
4,59.99,GREY,123650194.0,[{'Product Details': 'Coats & Jackets by Strad...,['https://images.asos-media.com/products/strad...,stradivarius,"[6, 8, 10, 12, 14]",[]


In [97]:
# Normalise the price text to floats: strip the words "from"/"now" (any case),
# remove thousands separators, trim whitespace, then convert.
# .astype(str) makes the cell safe to re-run: once price is already float the
# .str accessor would otherwise raise AttributeError.
df['price'] = (
    df['price']
      .astype(str)
      .str.replace(r'\b(from|now)\b', '', regex=True, case=False)
      .str.replace(',', '', regex=False)  # literal comma, not a pattern
      .str.strip()
      .astype(float)
)

In [98]:
# top_colors = df["color"].value_counts().head(12).index
# df = df[df["color"].isin(top_colors)]

In [99]:
# Cast sku to integers (the previews above show it stored as floats such as
# 126704571.0).
df['sku'] = df['sku'].astype(int)

In [100]:
# Columns carried into the exported subset, in output order.
export_columns = [
    'sku',
    'brand',
    'color',
    'price',
    'in_stock_size',
    'out_stock_size',
    'description',
    'images',
]

df_small = df.loc[:, export_columns]


In [101]:
# Record the size before filtering so the drop count below is meaningful.
original_row_count = len(df_small)

# The size columns hold Python lists, so emptiness is tested by length;
# comparing a list to the string '[]' never matches and would drop nothing.
has_in_stock = df_small['in_stock_size'].map(len) > 0
has_out_stock = df_small['out_stock_size'].map(len) > 0

# Keep a row unless BOTH size lists are empty (no UK size could be parsed).
rows_to_keep = has_in_stock | has_out_stock

df_small = df_small[rows_to_keep]

new_row_count = len(df_small)
dropped_rows_count = original_row_count - new_row_count

In [102]:
# Display the reduced frame (the rendered output below is truncated).
df_small

Unnamed: 0,sku,brand,color,price,in_stock_size,out_stock_size,description,images
0,126704571,stradivarius,Neutral,49.99,"[4, 6, 8, 10, 12, 16, 18]",[14],[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
1,126704571,stradivarius,Neutral,49.99,"[4, 6, 8, 10, 12, 16, 18]",[14],[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
2,126704571,asos-design,Neutral,49.99,"[4, 6, 8, 10, 12, 16, 18]",[14],[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
3,126704571,new-look,Neutral,49.99,"[4, 6, 8, 10, 12, 16, 18]",[14],[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
4,123650194,stradivarius,GREY,59.99,"[6, 8, 10, 12, 14]",[],[{'Product Details': 'Coats & Jackets by Strad...,['https://images.asos-media.com/products/strad...
...,...,...,...,...,...,...,...,...
30838,122463785,asos-design,MULTI,44.00,[8],"[4, 6, 10, 12, 14, 16, 18]",[{'Product Details': 'Dresses by ASOS DESIGNTh...,['https://images.asos-media.com/products/asos-...
30839,120479167,asos-design,CHOCOLATE,55.00,[8],"[4, 6, 10, 12, 14, 16, 18]",[{'Product Details': 'Dresses by ASOS DESIGNAl...,['https://images.asos-media.com/products/asos-...
30841,1444255,asos-design,Black,24.00,[10],"[4, 6, 8, 12, 14, 16, 18]",[{'Product Details': 'Dress by ASOS DESIGN Act...,['https://images.asos-media.com/products/asos-...
30842,110783769,asyou,Washed black,22.99,[6],"[4, 8, 10, 12, 14, 16, 18, 20, 22, 24]",[{'Product Details': 'Dress by ASYOU Exclusive...,['https://images.asos-media.com/products/asyou...


In [103]:
# Write the subset to products_small.csv without the index column.
# Note: the list-valued size columns are written as their text form
# (e.g. "[4, 6, 8]"), so a reader must parse them back into lists.
df_small.to_csv("products_small.csv", index=False)