In [103]:
from datasets import load_dataset
import pandas as pd
import re
from pydantic import BaseModel
from fastapi import FastAPI

# Load the ASOS e-commerce dataset through the `datasets` library; the returned
# object is indexed by split name further down (ds["train"]).
ds = load_dataset("UniqueData/asos-e-commerce-dataset")

In [104]:
# Materialise the "train" split as a pandas DataFrame and preview the first rows.
df = ds["train"].to_pandas()
df.head()

Unnamed: 0,url,name,size,category,price,color,sku,description,images
0,https://www.asos.com/stradivarius/stradivarius...,New Look trench coat in camel,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...",New Look trench coat in camel,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
1,https://www.asos.com/stradivarius/stradivarius...,New Look trench coat in camel,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...",New Look trench coat in camel,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
2,https://www.asos.com/asos-design/asos-design-l...,New Look trench coat in camel,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...",New Look trench coat in camel,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
3,https://www.asos.com/new-look/new-look-trench-...,New Look trench coat in camel,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...",New Look trench coat in camel,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
4,https://www.asos.com/stradivarius/stradivarius...,Stradivarius double breasted wool coat in grey,"XS - UK 6,S - UK 8,M - UK 10,L - UK 12,XL - UK 14",Stradivarius double breasted wool coat in grey,59.99,GREY,123650194.0,[{'Product Details': 'Coats & Jackets by Strad...,['https://images.asos-media.com/products/strad...


In [105]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(30845, 9)

In [106]:
# Row-level clean-up, applied in this order:
#   1. discard rows containing any missing value,
#   2. discard rows that are exact duplicates of an earlier row,
#   3. discard rows whose SKU was already seen (first occurrence is kept).
df = (
    df
    .dropna()
    .drop_duplicates()
    .drop_duplicates(subset=['sku'])
)
df.shape

(29971, 9)

In [107]:
# Derive a brand slug from each product URL.  Splitting on '/' puts the first
# path segment at position 3 (e.g. "stradivarius" in
# https://www.asos.com/stradivarius/...); URLs with fewer pieces get None.
# NOTE(review): the URL slug does not always agree with the product name -- the
# first previewed row pairs a "New Look" product with a stradivarius URL -- so
# the brand derived here may be wrong for some rows; verify against `name`.
extracted_brands = [
    pieces[3] if len(pieces) > 3 else None
    for pieces in (link.split('/') for link in df['url'])
]

df['brand'] = extracted_brands

In [108]:
# Number of products per brand slug, then only the brands with more than 50 products.
brand_counts = df['brand'].value_counts()
popular_brands = brand_counts.loc[brand_counts.gt(50)]
popular_brands

brand
asos-design              4697
topshop                  1518
asos-curve                994
miss-selfridge            673
collusion                 631
                         ... 
fred-perry                 53
tammy-girl                 52
french-connection          52
wednesdays-girl-curve      52
pretty-lavish-curve        51
Name: count, Length: 119, dtype: int64

In [109]:
# Brand slugs (in the form they take in the product URL) kept for the export.
# NOTE(review): hard-coded; presumably the 15 most frequent slugs from
# `brand_counts` -- confirm if the dataset changes.
top_15_brands = [
    "asos-design",
    "topshop",
    "asos-curve",
    "miss-selfridge",
    "collusion",
    "river-island",
    "monki",
    "bershka",
    "asos-petite",
    "asos-tall",
    "new-look",
    "stradivarius",
    "asos-edition",
    "asyou",
    "adidas-originals"
]

# Keep only products from the selected brands and turn each slug into a display
# name ("asos-design" -> "Asos Design").  .assign() returns a new DataFrame, so
# the brand column is never written onto a filtered slice of the previous frame
# (assigning to such a slice triggers pandas' SettingWithCopyWarning).
df = df[df["brand"].isin(top_15_brands)].assign(
    brand=lambda selected: (
        selected["brand"]
        .str.replace("-", " ", regex=False)
        .str.title()
    )
)


In [110]:
# (rows, columns) after the brand filter; the extra column is `brand`.
df.shape

(13257, 10)

In [111]:
def extract_sizes(row):
    """Split a comma-separated size listing into in-stock and out-of-stock UK sizes.

    Each comma-separated entry is searched for the first "UK <digits>" token.
    Entries without such a token are ignored.  For a range such as "UK 4-6"
    only the first number (4) is taken.

    Args:
        row: The raw size string (e.g. "UK 4,UK 6 - Out of stock"), or a
            missing value as recognised by ``pd.isna``.

    Returns:
        A tuple ``(in_stock, out_stock)`` of two lists of ints, each in the
        order the sizes appear in ``row``.  An entry counts as out of stock
        when it contains the exact text "Out of stock".  Both lists are empty
        for a missing value.
    """
    available, unavailable = [], []

    # Missing size information: nothing to parse.
    if pd.isna(row):
        return available, unavailable

    for entry in row.split(','):
        found = re.search(r"UK\s*(\d+)", entry)
        if found is None:
            continue  # entry carries no UK size (e.g. a letter-only size)

        # Route the size to the list matching the entry's stock marker.
        bucket = unavailable if "Out of stock" in entry else available
        bucket.append(int(found.group(1)))

    return available, unavailable

# Parse every size string once into an (in_stock, out_stock) pair, then unpack
# the pair into two list-valued columns.
size_pairs = df['size'].map(extract_sizes)
df['in_stock_size'] = size_pairs.map(lambda pair: pair[0])
df['out_stock_size'] = size_pairs.map(lambda pair: pair[1])

# Preview the raw string next to the parsed lists.
df[['size', 'in_stock_size', 'out_stock_size']].head()

Unnamed: 0,size,in_stock_size,out_stock_size
0,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14 - Out of stoc...","[4, 6, 8, 10, 12, 16, 18]",[14]
4,"XS - UK 6,S - UK 8,M - UK 10,L - UK 12,XL - UK 14","[6, 8, 10, 12, 14]",[]
8,"3XL - UK 18 - Out of stock,XS - UK 6,S - UK 8,...","[6, 8, 10, 12, 14]","[18, 16]"
10,"XS - UK 4-6,S - UK 8-10,M - UK 12-14,L - UK 16...","[4, 8, 12, 16, 20]",[]
14,"UK 4,UK 6,UK 8,UK 10,UK 12,UK 14,UK 16,UK 18,U...","[4, 6, 8, 10, 12, 14, 16, 18, 20]",[]


In [112]:
# `url` and `size` have been parsed into `brand` and the two size-list columns;
# drop them together with `category`.
df = df.drop(columns=['url', 'size', 'category'])

In [113]:
# Preview the frame after the column drop.
df.head()

Unnamed: 0,name,price,color,sku,description,images,brand,in_stock_size,out_stock_size
0,New Look trench coat in camel,49.99,Neutral,126704571.0,[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...,Stradivarius,"[4, 6, 8, 10, 12, 16, 18]",[14]
4,Stradivarius double breasted wool coat in grey,59.99,GREY,123650194.0,[{'Product Details': 'Coats & Jackets by Strad...,['https://images.asos-media.com/products/strad...,Stradivarius,"[6, 8, 10, 12, 14]",[]
8,JDY oversized trench coat in stone,45.0,STONE,125806824.0,[{'Product Details': 'Coats & Jackets by JDYLo...,['https://images.asos-media.com/products/jdy-o...,Asos Petite,"[6, 8, 10, 12, 14]","[18, 16]"
10,Nike Running hooded jacket in pink,84.95,Pink,121963507.0,[{'Product Details': 'Coats & Jackets by Nike ...,['https://images.asos-media.com/products/nike-...,Topshop,"[4, 8, 12, 16, 20]",[]
14,ASOS DESIGN Tall linen mix trench coat in natural,75.0,Natural,123053365.0,[{'Product Details': 'Coats & Jackets by ASOS ...,['https://images.asos-media.com/products/asos-...,Asos Design,"[4, 6, 8, 10, 12, 14, 16, 18, 20]",[]


In [114]:
# Normalise the textual price column to floats: remove the standalone words
# "from" / "now" (any letter case), remove commas (presumably thousands
# separators -- verify), trim surrounding whitespace, then convert.
cleaned_price = df['price'].str.replace(r'\b(from|now)\b', '', regex=True, case=False)
cleaned_price = cleaned_price.str.replace(',', '').str.strip()
df['price'] = cleaned_price.astype(float)

In [115]:
# SKUs are shown as floats in the previews (e.g. 126704571.0); store them as integers.
df = df.assign(sku=df['sku'].astype(int))

In [116]:
# Select the export columns, in the order they should appear in the output.
df_small = df.loc[
    :,
    [
        "sku",
        "brand",
        "name",
        "color",
        "price",
        "in_stock_size",
        "out_stock_size",
        "description",
        "images",
    ],
]


In [117]:
# Drop products for which no UK size was parsed at all (both size lists empty).
# The size columns hold Python lists, so emptiness is tested through the list
# length; comparing a list with the string '[]' is always False and would keep
# every row.
# The row count is recorded BEFORE filtering so that dropped_rows_count reports
# how many rows were actually removed.
original_row_count = len(df_small)

in_stock_empty = df_small['in_stock_size'].map(len) == 0
out_stock_empty = df_small['out_stock_size'].map(len) == 0
rows_to_keep = ~(in_stock_empty & out_stock_empty)

# .copy() detaches the filtered result, so later column assignments do not
# write onto a slice of the previous frame.
df_small = df_small[rows_to_keep].copy()

new_row_count = len(df_small)
dropped_rows_count = original_row_count - new_row_count

In [118]:
# Colour names come in mixed case ("Neutral", "GREY"); upper-case them all.
df_small = df_small.assign(color=df_small["color"].str.upper())

In [119]:
# Display the final export frame.
df_small

Unnamed: 0,sku,brand,name,color,price,in_stock_size,out_stock_size,description,images
0,126704571,Stradivarius,New Look trench coat in camel,NEUTRAL,49.99,"[4, 6, 8, 10, 12, 16, 18]",[14],[{'Product Details': 'Coats & Jackets by New L...,['https://images.asos-media.com/products/new-l...
4,123650194,Stradivarius,Stradivarius double breasted wool coat in grey,GREY,59.99,"[6, 8, 10, 12, 14]",[],[{'Product Details': 'Coats & Jackets by Strad...,['https://images.asos-media.com/products/strad...
8,125806824,Asos Petite,JDY oversized trench coat in stone,STONE,45.00,"[6, 8, 10, 12, 14]","[18, 16]",[{'Product Details': 'Coats & Jackets by JDYLo...,['https://images.asos-media.com/products/jdy-o...
10,121963507,Topshop,Nike Running hooded jacket in pink,PINK,84.95,"[4, 8, 12, 16, 20]",[],[{'Product Details': 'Coats & Jackets by Nike ...,['https://images.asos-media.com/products/nike-...
14,123053365,Asos Design,ASOS DESIGN Tall linen mix trench coat in natural,NATURAL,75.00,"[4, 6, 8, 10, 12, 14, 16, 18, 20]",[],[{'Product Details': 'Coats & Jackets by ASOS ...,['https://images.asos-media.com/products/asos-...
...,...,...,...,...,...,...,...,...,...
30838,122463785,Asos Design,ASOS DESIGN cotton shirred maxi smock dress in...,MULTI,44.00,[8],"[4, 6, 10, 12, 14, 16, 18]",[{'Product Details': 'Dresses by ASOS DESIGNTh...,['https://images.asos-media.com/products/asos-...
30839,120479167,Asos Design,ASOS DESIGN satin midi dress with cowl neck an...,CHOCOLATE,55.00,[8],"[4, 6, 10, 12, 14, 16, 18]",[{'Product Details': 'Dresses by ASOS DESIGNAl...,['https://images.asos-media.com/products/asos-...
30841,1444255,Asos Design,ASOS DESIGN long sleeve maxi t-shirt dress in ...,BLACK,24.00,[10],"[4, 6, 8, 12, 14, 16, 18]",[{'Product Details': 'Dress by ASOS DESIGN Act...,['https://images.asos-media.com/products/asos-...
30842,110783769,Asyou,ASYOU layered t-shirt dress with focus graphic...,WASHED BLACK,22.99,[6],"[4, 8, 10, 12, 14, 16, 18, 20, 22, 24]",[{'Product Details': 'Dress by ASYOU Exclusive...,['https://images.asos-media.com/products/asyou...


In [120]:
# Write the export frame to CSV without the index column.
# NOTE: the list-valued size columns are written as their text form
# (e.g. "[4, 6, 8]"), so a consumer has to parse them back into lists.
df_small.to_csv("products_small.csv", index=False)