In [None]:
import pandas as pd
import numpy as np
import random
import re

# Build a small dummy product catalogue from the Online Retail II export:
# one row per StockCode, with a short product name plus synthetic Quantity
# and Vendor columns.

# Tunable settings for this cell.
SEED = 42           # fixed seed so the synthetic columns are identical on every run
TARGET_ROWS = 4932  # maximum number of unique StockCodes kept in df_final

# Step 1: Load dataset
df = pd.read_csv('/content/online_retail_II.csv', encoding='ISO-8859-1')

# Step 2: Filter relevant columns and clean (drop rows with missing values or
# a non-positive price). The trailing .copy() yields an independent frame, so
# the column assignments below cannot emit SettingWithCopyWarning.
df = df[['StockCode', 'Description', 'Price']].dropna()
df = df[df['Price'] > 0].copy()

# Step 3: Extract product name - the first four alphabetic runs of the
# description, title-cased. Digits and punctuation are dropped, so
# "15CM CHRISTMAS GLASS BALL 20 LIGHTS" becomes "Cm Christmas Glass Ball".
df['p_name'] = df['Description'].apply(lambda x: ' '.join(re.findall(r'[A-Za-z]+', x)[:4]).title())

# Step 4: Add dummy Quantity (integers 1..100 inclusive) and Vendor, both
# drawn from a seeded generator so Restart & Run All reproduces the same data.
vendor_list = ['Apple Inc.', 'Samsung Ltd.', 'Sony Corp.', 'LG Electronics', 'Philips', 'Dell Technologies']
rng = np.random.default_rng(SEED)
df['Quantity'] = rng.integers(1, 101, size=len(df))  # upper bound is exclusive
df['Vendor'] = rng.choice(vendor_list, size=len(df))

# Step 5: Keep the first row seen for each StockCode
df_unique_stockcode = df.drop_duplicates(subset='StockCode').reset_index(drop=True)

# Step 6: Optional - reorder columns
column_order = ['StockCode', 'p_name', 'Description', 'Price', 'Quantity', 'Vendor']
df_unique_stockcode = df_unique_stockcode[column_order]

# Step 7: Limit to at most TARGET_ROWS rows (if more exist)
df_final = df_unique_stockcode.head(TARGET_ROWS)

# Sanity check: the key column must be unique before the data is exported.
assert df_final['StockCode'].is_unique, "StockCode must be unique in df_final"

# Step 8: Display result
print(df_final.head())
print(f"Total unique rows: {len(df_final)}")

# Optional: Save
# df_final.to_csv('unique_stockcode_dummy_data.csv', index=False)


  StockCode                          p_name  \
0     85048         Cm Christmas Glass Ball   
1    79323P              Pink Cherry Lights   
2    79323W             White Cherry Lights   
3     22041        Record Frame Single Size   
4     21232  Strawberry Ceramic Trinket Box   

                           Description  Price  Quantity             Vendor  
0  15CM CHRISTMAS GLASS BALL 20 LIGHTS   6.95         1  Dell Technologies  
1                   PINK CHERRY LIGHTS   6.75        22       Samsung Ltd.  
2                  WHITE CHERRY LIGHTS   6.75        21       Samsung Ltd.  
3         RECORD FRAME 7" SINGLE SIZE    2.10        42         Apple Inc.  
4       STRAWBERRY CERAMIC TRINKET BOX   1.25        90       Samsung Ltd.  
Total unique rows: 4932


In [None]:
# Number of distinct StockCode values in the final frame; dropna=False counts
# a missing value as its own entry, exactly like len(Series.unique()).
df_final['StockCode'].nunique(dropna=False)

4932

In [None]:
# Total number of rows in the final frame.
df_final.shape[0]

4932

In [None]:
# Persist the final frame as CSV in the working directory, without the
# positional index column.
df_final.to_csv(
    'final_dummy_data.csv',
    index=False,
)