In [1]:
# # Install libraries
# !pip install sdv

**Data Description**

The dataset consists of three tables with the following fields:

1. Customer Interactions:
   - Customer ID
   - Page views
   - Time spent on the website

2. Purchase History:
   - Customer ID
   - Product ID
   - Purchase date

3. Product Details:
   - Product ID
   - Category
   - Price
   - Ratings

In [32]:
# Load libraries
import pandas as pd
from sdv.lite import SingleTablePreset
from sdv.metadata import SingleTableMetadata

In [33]:
# Load dataset: read the raw customer-interaction table from CSV
# (default comma separator) and preview its first five rows.
df_customer = pd.read_csv(
    filepath_or_buffer="../dataset/customer_interactions.csv",
)
df_customer.head(n=5)

Unnamed: 0,customer_id,page_views,time_spent
0,1,25,120
1,2,20,90
2,3,30,150
3,4,15,80
4,5,22,110


In [34]:
# Synthetic dataset (customer interactions): auto-detect the column types of
# `df_customer`, then fit an SDV single-table preset ("FAST_ML") on it.
#
# NOTE: this cell rebinds the notebook-level names `metadata` and
# `synthesizer`; the later fit cells reuse the same names, so the customer
# sampling cell has to run before another table is fitted.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_customer)

synthesizer = SingleTablePreset(metadata=metadata, name="FAST_ML")
synthesizer.fit(df_customer)

# Print the detected metadata so the inferred sdtypes can be checked by eye.
# NOTE(review): `customer_id` is detected as 'numerical' rather than as an
# identifier — confirm this is intended.
print(f"[*] Customer Interaction Metadata:\n{metadata.to_dict()}")

[*] Customer Interaction Metadata:
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1', 'columns': {'customer_id': {'sdtype': 'numerical'}, 'page_views': {'sdtype': 'numerical'}, 'time_spent': {'sdtype': 'numerical'}}}


In [35]:
# Draw 1,000 synthetic customer-interaction rows from `synthesizer`
# (expected to be the model fitted on `df_customer` in the previous cell)
# and preview the last five of them.
df_customer_generated = synthesizer.sample(num_rows=1_000)
df_customer_generated.tail(n=5)

Unnamed: 0,customer_id,page_views,time_spent
995,2,19,96
996,3,22,117
997,4,15,80
998,1,23,109
999,1,22,101


In [36]:
# Load dataset: read the semicolon-separated product-details CSV and keep
# only the four columns used below (presumably the file carries extra
# columns — verify against the raw CSV), then preview the first five rows.
df_product = (
    pd.read_csv(filepath_or_buffer="../dataset/product_details.csv", sep=";")
    .loc[:, ["product_id", "category", "price", "ratings"]]
)
df_product.head(n=5)

Unnamed: 0,product_id,category,price,ratings
0,101,Electronics,500,4.5
1,102,Clothing,50,3.8
2,103,Home & Kitchen,200,4.2
3,104,Beauty,30,4.0
4,105,Electronics,800,4.8


In [37]:
# Synthetic dataset (product details): auto-detect the column types of
# `df_product`, then fit an SDV single-table preset ("FAST_ML") on it.
#
# NOTE: this cell rebinds the notebook-level names `metadata` and
# `synthesizer`, replacing whatever an earlier fit cell stored in them.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_product)

synthesizer = SingleTablePreset(metadata=metadata, name="FAST_ML")
synthesizer.fit(df_product)

# Print the detected metadata so the inferred sdtypes can be checked by eye.
# NOTE(review): `product_id` is detected as 'numerical' rather than as an
# identifier — confirm this is intended.
print(f"[*] Product Detail Metadata:\n{metadata.to_dict()}")

[*] Product Detail Metadata:
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1', 'columns': {'product_id': {'sdtype': 'numerical'}, 'category': {'sdtype': 'categorical'}, 'price': {'sdtype': 'numerical'}, 'ratings': {'sdtype': 'numerical'}}}


In [38]:
# Draw 1,000 synthetic product-detail rows from `synthesizer`
# (expected to be the model fitted on `df_product` in the previous cell)
# and preview the last five of them.
df_product_generated = synthesizer.sample(num_rows=1_000)
df_product_generated.tail(n=5)

Unnamed: 0,product_id,category,price,ratings
995,101,Electronics,30,3.8
996,102,Beauty,30,3.8
997,101,Electronics,578,4.646858
998,105,Clothing,537,4.554549
999,105,Electronics,800,4.8


In [39]:
# Load dataset: read the semicolon-separated purchase-history CSV and keep
# only the three columns used below (presumably the file carries extra
# columns — verify against the raw CSV), then preview the first five rows.
df_purchase = (
    pd.read_csv(filepath_or_buffer="../dataset/purchase_history.csv", sep=";")
    .loc[:, ["customer_id", "product_id", "purchase_date"]]
)
df_purchase.head(n=5)

Unnamed: 0,customer_id,product_id,purchase_date
0,1,101,2023-01-01
1,1,105,2023-01-05
2,2,102,2023-01-02
3,3,103,2023-01-03
4,4,104,2023-01-04


In [40]:
# Synthetic dataset (purchase history): auto-detect the column types of
# `df_purchase`, then fit an SDV single-table preset ("FAST_ML") on it.
#
# NOTE: this cell rebinds the notebook-level names `metadata` and
# `synthesizer`, replacing whatever an earlier fit cell stored in them.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_purchase)

synthesizer = SingleTablePreset(metadata=metadata, name="FAST_ML")
synthesizer.fit(df_purchase)

# Print the detected metadata so the inferred sdtypes can be checked by eye.
# NOTE(review): `customer_id` and `product_id` are detected as 'numerical'
# rather than as identifiers — confirm this is intended.
print(f"[*] Purchase History Metadata:\n{metadata.to_dict()}")

[*] Purchase History Metadata:
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1', 'columns': {'customer_id': {'sdtype': 'numerical'}, 'product_id': {'sdtype': 'numerical'}, 'purchase_date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'}}}


In [41]:
# Draw 1,000 synthetic purchase-history rows from `synthesizer`
# (expected to be the model fitted on `df_purchase` in the previous cell)
# and preview the last five of them.
df_purchase_generated = synthesizer.sample(num_rows=1_000)
df_purchase_generated.tail(n=5)

Unnamed: 0,customer_id,product_id,purchase_date
995,4,102,2023-01-04
996,2,102,2023-01-03
997,5,105,2023-01-05
998,4,101,2023-01-03
999,5,101,2023-01-03


In [42]:
# Persist the three synthetic tables as CSV files next to the source data.
# `index=False` keeps the DataFrame row index out of the written files.
df_customer_generated.to_csv(
    path_or_buf="../dataset/customer_interactions_synt.csv",
    index=False,
)
df_product_generated.to_csv(
    path_or_buf="../dataset/product_details_synt.csv",
    index=False,
)
df_purchase_generated.to_csv(
    path_or_buf="../dataset/purchase_history_synt.csv",
    index=False,
)