## Generate datasets

In [88]:
# Import libraries
import pandas as pd
import numpy as np
import random
from faker import Faker  # Faker is a library for generating fake data
from datetime import datetime, timedelta

# Seed every random source used by this notebook so that a fresh
# "Restart Kernel -> Run All" regenerates the same datasets.
SEED = 42
random.seed(SEED)  # stdlib generator behind random.randint / uniform / choices
Faker.seed(SEED)   # generator shared by Faker instances

# Initialize Faker to generate fake data
fake = Faker()

In [89]:
# Build the Customer Interactions table: one row per customer.
# Customer IDs are the consecutive integers 1 through 100.
customer_ids = [*range(1, 101)]
# One page-view count per customer, each an integer in [1, 50].
page_views = [random.randint(1, 50) for _ in customer_ids]
# One time-on-site value (minutes) per customer, each an integer in [1, 60].
time_spent = [random.randint(1, 60) for _ in customer_ids]

# Map each column label to its list of values, then assemble the DataFrame.
interaction_columns = {
    'Customer ID': customer_ids,
    'Page views': page_views,
    'Time spent on website': time_spent,
}
customer_interactions_df = pd.DataFrame(interaction_columns)

customer_interactions_df

Unnamed: 0,Customer ID,Page views,Time spent on website
0,1,18,19
1,2,48,9
2,3,8,43
3,4,11,15
4,5,5,25
...,...,...,...
95,96,23,14
96,97,3,2
97,98,41,44
98,99,19,51


In [95]:
# Build the Purchase History table: 1000 purchases made by random customers.
# Draw 1000 random purchase dates between one month ago and today.
# NOTE: '-1M' is Faker's relative-date shorthand for one month back, so the
# window is the last month, not the last year.
purchase_dates = [
    fake.date_between(start_date='-1M', end_date='today')
    for _ in range(1000)
]

# Pick (with replacement) the customer behind each of the 1000 purchases.
purchase_customer_ids = random.choices(customer_ids, k=1000)
# Pick a product ID in [1, 20] for each of the 1000 purchases.
purchase_product_ids = [random.randint(1, 20) for _ in range(1000)]

# Assemble the three equally long columns into the purchase history DataFrame.
purchase_history_df = pd.DataFrame({
    'Customer ID': purchase_customer_ids,
    'Product ID': purchase_product_ids,
    'Purchase date': purchase_dates,
})

purchase_history_df

Unnamed: 0,Customer ID,Product ID,Purchase date
0,7,6,2024-01-18
1,25,12,2024-02-10
2,89,14,2024-02-09
3,27,8,2024-01-22
4,5,18,2024-01-24
...,...,...,...
995,51,5,2024-02-14
996,45,20,2024-01-20
997,94,8,2024-02-10
998,28,14,2024-02-04


In [4]:
# Build the Product Details table: one row per product.
# Product IDs are the consecutive integers 1 through 20.
product_ids = [*range(1, 21)]
# The categories a product can be assigned to.
categories = ['Electronics', 'Clothing', 'Home & Kitchen', 'Books']
# One price per product: uniform draw from [10, 500], rounded to 2 decimals.
prices = [round(random.uniform(10, 500), 2) for _ in product_ids]
# One rating per product: uniform draw from [1, 5], rounded to 1 decimal.
ratings = [round(random.uniform(1, 5), 1) for _ in product_ids]
# One category per product, sampled with replacement (drawn after prices and
# ratings, which keeps the sequence of random draws unchanged).
product_categories = random.choices(categories, k=20)

# Assemble the columns into the product details DataFrame.
product_details_df = pd.DataFrame({
    'Product ID': product_ids,
    'Category': product_categories,
    'Price': prices,
    'Ratings': ratings,
})

product_details_df

Unnamed: 0,Product ID,Category,Price,Ratings
0,1,Electronics,210.32,4.6
1,2,Clothing,228.36,4.7
2,3,Books,399.18,3.9
3,4,Books,237.73,1.1
4,5,Electronics,492.34,4.0
5,6,Books,370.96,4.5
6,7,Books,339.5,4.0
7,8,Books,346.56,4.4
8,9,Home & Kitchen,340.86,4.4
9,10,Clothing,466.81,3.1


## Store the Datasets

In [91]:
from pathlib import Path

# Absolute path of the directory one level above the current working directory
# ('.' is resolved against the working directory the kernel runs in).
PARENT_PATH = Path('.').resolve().parent

# The datasets live in the 'dataset' folder directly under that parent directory.
DATASET_PATH = PARENT_PATH.joinpath('dataset')

In [96]:
# Create the output directory if it is missing; to_csv does not create parent
# directories and would otherwise raise on a fresh checkout.
DATASET_PATH.mkdir(parents=True, exist_ok=True)

# Write each generated dataset to its own CSV file; index=False leaves the
# DataFrame's row index out of the file.
customer_interactions_df.to_csv(DATASET_PATH / 'customer_interactions.csv', index=False)
purchase_history_df.to_csv(DATASET_PATH / 'purchase_history.csv', index=False)
product_details_df.to_csv(DATASET_PATH / 'product_details.csv', index=False)