In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Project-relative locations.
# A notebook kernel has no usable `__file__`, so the project root is taken to be
# the parent of the current working directory.
# NOTE(review): assumes the kernel is started from a direct sub-folder of the
# project root - confirm before running from elsewhere.
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR, TRAIN_TEST_SPLIT_DIR = (
    os.path.join(BASE_DIR, folder_name)
    for folder_name in ("dataset", "train_test_split")
)

In [3]:
# Make sure the train/test split directory is present; an already existing
# directory is accepted as-is (exist_ok=True), so re-running this cell is harmless.
os.makedirs(name=TRAIN_TEST_SPLIT_DIR, exist_ok=True)

In [4]:
# Full paths of the three input CSV files, all located directly under DATA_DIR.
product_csv_path, customer_csv_path, sales_csv_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("product_details.csv", "customer_details.csv", "sales_data.csv")
)

In [5]:
# Load each CSV into its own DataFrame (products, customers, sales - in that order).
products_df, customers_df, sales_df = (
    pd.read_csv(csv_path)
    for csv_path in (product_csv_path, customer_csv_path, sales_csv_path)
)

In [6]:
# Print, for every loaded frame, the number of null entries per column.
missing_value_reports = (
    ("Products DataFrame missing values:\n", products_df),
    ("Customers DataFrame missing values:\n", customers_df),
    ("Sales DataFrame missing values:\n", sales_df),
)
for report_title, report_frame in missing_value_reports:
    print(report_title, report_frame.isnull().sum(), "\n")

Products DataFrame missing values:
 Uniqe Id                     0
Product Name                 0
Brand Name               10002
Asin                     10002
Category                   830
Upc Ean Code              9968
List Price               10002
Selling Price              107
Quantity                 10002
Model Number              1772
About Product              273
Product Specification     1632
Technical Details          790
Shipping Weight           1138
Product Dimensions        9523
Image                        0
Variants                  7524
Sku                      10002
Product Url                  0
Stock                    10002
Product Details          10002
Dimensions               10002
Color                    10002
Ingredients              10002
Direction To Use         10002
Is Amazon Seller             0
Size Quantity Variant    10002
Product Description      10002
dtype: int64 

Customers DataFrame missing values:
 Customer ID               0
Age             

In [7]:
# Remove every sales row that has a null in at least one of the columns listed
# below. The frame is modified in place, so `sales_df` keeps referring to the
# same object afterwards.
required_sales_columns = ['user id', 'product id', 'Interaction type', 'Time stamp']
sales_df.dropna(subset=required_sales_columns, inplace=True)

In [8]:
# Quick visual sanity check: print the first rows of each frame.
frame_previews = (
    ("Products DataFrame:\n", products_df),
    ("Customers DataFrame:\n", customers_df),
    ("Sales DataFrame:\n", sales_df),
)
for preview_title, preview_frame in frame_previews:
    print(preview_title, preview_frame.head(), "\n")

Products DataFrame:
                            Uniqe Id  \
0  4c69b61db1fc16e7013b43fc926e502d   
1  66d49bbed043f5be260fa9f7fbff5957   
2  2c55cae269aebf53838484b0d7dd931a   
3  18018b6bc416dab347b1b7db79994afa   
4  e04b990e95bf73bbe6a3fa09785d7cd0   

                                        Product Name  Brand Name  Asin  \
0  DB Longboards CoreFlex Crossbow 41" Bamboo Fib...         NaN   NaN   
1  Electronic Snap Circuits Mini Kits Classpack, ...         NaN   NaN   
2  3Doodler Create Flexy 3D Printing Filament Ref...         NaN   NaN   
3  Guillow Airplane Design Studio with Travel Cas...         NaN   NaN   
4                   Woodstock- Collage 500 pc Puzzle         NaN   NaN   

                                            Category Upc Ean Code  List Price  \
0  Sports & Outdoors | Outdoor Recreation | Skate...          NaN         NaN   
1  Toys & Games | Learning & Education | Science ...          NaN         NaN   
2          Toys & Games | Arts & Crafts | Craft Kits    

In [9]:
# Convert the 'Time stamp' column to pandas datetimes. `dayfirst=True` makes
# ambiguous dates be read with the day before the month.
# NOTE(review): presumably the raw values are day-first date strings - confirm
# against the CSV.
# Any failure is re-raised as ValueError with a column-specific message; the
# explicit `from e` records the underlying error as the direct cause so the
# traceback shows where parsing actually failed.
try:
    sales_df['Time stamp'] = pd.to_datetime(sales_df['Time stamp'], dayfirst=True)
except Exception as e:
    raise ValueError(f"Error converting 'Time stamp' to datetime: {e}") from e

In [10]:
# Load the pickled user-id and product-id mappings stored next to the CSV data.
# NOTE(review): pickle.load can execute arbitrary code while deserialising - only
# run this on .pkl files produced by a trusted step of this project.
user_map_path = os.path.join(DATA_DIR, 'user_id_map.pkl')
with open(user_map_path, 'rb') as f:
    user_id_map = pickle.load(f)

product_map_path = os.path.join(DATA_DIR, 'product_id_map.pkl')
with open(product_map_path, 'rb') as f:
    product_id_map = pickle.load(f)

In [11]:
# Invert each loaded map (its values become the lookup keys) and use the result
# to translate the raw 'user id' / 'product id' columns into the new 'user_id' /
# 'product_id' columns. Entries with no match in the inverted map become NaN.
inverted_user_map = {value: key for key, value in user_id_map.items()}
inverted_product_map = {value: key for key, value in product_id_map.items()}

sales_df['user_id'] = sales_df['user id'].map(inverted_user_map)
sales_df['product_id'] = sales_df['product id'].map(inverted_product_map)

In [12]:
# Numeric code for each interaction label; any label that is not one of these
# three keys ends up as NaN in the new 'interaction_type' column.
interaction_mapping = {
    'view': 1,
    'like': 2,
    'purchase': 3,
}
interaction_labels = sales_df['Interaction type']
sales_df['interaction_type'] = interaction_labels.map(interaction_mapping)

In [13]:
# Split the distinct user ids and the distinct product ids, each on its own,
# into train and test groups (test_size=0.2, fixed random_state for repeatability).
# NOTE(review): the statements here only split the two id lists independently;
# the intended train/test "overlap" is presumably enforced by code that follows -
# confirm.
split_options = dict(test_size=0.2, random_state=42)

unique_users = sales_df['user_id'].unique()
train_users, test_users = train_test_split(unique_users, **split_options)

unique_products = sales_df['product_id'].unique()
train_products, test_products = train_test_split(unique_products, **split_options)