# Machine Learning for E-Commerce User Predictions 
## AWS Platform
### Axel (Titouan) Magret, Gurleen Virk,  Victor Hsu

### AAI 540 Group 3
### October 2025

In [1]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role
import io
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [5]:
# Set S3 configurations
BUCKET_NAME = 'aai540-ecommerce-recommendation-project-group3'  # team bucket
S3_PREFIX = '' 
PROCESSED_PREFIX = 'processed/'

# Dataset file names
RAW_FILES = [
    'oct_2019_sample.csv',
    'nov_2019_sample.csv', 
    'jan_2020_sample.csv',
    'train_user_product_pairs.csv',
    'validation_user_product_pairs.csv'
]

# Initialize AWS services
sagemaker_session = sagemaker.Session()
role = get_execution_role()
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')

print(f"SageMaker Role ARN: {role}")
print(f"Default S3 Bucket: {sagemaker_session.default_bucket()}")
print(f"Our Project Bucket: {BUCKET_NAME}")

SageMaker Role ARN: arn:aws:iam::654654600015:role/LabRole
Default S3 Bucket: sagemaker-us-east-1-654654600015
Our Project Bucket: aai540-ecommerce-recommendation-project-group3


In [6]:
# Verify S3 files all exist
def verify_s3_files(bucket_name=BUCKET_NAME, s3_prefix=S3_PREFIX):
    print("VERIFYING S3 FILES")
    print(f"\nBucket: {bucket_name}")
    print(f"Prefix: {s3_prefix if s3_prefix else '(root)'}")
    
    print("\nChecking for required files...")
    
    all_files_exist = True
    for filename in RAW_FILES:
        s3_key = f"{s3_prefix}{filename}" if s3_prefix else filename
        
        try:
            s3_client.head_object(Bucket=bucket_name, Key=s3_key)
            print(f"  ✓ {filename}")
            
            # Get file size
            response = s3_client.head_object(Bucket=bucket_name, Key=s3_key)
            size_mb = response['ContentLength'] / (1024 * 1024)
            print(f"    Location: s3://{bucket_name}/{s3_key}")
            print(f"    Size: {size_mb:.2f} MB")
            
        except Exception as e:
            print(f"  ✗ {filename} - NOT FOUND")
            print(f"    Expected location: s3://{bucket_name}/{s3_key}")
            all_files_exist = False
    
    if all_files_exist:
        print("\n✓ All files found in S3!")
    else:
        print("\n✗ Some files are missing. Please check:")
        print(f"   1. Bucket name: {bucket_name}")
        print(f"   2. File names match exactly (case-sensitive)")
        print(f"   3. Files are in correct location")
        
        # List what's in the bucket
        print("\nFiles currently in bucket:")
        try:
            response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=s3_prefix)
            if 'Contents' in response:
                for obj in response['Contents']:
                    print(f"  - {obj['Key']}")
            else:
                print("  (bucket is empty)")
        except Exception as e:
            print(f"  Error listing bucket: {str(e)}")
    
    return all_files_exist

In [7]:
# Load data from S3 bucket
def load_data_from_s3(bucket_name=BUCKET_NAME, s3_prefix=S3_PREFIX):
    print("="*90)
    print("LOADING DATA FROM S3")
    
    # Load training data
    train_filename = 'train_user_product_pairs.csv'
    train_key = f"{s3_prefix}{train_filename}" if s3_prefix else train_filename
    print(f"\nLoading training data from s3://{bucket_name}/{train_key}")
    
    try:
        obj = s3_client.get_object(Bucket=bucket_name, Key=train_key)
        df_train = pd.read_csv(io.BytesIO(obj['Body'].read()))
        print(f"✓ Training data loaded: {df_train.shape}")
    except Exception as e:
        print(f"✗ Error loading training data: {str(e)}")
        print("\nTroubleshooting:")
        print("1. Check if bucket name is correct")
        print("2. Verify files were uploaded to S3")
        print("3. Check S3 permissions")
        print("4. Verify file name matches exactly (case-sensitive)")
        return None, None
    
    # Load validation data
    val_filename = 'validation_user_product_pairs.csv'
    val_key = f"{s3_prefix}{val_filename}" if s3_prefix else val_filename
    print(f"\nLoading validation data from s3://{bucket_name}/{val_key}")
    
    try:
        obj = s3_client.get_object(Bucket=bucket_name, Key=val_key)
        df_val = pd.read_csv(io.BytesIO(obj['Body'].read()))
        print(f"✓ Validation data loaded: {df_val.shape}")
    except Exception as e:
        print(f"✗ Error loading validation data: {str(e)}")
        return df_train, None
    
    # Display basic info
    print("\n" + "-"*80)
    print("DATASET SUMMARY")
    print("-"*80)
    
    print(f"\nTraining data:")
    print(f"  Shape: {df_train.shape}")
    print(f"  Columns: {df_train.columns.tolist()}")
    print(f"  Purchase rate: {df_train['purchased'].mean()*100:.2f}%")
    print(f"  Missing values: {df_train.isnull().sum().sum()}")
    
    print(f"\nValidation data:")
    print(f"  Shape: {df_val.shape}")
    print(f"  Purchase rate: {df_val['purchased'].mean()*100:.2f}%")
    print(f"  Missing values: {df_val.isnull().sum().sum()}")
    
    print(f"\nFirst few rows of training data:")
    print(df_train.head())
    
    return df_train, df_val

In [8]:
# Handle missing values in categorical and numeric columns
def handle_missing_values(df_train, df_val):
    print("HANDLING MISSING VALUES")
    print("="*80)
    
    # Make copies
    df_train = df_train.copy()
    df_val = df_val.copy()
    
    print("\nMissing values BEFORE handling:")
    print("\nTraining set:")
    missing_train = df_train.isnull().sum()
    print(missing_train[missing_train > 0])
    
    print("\nValidation set:")
    missing_val = df_val.isnull().sum()
    print(missing_val[missing_val > 0])
    
    # Handle categorical columns
    categorical_cols = ['category_code', 'brand']
    
    for col in categorical_cols:
        if col in df_train.columns:
            train_missing = df_train[col].isnull().sum()
            val_missing = df_val[col].isnull().sum()
            
            # Fill with 'unknown'
            df_train[col] = df_train[col].fillna('unknown')
            df_val[col] = df_val[col].fillna('unknown')
            
            print(f"\n✓ {col}:")
            print(f"    Training: Filled {train_missing:,} ({train_missing/len(df_train)*100:.1f}%)")
            print(f"    Validation: Filled {val_missing:,} ({val_missing/len(df_val)*100:.1f}%)")
    
    # Handle numeric columns (if any remaining nulls)
    numeric_cols = df_train.select_dtypes(include=[np.number]).columns
    
    for col in numeric_cols:
        if df_train[col].isnull().sum() > 0:
            median_val = df_train[col].median()
            df_train[col] = df_train[col].fillna(median_val)
            df_val[col] = df_val[col].fillna(median_val)
            print(f"\n✓ {col}: Filled with median ({median_val:.2f})")
    
    print("\n" + "-"*80)
    print("Missing values AFTER handling:")
    print(f"  Training: {df_train.isnull().sum().sum()} total")
    print(f"  Validation: {df_val.isnull().sum().sum()} total")
    print("✓ All missing values handled!")
    
    return df_train, df_val

In [9]:
# Encode categorical features using Label Encoding
def encode_categorical_features(df_train, df_val):
    print("\n" + "="*80)
    print("ENCODING CATEGORICAL FEATURES")
    print("Method: Label Encoding")
    
    df_train = df_train.copy()
    df_val = df_val.copy()
    
    categorical_cols = ['category_code', 'brand']
    label_encoders = {}
    
    for col in categorical_cols:
        if col in df_train.columns:
            print(f"\n{col}:")
            print(f"  Unique categories in training: {df_train[col].nunique():,}")
            print(f"  Top 5 categories: {df_train[col].value_counts().head().to_dict()}")
            
            # Initialize and fit encoder
            le = LabelEncoder()
            le.fit(df_train[col])
            
            # Transform training data
            df_train[col + '_encoded'] = le.transform(df_train[col])
            
            # Transform validation data (handle unseen categories)
            df_val[col + '_encoded'] = df_val[col].apply(
                lambda x: le.transform([x])[0] if x in le.classes_ else -1
            )
            
            unseen_count = (df_val[col + '_encoded'] == -1).sum()
            print(f"  Encoded range: 0 to {df_train[col + '_encoded'].max()}")
            print(f"  Unseen categories in validation: {unseen_count} ({unseen_count/len(df_val)*100:.1f}%)")
            
            # Store encoder
            label_encoders[col] = le
    
    print("\n✓ Categorical encoding complete!")
    
    return df_train, df_val, label_encoders

In [10]:
# Select and prepare final features for model training
def prepare_modeling_features(df_train, df_val):
    print("\n" + "="*80)
    print("PREPARING FEATURES FOR MODELING")
    print("="*80)
    
    # Define feature columns
    feature_cols = [
        # User-product interaction features
        'view_count',
        'cart_count',
        'total_interactions',
        
        # Product features
        'price',
        'category_id',
        'category_code_encoded',
        'brand_encoded',
        'product_view_count',
        'product_purchase_count',
        'product_conversion_rate',
        
        # User behavior features
        'user_total_events',
        'user_total_purchases'
    ]
    
    # Verify all features exist
    missing_features = [col for col in feature_cols if col not in df_train.columns]
    if missing_features:
        print(f"⚠ Warning: Missing features: {missing_features}")
        feature_cols = [col for col in feature_cols if col in df_train.columns]
    
    print(f"\nSelected {len(feature_cols)} features:")
    for i, col in enumerate(feature_cols, 1):
        print(f"  {i:2d}. {col}")
    
    # Extract features and target
    X_train = df_train[feature_cols].copy()
    y_train = df_train['purchased'].copy()
    
    X_val = df_val[feature_cols].copy()
    y_val = df_val['purchased'].copy()
    
    # Final data quality checks
    print("\n" + "-"*80)
    print("DATA QUALITY CHECKS")
    print("-"*80)
    
    print(f"\nShapes:")
    print(f"  X_train: {X_train.shape}")
    print(f"  y_train: {y_train.shape}")
    print(f"  X_val: {X_val.shape}")
    print(f"  y_val: {y_val.shape}")
    
    print(f"\nClass Distribution:")
    print(f"  Training:")
    print(f"    Purchased (1): {y_train.sum():,} ({y_train.mean()*100:.2f}%)")
    print(f"    Not Purchased (0): {(~y_train.astype(bool)).sum():,} ({(1-y_train.mean())*100:.2f}%)")
    print(f"  Validation:")
    print(f"    Purchased (1): {y_val.sum():,} ({y_val.mean()*100:.2f}%)")
    print(f"    Not Purchased (0): {(~y_val.astype(bool)).sum():,} ({(1-y_val.mean())*100:.2f}%)")
    
    print(f"\nData Quality:")
    print(f"  Missing values in X_train: {X_train.isnull().sum().sum()}")
    print(f"  Missing values in X_val: {X_val.isnull().sum().sum()}")
    print(f"  Infinite values in X_train: {np.isinf(X_train.values).sum()}")
    print(f"  Infinite values in X_val: {np.isinf(X_val.values).sum()}")
    
    if X_train.isnull().sum().sum() == 0 and np.isinf(X_train.values).sum() == 0:
        print("\n✓ Data quality checks passed!")
    else:
        print("\n⚠ Warning: Data quality issues detected!")
    
    return X_train, y_train, X_val, y_val, feature_cols

In [11]:
# Save processed data to S3 bucket for team access and model training
def save_processed_data_to_s3(X_train, y_train, X_val, y_val, 
                               bucket_name=BUCKET_NAME, 
                               s3_prefix=PROCESSED_PREFIX):
    print("SAVING PROCESSED DATA TO S3")
    print("="*80)
    
    # Combine features and target
    train_processed = X_train.copy()
    train_processed['purchased'] = y_train.values
    
    val_processed = X_val.copy()
    val_processed['purchased'] = y_val.values
    
    # Save to S3
    datasets = {
        'train_processed.csv': train_processed,
        'validation_processed.csv': val_processed
    }
    
    for filename, df in datasets.items():
        s3_key = f"{s3_prefix}{filename}" if s3_prefix else filename
        
        print(f"\nSaving {filename}...")
        print(f"  Shape: {df.shape}")
        print(f"  S3 location: s3://{bucket_name}/{s3_key}")
        
        try:
            # Convert to CSV in memory
            csv_buffer = io.StringIO()
            df.to_csv(csv_buffer, index=False)
            
            # Upload to S3
            s3_client.put_object(
                Bucket=bucket_name,
                Key=s3_key,
                Body=csv_buffer.getvalue()
            )
            
            print(f"  ✓ Saved!")
            
        except Exception as e:
            print(f"  ✗ Error: {str(e)}")
    
    print("\n✓ All processed data saved to S3!")
    print(f"\nYour team can now access processed data from:")
    print(f"  s3://{bucket_name}/{s3_prefix}")

In [12]:
# Complete data processing pipeline
def run_complete_pipeline():
    print("\n" + "="*80)
    print("COMPLETE DATA PREPROCESSING PIPELINE")
    print(f"\nBucket: {BUCKET_NAME}")
    print(f"Region: {sagemaker_session.boto_region_name}")
    
    # Step 0: Verify S3 files exist
    print("\n>>> STEP 0: Verifying S3 files")
    files_exist = verify_s3_files(BUCKET_NAME, S3_PREFIX)
    
    if not files_exist:
        print("\n✗ Cannot proceed - required files not found in S3")
        print("\nPlease ensure all files are uploaded to:")
        print(f"  s3://{BUCKET_NAME}/{S3_PREFIX if S3_PREFIX else '(root)'}")
        return None
    
    # Step 1: Load data from S3
    print("\n>>> STEP 1: Loading data from S3")
    df_train, df_val = load_data_from_s3(BUCKET_NAME, S3_PREFIX)
    
    if df_train is None or df_val is None:
        print("\n✗ Failed to load data. Please check S3 setup.")
        return None
    
    # Step 2: Handle missing values
    print("\n>>> STEP 2: Handling missing values")
    df_train, df_val = handle_missing_values(df_train, df_val)
    
    # Step 3: Encode categorical features
    print("\n>>> STEP 3: Encoding categorical features")
    df_train, df_val, encoders = encode_categorical_features(df_train, df_val)
    
    # Step 4: Prepare features
    print("\n>>> STEP 4: Preparing features for modeling")
    X_train, y_train, X_val, y_val, features = prepare_modeling_features(df_train, df_val)
    
    # Step 5: Save processed data
    print("\n>>> STEP 5: Saving processed data to S3")
    save_processed_data_to_s3(X_train, y_train, X_val, y_val, BUCKET_NAME, PROCESSED_PREFIX)
    
    # Summary
    print("\n" + "="*80)
    print("PIPELINE COMPLETE! ✓")
    print("="*80)
    
    print("\n Data Ready for Model Training:")
    print(f"  Training samples: {len(X_train):,}")
    print(f"  Validation samples: {len(X_val):,}")
    print(f"  Features: {len(features)}")
    print(f"  Target: 'purchased' (binary: 0/1)")
    
    print("\n Processed data saved to:")
    print(f"  s3://{BUCKET_NAME}/{PROCESSED_PREFIX}train_processed.csv")
    print(f"  s3://{BUCKET_NAME}/{PROCESSED_PREFIX}validation_processed.csv")

    return X_train, y_train, X_val, y_val, features, encoders

In [None]:
# Execute
X_train, y_train, X_val, y_val, features, encoders = run_complete_pipeline()

# Display final data
if X_train is not None:
    print("\n" + "="*80)
    print("FINAL PREPROCESSED DATA")
    print("="*80)
    print(f"\nX_train shape: {X_train.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"\nFeatures: {features}")
    print(f"\nSample of training data:")
    print(X_train.head())
    print(f"\nTarget distribution:")
    print(y_train.value_counts())


COMPLETE DATA PREPROCESSING PIPELINE

Bucket: aai540-ecommerce-recommendation-project-group3
Region: us-east-1

>>> STEP 0: Verifying S3 files
VERIFYING S3 FILES

Bucket: aai540-ecommerce-recommendation-project-group3
Prefix: (root)

Checking for required files...
  ✓ oct_2019_sample.csv
    Location: s3://aai540-ecommerce-recommendation-project-group3/oct_2019_sample.csv
    Size: 151.36 MB
  ✓ nov_2019_sample.csv
    Location: s3://aai540-ecommerce-recommendation-project-group3/nov_2019_sample.csv
    Size: 262.94 MB
  ✓ jan_2020_sample.csv
    Location: s3://aai540-ecommerce-recommendation-project-group3/jan_2020_sample.csv
    Size: 241.01 MB
  ✓ train_user_product_pairs.csv
    Location: s3://aai540-ecommerce-recommendation-project-group3/train_user_product_pairs.csv
    Size: 297.99 MB
  ✓ validation_user_product_pairs.csv
    Location: s3://aai540-ecommerce-recommendation-project-group3/validation_user_product_pairs.csv
    Size: 160.76 MB

✓ All files found in S3!

>>> STEP 1: