# Feature engineering
This notebook will focus on the following:

- Spliting train/test dataset:
- Creating our target feature (Customer Lifetime value)
- Engineering RMF (recency, monitory, frequency based features)
- Experimenting with some automated feature creatations - (identify important polynomial or interactions)


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df_clean = pd.read_csv('/Users/ryanmurray/Desktop/Project_Folder/CLV_Project/CLV-MLOPs-Project/data/processed/cleaned_online_retail.csv')

In [13]:

# ============================================
# DATASET 1: TRAINING SET
# ============================================
print("\n1. TRAINING SET")
print("-"*60)

TRAIN_FEATURE_START = '2010-12-01'
TRAIN_FEATURE_END = '2011-08-31'
TRAIN_TARGET_START = '2011-09-01'
TRAIN_TARGET_END = '2011-11-30'

print(f"Feature period: {TRAIN_FEATURE_START} to {TRAIN_FEATURE_END}")
print(f"Target period:  {TRAIN_TARGET_START} to {TRAIN_TARGET_END}")

# Split data
train_feature_df = df_clean[
    (df_clean['InvoiceDate'] >= TRAIN_FEATURE_START) & 
    (df_clean['InvoiceDate'] <= TRAIN_FEATURE_END)
].copy()

train_target_df = df_clean[
    (df_clean['InvoiceDate'] >= TRAIN_TARGET_START) & 
    (df_clean['InvoiceDate'] <= TRAIN_TARGET_END)
].copy()

# Get active customers
train_customers = train_feature_df['CustomerID'].unique()

# Calculate CLV
train_clv = train_target_df.groupby('CustomerID')['Revenue'].sum().reset_index()
train_clv.columns = ['CustomerID', 'CLV_Target']

# Create dataset
train_data = pd.DataFrame({'CustomerID': train_customers})
train_data = train_data.merge(train_clv, on='CustomerID', how='left')
train_data['CLV_Target'] = train_data['CLV_Target'].fillna(0)

print(f"Active customers: {len(train_data)}")
print(f"Active rate: {(train_data['CLV_Target'] > 0).mean()*100:.1f}%")

# Save
train_feature_df.to_csv('../data/processed/train_feature_transactions.csv', index=False)
train_data.to_csv('../data/processed/train_clv_target.csv', index=False)

# ============================================
# DATASET 2: VALIDATION SET
# ============================================
print("\n2. VALIDATION SET")
print("-"*60)

VAL_FEATURE_START = '2010-12-01'
VAL_FEATURE_END = '2011-05-31'
VAL_TARGET_START = '2011-06-01'
VAL_TARGET_END = '2011-08-31'

print(f"Feature period: {VAL_FEATURE_START} to {VAL_FEATURE_END}")
print(f"Target period:  {VAL_TARGET_START} to {VAL_TARGET_END}")

# Split data
val_feature_df = df_clean[
    (df_clean['InvoiceDate'] >= VAL_FEATURE_START) & 
    (df_clean['InvoiceDate'] <= VAL_FEATURE_END)
].copy()

val_target_df = df_clean[
    (df_clean['InvoiceDate'] >= VAL_TARGET_START) & 
    (df_clean['InvoiceDate'] <= VAL_TARGET_END)
].copy()

# Get active customers
val_customers = val_feature_df['CustomerID'].unique()

# Calculate CLV
val_clv = val_target_df.groupby('CustomerID')['Revenue'].sum().reset_index()
val_clv.columns = ['CustomerID', 'CLV_Target']

# Create dataset
val_data = pd.DataFrame({'CustomerID': val_customers})
val_data = val_data.merge(val_clv, on='CustomerID', how='left')
val_data['CLV_Target'] = val_data['CLV_Target'].fillna(0)

print(f"Active customers: {len(val_data)}")
print(f"Active rate: {(val_data['CLV_Target'] > 0).mean()*100:.1f}%")

# Save
val_feature_df.to_csv('../data/processed/val_feature_transactions.csv', index=False)
val_data.to_csv('../data/processed/val_clv_target.csv', index=False)

# ============================================
# DATASET 3: TEST SET
# ============================================
print("\n3. TEST SET")
print("-"*60)

TEST_FEATURE_START = '2010-12-01'
TEST_FEATURE_END = '2011-09-30'
TEST_TARGET_START = '2011-10-01'
TEST_TARGET_END = '2011-12-31'

print(f"Feature period: {TEST_FEATURE_START} to {TEST_FEATURE_END}")
print(f"Target period:  {TEST_TARGET_START} to {TEST_TARGET_END}")

# Split data
test_feature_df = df_clean[
    (df_clean['InvoiceDate'] >= TEST_FEATURE_START) & 
    (df_clean['InvoiceDate'] <= TEST_FEATURE_END)
].copy()

test_target_df = df_clean[
    (df_clean['InvoiceDate'] >= TEST_TARGET_START) & 
    (df_clean['InvoiceDate'] <= TEST_TARGET_END)
].copy()

# Get active customers
test_customers = test_feature_df['CustomerID'].unique()

# Calculate CLV
test_clv = test_target_df.groupby('CustomerID')['Revenue'].sum().reset_index()
test_clv.columns = ['CustomerID', 'CLV_Target']

# Create dataset
test_data = pd.DataFrame({'CustomerID': test_customers})
test_data = test_data.merge(test_clv, on='CustomerID', how='left')
test_data['CLV_Target'] = test_data['CLV_Target'].fillna(0)

print(f"Active customers: {len(test_data)}")
print(f"Active rate: {(test_data['CLV_Target'] > 0).mean()*100:.1f}%")

# Save
test_feature_df.to_csv('../data/processed/test_feature_transactions.csv', index=False)
test_data.to_csv('../data/processed/test_clv_target.csv', index=False)

# ============================================
# SUMMARY
# ============================================
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"Training set:   {len(train_data):,} customers")
print(f"Validation set: {len(val_data):,} customers")
print(f"Test set:       {len(test_data):,} customers")

print("\n" + "="*60)
print("SAVED FILES")
print("="*60)
print("Training:")
print("  - data/processed/train_feature_transactions.csv")
print("  - data/processed/train_clv_target.csv")
print("\nValidation:")
print("  - data/processed/val_feature_transactions.csv")
print("  - data/processed/val_clv_target.csv")
print("\nTest:")
print("  - data/processed/test_feature_transactions.csv")
print("  - data/processed/test_clv_target.csv")

print("\n" + "="*60)
print("NEXT STEPS")
print("="*60)
print("1. Create features using train_feature_transactions.csv")
print("2. Apply SAME feature engineering to val and test")
print("3. Train models on training set")
print("4. Tune on validation set")
print("5. Final evaluation on test set")



## Your File Structure

# After running this, you'll have:
# ```
# data/processed/
# ├── cleaned_data.csv                      # All transactions
# │
# ├── train_feature_transactions.csv        # Dec 2010 - Aug 2011
# ├── train_clv_target.csv                  # CustomerID + CLV (Sep-Nov 2011)
# │
# ├── val_feature_transactions.csv          # Dec 2010 - May 2011
# ├── val_clv_target.csv                    # CustomerID + CLV (Jun-Aug 2011)
# │
# ├── test_feature_transactions.csv         # Dec 2010 - Sep 2011
# └── test_clv_target.csv                   # CustomerID + CLV (Oct-Dec 2011)


1. TRAINING SET
------------------------------------------------------------
Feature period: 2010-12-01 to 2011-08-31
Target period:  2011-09-01 to 2011-11-30
Active customers: 3292
Active rate: 56.1%

2. VALIDATION SET
------------------------------------------------------------
Feature period: 2010-12-01 to 2011-05-31
Target period:  2011-06-01 to 2011-08-31
Active customers: 2689
Active rate: 49.5%

3. TEST SET
------------------------------------------------------------
Feature period: 2010-12-01 to 2011-09-30
Target period:  2011-10-01 to 2011-12-31
Active customers: 3583
Active rate: 51.0%

SUMMARY
Training set:   3,292 customers
Validation set: 2,689 customers
Test set:       3,583 customers

SAVED FILES
Training:
  - data/processed/train_feature_transactions.csv
  - data/processed/train_clv_target.csv

Validation:
  - data/processed/val_feature_transactions.csv
  - data/processed/val_clv_target.csv

Test:
  - data/processed/test_feature_transactions.csv
  - data/processed/test