# ElectroShop Purchase Prediction - EDA & Feature Engineering

**Team Member:** [Your Name]

**Date:** November 13, 2025

## Objective
- Explore the ElectroShop dataset
- Understand purchase patterns
- Create meaningful features to improve prediction
- Reduce marketing costs from €630/day to ≤€200/day while maintaining sales

## 1. Setup & Data Loading

In [5]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

ModuleNotFoundError: No module named 'matplotlib'

In [4]:
# Read the train and test splits from the sibling data directory
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Report (rows, columns) of each split
train_shape, test_shape = train_df.shape, test_df.shape
print(f"Training set shape: {train_shape}")
print(f"Test set shape: {test_shape}")

NameError: name 'pd' is not defined

## 2. Initial Data Exploration

In [None]:
# Preview the first ten training rows
train_df.iloc[:10]

In [None]:
# Column dtypes, non-null counts (a first hint at missing values) and memory usage
train_df.info()

In [None]:
# Summary statistics using describe()'s defaults (numeric columns only for a mixed-dtype frame)
train_df.describe()

In [None]:
# Count and share of missing values per column, worst offenders first
null_counts = train_df.isnull().sum()
missing_df = pd.DataFrame({
    'Missing_Count': null_counts,
    'Percentage': 100 * null_counts / len(train_df),
})
has_missing = missing_df['Missing_Count'] > 0
missing_df[has_missing].sort_values('Missing_Count', ascending=False)

In [None]:
# Fully duplicated rows vs. number of distinct session identifiers
is_duplicate_row = train_df.duplicated()
duplicates = is_duplicate_row.sum()
unique_sessions = train_df['Session_ID'].nunique()
print(f"Number of duplicate rows: {duplicates}")
print(f"Number of unique Session_IDs: {unique_sessions}")

## 3. Target Variable Analysis

In [None]:
# Share of sessions that ended in a purchase, plus the raw class breakdown
target = train_df['Purchase']
purchase_rate = target.mean()
class_counts = target.value_counts()
class_percentages = target.value_counts(normalize=True) * 100

print(f"Overall purchase rate: {purchase_rate:.2%}")
print(f"\nPurchase distribution:")
print(class_counts)
print(f"\nPurchase distribution (%):")
print(class_percentages)

In [None]:
# Visualize target distribution (absolute counts on the left, percentages on the right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# value_counts() orders by frequency, not by class label. Sorting by index
# keeps the hard-coded tick labels and colours attached to the right class
# even if purchases ever outnumber non-purchases.
# NOTE(review): assumes Purchase is coded 0/1 with both classes present.
class_counts = train_df['Purchase'].value_counts().sort_index()
class_shares = train_df['Purchase'].value_counts(normalize=True).sort_index() * 100

panels = [
    (class_counts, 'Purchase Distribution (Count)', 'Count'),
    (class_shares, 'Purchase Distribution (Percentage)', 'Percentage (%)'),
]
for ax, (series, title, y_label) in zip(axes, panels):
    series.plot(kind='bar', ax=ax, color=['#e74c3c', '#2ecc71'])
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Purchase (0=No, 1=Yes)')
    ax.set_ylabel(y_label)
    ax.set_xticklabels(['No Purchase', 'Purchase'], rotation=0)

plt.tight_layout()
plt.show()

# Ratio of non-purchasing to purchasing sessions
print(f"\n⚠️ Class imbalance: {(1 - purchase_rate) / purchase_rate:.1f}:1 ratio (No Purchase : Purchase)")

## 4. Campaign Period Analysis

In [None]:
# Conversion rate (in %) and session count inside vs. outside campaign periods
campaign_purchase = (
    train_df.groupby('Campaign_Period')['Purchase']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Purchase_Rate', 'count': 'Count'})
)
campaign_purchase['Purchase_Rate'] *= 100
print("Purchase rate by Campaign Period:")
print(campaign_purchase)

In [None]:
# Daily purchase rate (in %) and session volume, tagged with that day's campaign flag
daily_stats = train_df.groupby('Day').agg(
    Purchase_Rate=('Purchase', 'mean'),
    Sessions=('Purchase', 'count'),
    Campaign_Period=('Campaign_Period', 'first'),
).reset_index()
daily_stats['Purchase_Rate'] = daily_stats['Purchase_Rate'] * 100

# Two stacked bar charts sharing the same colouring: red bars = campaign days
fig, axes = plt.subplots(2, 1, figsize=(16, 10))
colors = ['#e74c3c' if in_campaign else '#3498db'
          for in_campaign in daily_stats['Campaign_Period']]

panels = [
    # (axis, column, title, y-label, labels for the two shaded campaign windows)
    (axes[0], 'Purchase_Rate', 'Purchase Rate by Day (Campaign Periods Highlighted)',
     'Purchase Rate (%)', ('Campaign 1', 'Campaign 2')),
    (axes[1], 'Sessions', 'Number of Sessions by Day',
     'Number of Sessions', (None, None)),
]
for ax, column, title, y_label, (first_label, second_label) in panels:
    ax.bar(daily_stats['Day'], daily_stats[column], color=colors, alpha=0.7)
    ax.axvspan(25, 50, alpha=0.2, color='red', label=first_label)
    ax.axvspan(75, 90, alpha=0.2, color='red', label=second_label)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Day')
    ax.set_ylabel(y_label)
    ax.grid(True, alpha=0.3)

# Only the top panel carries the campaign legend
axes[0].legend()

plt.tight_layout()
plt.show()

## 5. Univariate Analysis - Numerical Features

In [None]:
# Numerical columns analysed throughout the rest of the notebook
numerical_features = ['Age', 'Reviews_Read', 'Price', 'Discount', 'Items_In_Cart', 
                      'Socioeconomic_Status_Score', 'Engagement_Score']

# One 50-bin histogram per numerical feature on a 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

for ax, feature in zip(axes, numerical_features):
    ax.hist(train_df[feature], bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# The grid has more cells than features; delete the unused axes
for unused_ax in axes[len(numerical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# One box plot per numerical feature (missing values removed first) to spot outliers
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

for ax, feature in zip(axes, numerical_features):
    observed_values = train_df[feature].dropna()
    ax.boxplot(observed_values)
    ax.set_title(f'Box Plot of {feature}', fontsize=12, fontweight='bold')
    ax.set_ylabel(feature)
    ax.grid(True, alpha=0.3)

# Delete the grid cells that have no feature assigned
for unused_ax in axes[len(numerical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 6. Univariate Analysis - Categorical Features

In [None]:
# Columns treated as categorical in the analyses below
categorical_features = ['Gender', 'Category', 'Time_of_Day', 'Email_Interaction', 
                        'Device_Type', 'Payment_Method', 'Referral_Source']

# Print absolute and relative level frequencies for every categorical column
for feature in categorical_features:
    level_counts = train_df[feature].value_counts()
    level_shares = train_df[feature].value_counts(normalize=True) * 100

    print(f"\n{'='*60}")
    print(f"{feature} - Value Counts:")
    print('='*60)
    print(level_counts)
    print(f"\nPercentage distribution:")
    print(level_shares)

In [None]:
# Bar chart of level frequencies for each categorical feature
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

for ax, feature in zip(axes, categorical_features):
    level_counts = train_df[feature].value_counts()
    positions = range(len(level_counts))

    ax.bar(positions, level_counts.values, alpha=0.7)
    ax.set_title(f'Distribution of {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    # Bars sit at integer positions, so label them with the level names
    ax.set_xticks(positions)
    ax.set_xticklabels(level_counts.index, rotation=45, ha='right')
    ax.grid(True, alpha=0.3, axis='y')

# Delete the grid cells that have no feature assigned
for unused_ax in axes[len(categorical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 7. Bivariate Analysis - Features vs Purchase

In [None]:
# Purchase rate by numerical features (binned into up to ten quantile bins)
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    # Keep the bins in a local Series instead of adding and dropping a
    # temporary column on train_df: the shared frame is never mutated, so a
    # failure mid-loop cannot leave a stray '<col>_binned' column behind.
    quantile_bins = pd.qcut(train_df[col], q=10, duplicates='drop')

    # observed=False pins the current behaviour for categorical groupers
    # (the pandas default for this argument is scheduled to change).
    purchase_by_bin = train_df.groupby(quantile_bins, observed=False)['Purchase'].mean() * 100

    purchase_by_bin.plot(kind='bar', ax=ax, alpha=0.7)
    ax.set_title(f'Purchase Rate by {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Purchase Rate (%)')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3, axis='y')
    # Reference line: overall purchase rate across all sessions
    ax.axhline(y=purchase_rate*100, color='r', linestyle='--', label='Overall Avg')
    ax.legend()

# Delete the grid cells that have no feature assigned
for unused_ax in axes[len(numerical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Purchase rate (in %) per level of each categorical feature, against the overall rate
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

overall_rate_pct = purchase_rate*100
for ax, feature in zip(axes, categorical_features):
    rate_by_level = train_df.groupby(feature)['Purchase'].mean() * 100
    rate_by_level.plot(kind='bar', ax=ax, alpha=0.7)
    ax.set_title(f'Purchase Rate by {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Purchase Rate (%)')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3, axis='y')
    ax.axhline(y=overall_rate_pct, color='r', linestyle='--', label='Overall Avg')
    ax.legend()

# Delete the grid cells that have no feature assigned
for unused_ax in axes[len(categorical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Chi-square test of independence between each categorical feature and Purchase
from scipy.stats import chi2_contingency

print("Chi-square test results for categorical features:")
print("="*70)

for feature in categorical_features:
    # Observed level-by-outcome frequency table
    observed_table = pd.crosstab(train_df[feature], train_df['Purchase'])
    chi2, p_value, _dof, _expected = chi2_contingency(observed_table)
    verdict = 'Yes' if p_value < 0.05 else 'No'

    print(f"\n{feature}:")
    print(f"  Chi-square statistic: {chi2:.4f}")
    print(f"  P-value: {p_value:.4e}")
    print(f"  Significant: {verdict}")

## 8. Correlation Analysis

In [None]:
# Correlation matrix for the numerical features plus the target and a few coded columns
correlation_features = numerical_features + ['Purchase', 'Gender', 'Email_Interaction', 'Category']

# numeric_only=True: 'Category' (like 'Gender' and 'Email_Interaction') is also
# listed among the categorical features. If any of these holds text labels,
# pandas >= 2.0 raises instead of skipping the column; with this flag such
# columns are simply left out of the matrix.
corr_matrix = train_df[correlation_features].corr(numeric_only=True)

# Visualize as an annotated heatmap centred on zero
plt.figure(figsize=(14, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Rank every column of the correlation matrix by its correlation with Purchase
purchase_column = corr_matrix['Purchase']
target_corr = purchase_column.sort_values(ascending=False)
print("Correlation with Purchase:")
print("="*40)
print(target_corr)

In [None]:
# Horizontal bar chart of feature/target correlations (the target's own entry removed)
fig, ax = plt.subplots(figsize=(10, 8))
feature_corr = target_corr.drop('Purchase')
feature_corr.plot(kind='barh', alpha=0.7, ax=ax)
ax.set_title('Feature Correlation with Purchase', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Features')
ax.grid(True, alpha=0.3, axis='x')
# Zero line separates positive from negative correlations
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
fig.tight_layout()
plt.show()

## 9. Price & Discount Analysis

In [None]:
# Effective price (price after discount), computed on a scratch copy.
# The helper column must not leak into train_df: the pipeline later compares
# engineered columns against train_df.columns, so a leftover 'Effective_Price'
# would be missing from the "new features" count and list.
price_df = train_df.copy()
price_df['Effective_Price'] = price_df['Price'] * (1 - price_df['Discount'] / 100)

# Row masks for the two outcomes, drawn in this order on every overlay plot
outcome_masks = [
    ('No Purchase', price_df['Purchase'] == 0),
    ('Purchase', price_df['Purchase'] == 1),
]

# Analyze price and discount effects
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Top row: overlaid histograms of price, discount and effective price by outcome
histogram_panels = [
    ('Price', 'Price Distribution by Purchase', 'Price (€)'),
    ('Discount', 'Discount Distribution by Purchase', 'Discount (%)'),
    ('Effective_Price', 'Effective Price Distribution by Purchase', 'Effective Price (€)'),
]
for ax, (column, title, x_label) in zip(axes[0], histogram_panels):
    for outcome_label, mask in outcome_masks:
        price_df.loc[mask, column].hist(bins=50, alpha=0.5, label=outcome_label, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.legend()

# Scatter plot of price against discount, one colour per outcome
for outcome_label, mask in outcome_masks:
    axes[1,0].scatter(price_df.loc[mask, 'Price'],
                      price_df.loc[mask, 'Discount'],
                      alpha=0.3, label=outcome_label, s=10)
axes[1,0].set_title('Price vs Discount')
axes[1,0].set_xlabel('Price (€)')
axes[1,0].set_ylabel('Discount (%)')
axes[1,0].legend()

# Purchase rate by discount brackets.
# include_lowest=True keeps sessions with a 0% discount in the first bracket;
# without it they fall outside (0, 10] and silently vanish from the chart.
discount_bracket = pd.cut(price_df['Discount'], bins=[0, 10, 20, 30, 40, 100],
                          include_lowest=True).rename('Discount_Bracket')
discount_purchase = price_df.groupby(discount_bracket, observed=False)['Purchase'].mean() * 100
discount_purchase.plot(kind='bar', ax=axes[1,1], alpha=0.7)
axes[1,1].set_title('Purchase Rate by Discount Bracket')
axes[1,1].set_ylabel('Purchase Rate (%)')
axes[1,1].tick_params(axis='x', rotation=45)
axes[1,1].axhline(y=purchase_rate*100, color='r', linestyle='--')

# Purchase rate by price brackets (quintiles)
price_bracket = pd.qcut(price_df['Price'], q=5, duplicates='drop').rename('Price_Bracket')
price_purchase = price_df.groupby(price_bracket, observed=False)['Purchase'].mean() * 100
price_purchase.plot(kind='bar', ax=axes[1,2], alpha=0.7)
axes[1,2].set_title('Purchase Rate by Price Bracket')
axes[1,2].set_ylabel('Purchase Rate (%)')
axes[1,2].tick_params(axis='x', rotation=45)
axes[1,2].axhline(y=purchase_rate*100, color='r', linestyle='--')

plt.tight_layout()
plt.show()

# No clean-up needed: every helper column lived on price_df or in local Series

## 10. Feature Engineering

### 10.1 Price-Related Features

In [None]:
def create_price_features(df):
    """
    Create features related to price and discount.

    Reads the 'Price', 'Discount' and 'Items_In_Cart' columns; 'Discount' is
    used as a percentage (it is divided by 100). The input frame is not
    modified.

    Returns a copy of ``df`` with these extra columns: Effective_Price,
    Discount_Amount, Price_Per_Item, High_Discount, Has_Discount and
    Price_Category.
    """
    df = df.copy()

    # Effective price after discount
    df['Effective_Price'] = df['Price'] * (1 - df['Discount'] / 100)

    # Discount amount in euros
    df['Discount_Amount'] = df['Price'] * df['Discount'] / 100

    # Price spread over the cart size; +1 avoids division by zero for empty carts
    df['Price_Per_Item'] = df['Price'] / (df['Items_In_Cart'] + 1)

    # Binary: High discount (>30%)
    df['High_Discount'] = (df['Discount'] > 30).astype(int)

    # Binary: Has discount
    df['Has_Discount'] = (df['Discount'] > 0).astype(int)

    # Price categories. include_lowest=True puts a price of exactly 0 into
    # 'Low', and the open-ended top bin puts anything above 500 into
    # 'Premium'; with the previous closed bins [0, ..., 10000] both cases
    # became NaN.
    df['Price_Category'] = pd.cut(df['Price'],
                                  bins=[0, 50, 200, 500, np.inf],
                                  labels=['Low', 'Medium', 'High', 'Premium'],
                                  include_lowest=True)

    return df

# Smoke-test on the training set: show the inputs next to the derived price columns
train_with_price_features = create_price_features(train_df)
price_preview_columns = ['Price', 'Discount', 'Effective_Price', 'Discount_Amount',
                         'Price_Per_Item', 'High_Discount', 'Has_Discount', 'Price_Category']
print("New price-related features created:")
print(train_with_price_features[price_preview_columns].head())

### 10.2 Engagement & Behavior Features

In [None]:
def create_engagement_features(df):
    """
    Create features related to user engagement and behavior.

    Reads 'Reviews_Read', 'Items_In_Cart', 'Engagement_Score',
    'Email_Interaction' and 'Campaign_Period'. The input frame is not
    modified.

    Returns a copy of ``df`` with these extra columns: Reviews_Engaged,
    Heavy_Reviewer, Has_Items_In_Cart, Multiple_Items_In_Cart,
    Engagement_Level, Total_Engagement and Email_During_Campaign.

    Note: the median and quartile thresholds are computed from the frame
    passed in, so train and test frames are bucketed with their own
    statistics.
    """
    df = df.copy()

    # Reviews engagement
    df['Reviews_Engaged'] = (df['Reviews_Read'] > 0).astype(int)
    df['Heavy_Reviewer'] = (df['Reviews_Read'] > df['Reviews_Read'].median()).astype(int)

    # Cart engagement
    df['Has_Items_In_Cart'] = (df['Items_In_Cart'] > 0).astype(int)
    df['Multiple_Items_In_Cart'] = (df['Items_In_Cart'] > 1).astype(int)

    # Engagement score quartiles. qcut(labels=[4 names], duplicates='drop')
    # raises when tied quantile edges are dropped, because fewer than four
    # bins remain. So fetch the de-duplicated edges first and apply only as
    # many labels (from the low end) as there are bins.
    _, level_edges = pd.qcut(df['Engagement_Score'], q=4, retbins=True, duplicates='drop')
    level_names = ['Low', 'Medium', 'High', 'Very_High'][:len(level_edges) - 1]
    df['Engagement_Level'] = pd.cut(df['Engagement_Score'],
                                    bins=level_edges,
                                    labels=level_names,
                                    include_lowest=True)

    # Combined engagement metric (an email interaction is weighted as 5 actions)
    df['Total_Engagement'] = (df['Reviews_Read'] + df['Items_In_Cart'] +
                              df['Email_Interaction'] * 5) * df['Engagement_Score']

    # Interaction with email during campaign, stored as 0/1 like every other
    # flag here (the bare `&` produced a boolean column instead)
    df['Email_During_Campaign'] = ((df['Email_Interaction'] == 1) &
                                   df['Campaign_Period'].astype(bool)).astype(int)

    return df

# Smoke-test: stack the engagement features on top of the price features and preview them
train_with_engagement = create_engagement_features(train_with_price_features)
engagement_preview_columns = ['Reviews_Read', 'Items_In_Cart', 'Engagement_Score',
                              'Reviews_Engaged', 'Has_Items_In_Cart', 'Total_Engagement']
print("New engagement features created:")
print(train_with_engagement[engagement_preview_columns].head())

### 10.3 Temporal Features

In [None]:
def create_temporal_features(df, campaign_windows=((25, 50), (75, 90))):
    """
    Create features related to time and campaign periods.

    Parameters
    ----------
    df : DataFrame with 'Day' and 'Time_of_Day' columns. Not modified.
    campaign_windows : sequence of (first_day, last_day) pairs, both ends
        inclusive, numbered 1, 2, ... in the order given. The default
        reproduces the two windows that were previously hard-coded.

    Returns a copy of ``df`` with these extra columns: Days_Since_Start,
    Days_Into_Campaign, Campaign_Number, Is_Morning, Is_Evening,
    Is_Afternoon, Is_Weekend and Week_Number.
    """
    df = df.copy()

    # Days since start
    df['Days_Since_Start'] = df['Day']

    # Days into campaign (0 outside campaigns AND on a campaign's first day)
    # and which campaign the day belongs to (0=none, 1=first, 2=second, ...)
    df['Days_Into_Campaign'] = 0
    df['Campaign_Number'] = 0
    for number, (first_day, last_day) in enumerate(campaign_windows, start=1):
        in_window = df['Day'].between(first_day, last_day)
        df.loc[in_window, 'Days_Into_Campaign'] = df.loc[in_window, 'Day'] - first_day
        df.loc[in_window, 'Campaign_Number'] = number

    # Time of day indicators
    df['Is_Morning'] = (df['Time_of_Day'] == 'morning').astype(int)
    df['Is_Evening'] = (df['Time_of_Day'] == 'evening').astype(int)
    df['Is_Afternoon'] = (df['Time_of_Day'] == 'afternoon').astype(int)

    # Weekend proxy (assuming 7-day weeks, days 6-7, 13-14, etc. are weekends)
    df['Is_Weekend'] = ((df['Day'] % 7 == 6) | (df['Day'] % 7 == 0)).astype(int)

    # Week number (days 1-7 are week 1, days 8-14 are week 2, ...)
    df['Week_Number'] = (df['Day'] - 1) // 7 + 1

    return df

# Smoke-test: add the temporal features and preview the first 20 rows
train_with_temporal = create_temporal_features(train_with_engagement)
temporal_preview_columns = ['Day', 'Campaign_Period', 'Campaign_Number', 'Days_Into_Campaign',
                            'Time_of_Day', 'Is_Morning', 'Is_Weekend', 'Week_Number']
print("New temporal features created:")
print(train_with_temporal[temporal_preview_columns].head(20))

### 10.4 Customer Segment Features

In [None]:
def create_customer_features(df):
    """
    Create features related to customer demographics and segmentation.

    Reads 'Age', 'Gender', 'Socioeconomic_Status_Score' and
    'Engagement_Score'. The input frame is not modified.

    Returns a copy of ``df`` with these extra columns: Age_Group, Is_Young,
    Is_Senior, SES_Category, Is_High_SES, Customer_Value_Score, Female_Young
    and Male_Senior.

    Note: the tertile and 75th-percentile thresholds are computed from the
    frame passed in, so train and test frames use their own statistics.
    """
    df = df.copy()

    # Age groups. The last bin is open-ended so that '56+' really covers
    # every age above 55 (ages above 100 used to become NaN).
    df['Age_Group'] = pd.cut(df['Age'],
                             bins=[0, 25, 35, 45, 55, np.inf],
                             labels=['18-25', '26-35', '36-45', '46-55', '56+'])

    # Young customer
    df['Is_Young'] = (df['Age'] < 30).astype(int)

    # Senior customer
    df['Is_Senior'] = (df['Age'] > 55).astype(int)

    # Socioeconomic tertiles. qcut(labels=[3 names], duplicates='drop') raises
    # when tied quantile edges are dropped, because fewer than three bins
    # remain. So fetch the de-duplicated edges first and apply only as many
    # labels (from the low end) as there are bins.
    _, ses_edges = pd.qcut(df['Socioeconomic_Status_Score'], q=3,
                           retbins=True, duplicates='drop')
    ses_names = ['Low_SES', 'Mid_SES', 'High_SES'][:len(ses_edges) - 1]
    df['SES_Category'] = pd.cut(df['Socioeconomic_Status_Score'],
                                bins=ses_edges,
                                labels=ses_names,
                                include_lowest=True)

    # High SES flag: strictly above the 75th percentile of this frame
    df['Is_High_SES'] = (df['Socioeconomic_Status_Score'] >
                         df['Socioeconomic_Status_Score'].quantile(0.75)).astype(int)

    # Customer value score (equal-weight blend of SES and engagement)
    df['Customer_Value_Score'] = (df['Socioeconomic_Status_Score'] * 0.5 +
                                  df['Engagement_Score'] * 0.5)

    # Gender-Age interaction
    # NOTE(review): assumes Gender is coded 1 = female, 0 = male - confirm against the data dictionary
    df['Female_Young'] = ((df['Gender'] == 1) & (df['Age'] < 35)).astype(int)
    df['Male_Senior'] = ((df['Gender'] == 0) & (df['Age'] > 50)).astype(int)

    return df

# Smoke-test: add the customer segment features and preview them next to their inputs
train_with_customer = create_customer_features(train_with_temporal)
customer_preview_columns = ['Age', 'Gender', 'Socioeconomic_Status_Score',
                            'Age_Group', 'SES_Category', 'Customer_Value_Score']
print("New customer segment features created:")
print(train_with_customer[customer_preview_columns].head())

### 10.5 Device & Channel Features

In [None]:
def create_channel_features(df):
    """
    Derive 0/1 indicator columns from device, payment and referral channel.

    Reads 'Device_Type', 'Payment_Method', 'Referral_Source' and
    'Email_Interaction'; returns a copy of ``df`` with the indicators
    appended, leaving the input untouched.
    """
    out = df.copy()

    device = out['Device_Type']
    payment = out['Payment_Method']
    referral = out['Referral_Source']

    # One indicator per device label
    device_flags = {'Is_Mobile': 'Mobile', 'Is_Desktop': 'Desktop', 'Is_Tablet': 'Tablet'}
    for flag_name, device_label in device_flags.items():
        out[flag_name] = (device == device_label).astype(int)

    # Payment methods grouped into digital vs. traditional
    out['Digital_Payment'] = payment.isin(['Credit', 'PayPal']).astype(int)
    out['Traditional_Payment'] = payment.isin(['Bank', 'Cash']).astype(int)

    # Referral sources grouped into organic, paid and social traffic
    out['Organic_Traffic'] = referral.isin(['Direct', 'Search_engine']).astype(int)
    out['Paid_Traffic'] = referral.isin(['Ads', 'Email']).astype(int)
    out['Social_Traffic'] = (referral == 'Social_media').astype(int)

    # Arrived via an email referral and also interacted with an email
    came_from_email = referral == 'Email'
    interacted_with_email = out['Email_Interaction'] == 1
    out['Email_Engaged_User'] = (came_from_email & interacted_with_email).astype(int)

    return out

# Smoke-test: add the channel indicators and preview them next to their source columns
train_with_channels = create_channel_features(train_with_customer)
channel_preview_columns = ['Device_Type', 'Payment_Method', 'Referral_Source',
                           'Is_Mobile', 'Digital_Payment', 'Organic_Traffic']
print("New channel features created:")
print(train_with_channels[channel_preview_columns].head())

### 10.6 Interaction Features

In [None]:
def create_interaction_features(df):
    """
    Build pairwise interaction features from existing columns.

    Returns a copy of ``df`` with product, ratio and combined-flag columns
    appended; the input frame is left untouched.
    """
    out = df.copy()

    engagement = out['Engagement_Score']
    cart_size = out['Items_In_Cart']

    # Products of two numeric columns
    out['Price_Discount_Interaction'] = out['Price'] * out['Discount']
    out['Engagement_SES_Interaction'] = engagement * out['Socioeconomic_Status_Score']

    # Reviews read per cart item (+1 keeps the ratio defined for empty carts)
    out['Reviews_Items_Ratio'] = out['Reviews_Read'] / (cart_size + 1)

    out['Age_Price_Interaction'] = out['Age'] * out['Price']

    # Discount counted only while a campaign is running
    campaign_flag = out['Campaign_Period'].astype(int)
    out['Campaign_Discount_Interaction'] = campaign_flag * out['Discount']

    # Mobile shopping in the evening
    on_mobile = out['Device_Type'] == 'Mobile'
    in_evening = out['Time_of_Day'] == 'evening'
    out['Mobile_Evening'] = (on_mobile & in_evening).astype(int)

    # Above-median engagement (median of this frame) with a non-empty cart
    above_median_engagement = engagement > engagement.median()
    has_cart_items = cart_size > 0
    out['High_Engagement_With_Cart'] = (above_median_engagement & has_cart_items).astype(int)

    return out

# Smoke-test: add the interaction features and preview two of them next to their inputs
train_with_interactions = create_interaction_features(train_with_channels)
interaction_preview_columns = ['Price', 'Discount', 'Engagement_Score',
                               'Price_Discount_Interaction', 'Engagement_SES_Interaction']
print("New interaction features created:")
print(train_with_interactions[interaction_preview_columns].head())

### 10.7 Combined Feature Engineering Pipeline

In [None]:
def engineer_features(df):
    """
    Run every feature-engineering step in sequence and return the result.

    Each step receives the output of the previous one, so later steps can
    read columns created earlier.
    """
    steps = (
        create_price_features,
        create_engagement_features,
        create_temporal_features,
        create_customer_features,
        create_channel_features,
        create_interaction_features,
    )
    for step in steps:
        df = step(df)

    return df

# Run the full pipeline on both splits
train_engineered = engineer_features(train_df)
test_engineered = engineer_features(test_df)

# Compare column counts before and after engineering
n_original_columns = train_df.shape[1]
n_engineered_columns = train_engineered.shape[1]
print(f"Original training set shape: {train_df.shape}")
print(f"Engineered training set shape: {train_engineered.shape}")
print(f"\nNumber of new features created: {n_engineered_columns - n_original_columns}")

In [None]:
# List the engineered columns that are absent from the raw training frame
original_columns = set(train_df.columns)
new_features = [name for name in train_engineered.columns if name not in original_columns]
print(f"\nNew features ({len(new_features)} total):")
print("="*60)
for position, feature_name in enumerate(new_features, start=1):
    print(f"{position}. {feature_name}")

## 11. Feature Importance Analysis

In [None]:
# Quick random forest to assess feature importance.
# The forest is fit on the full training set purely to rank features; it is
# not a validated model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Prepare data for quick model
rf_df = train_engineered.copy()

# Encode categorical variables. Rather than a hand-maintained list, encode
# every feature column that is neither numeric nor boolean, so a text column
# missing from such a list (e.g. 'Category', if it holds labels) cannot make
# the fit fail with a string-to-float conversion error.
le = LabelEncoder()
non_feature_cols = ['Purchase', 'Session_ID']
categorical_cols = [col for col in rf_df.select_dtypes(exclude=['number', 'bool']).columns
                    if col not in non_feature_cols]

for col in categorical_cols:
    # astype(str) gives missing values their own 'nan' level
    rf_df[col] = le.fit_transform(rf_df[col].astype(str))

# Drop non-feature columns
feature_cols = [col for col in rf_df.columns if col not in non_feature_cols]
X = rf_df[feature_cols]
y = rf_df['Purchase']

# Train quick random forest
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X, y)

# Get feature importances, most important first
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

print("Top 30 Most Important Features:")
print("="*60)
print(feature_importance.head(30))

In [None]:
# Horizontal bar chart of the 25 highest-ranked features, most important at the top
fig, ax = plt.subplots(figsize=(12, 10))
top_features = feature_importance.head(25)
bar_positions = range(len(top_features))
ax.barh(bar_positions, top_features['Importance'], alpha=0.7)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['Feature'])
ax.set_xlabel('Importance Score')
ax.set_title('Top 25 Feature Importances (Random Forest)', fontsize=14, fontweight='bold')
# barh draws the first row at the bottom; flip so rank 1 is on top
ax.invert_yaxis()
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

## 12. Key Insights Summary

In [None]:
# Compile key insights from the results computed in the cells above
print("="*80)
print("KEY INSIGHTS FROM EDA & FEATURE ENGINEERING")
print("="*80)

print("\n1. TARGET VARIABLE")
print("-" * 80)
print(f"   - Overall purchase rate: {purchase_rate:.2%}")
print(f"   - Class imbalance ratio: {(1-purchase_rate)/purchase_rate:.1f}:1")

print("\n2. CAMPAIGN EFFECTIVENESS")
print("-" * 80)
# NOTE(review): the True/False lookups assume Campaign_Period is a boolean column
campaign_stats = train_df.groupby('Campaign_Period')['Purchase'].mean()
print(f"   - Purchase rate during campaigns: {campaign_stats[True]:.2%}")
print(f"   - Purchase rate outside campaigns: {campaign_stats[False]:.2%}")
print(f"   - Campaign lift: {(campaign_stats[True]/campaign_stats[False]-1)*100:.1f}%")

print("\n3. TOP CORRELATED FEATURES WITH PURCHASE")
print("-" * 80)
print(target_corr.head(6))

print("\n4. FEATURES CREATED")
print("-" * 80)
print(f"   - Original features: {train_df.shape[1]}")
print(f"   - Engineered features: {train_engineered.shape[1]}")
print(f"   - New features added: {train_engineered.shape[1] - train_df.shape[1]}")

print("\n5. TOP FEATURE CATEGORIES (by importance)")
print("-" * 80)
# Number by rank. After sort_values the frame's index still holds each
# feature's original position, so using it printed arbitrary numbers
# instead of 1..10.
top_ten = feature_importance.head(10)
for rank, row in enumerate(top_ten.itertuples(index=False), start=1):
    print(f"   {rank}. {row.Feature}: {row.Importance:.4f}")

print("\n" + "="*80)

## 13. Save Engineered Data

In [None]:
# Write both engineered splits next to the raw data (row index omitted)
outputs = [
    (train_engineered, '../data/train_engineered.csv'),
    (test_engineered, '../data/test_engineered.csv'),
]
for frame, destination in outputs:
    frame.to_csv(destination, index=False)

print("Engineered datasets saved!")
print(f"Train: {train_engineered.shape}")
print(f"Test: {test_engineered.shape}")

## 14. Next Steps

### For Tomorrow's Team Discussion:

1. **Feature Selection**
   - Compare which features each team member found most useful
   - Decide on a unified feature set or test multiple combinations
   - Consider dimensionality reduction (PCA, feature selection methods)

2. **Model Selection**
   - Test multiple algorithms (Logistic Regression, Random Forest, XGBoost, etc.)
   - Consider ensemble methods
   - Set up proper cross-validation

3. **Class Imbalance Handling**
   - Test SMOTE, class weights, or undersampling
   - Evaluate using appropriate metrics (F1, ROC-AUC, Precision-Recall)

4. **Business Constraints**
   - Set threshold to achieve ≤€200/day budget (max 2000 sessions)
   - Optimize for precision to reduce wasted ad spend
   - Create cost-benefit analysis

5. **Insights & Playbook**
   - Develop 3-5 actionable marketing rules from feature importance
   - Create customer segments for targeted marketing
   - Document findings for the report

## Notes & Ideas

Use this section to document your own observations and ideas:

- 
- 
- 