In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Load the raw e-commerce customer table from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/e-commerce/ecommerce_customer_data.csv',
)

In [3]:
# Print the column names, non-null counts and dtypes to spot missing values
# and columns that still need type conversion (e.g. dates stored as object).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer_ID               2000 non-null   object 
 1   Churn_Label               2000 non-null   int64  
 2   Age                       2000 non-null   int64  
 3   Gender                    1959 non-null   object 
 4   Location_Region           2000 non-null   object 
 5   Marital_Status            2000 non-null   object 
 6   Annual_Income             2000 non-null   float64
 7   Membership_Tier           2000 non-null   object 
 8   Join_Date                 2000 non-null   object 
 9   Last_Login_Date           2000 non-null   object 
 10  Total_Spend_Lifetime      2000 non-null   float64
 11  Items_In_Cart             2000 non-null   int64  
 12  Avg_Session_Duration_Min  2000 non-null   float64
 13  Customer_Support_Calls    2000 non-null   int64  
 14  Preferre

In [4]:
# Plain-text preview of the first five rows of the raw table.
first_rows = df.head(5)
print(first_rows)

  Customer_ID  Churn_Label  Age  Gender Location_Region Marital_Status  \
0  CUST_00000            0   45  Female            West       Divorced   
1  CUST_00001            1   24  Female            East        Married   
2  CUST_00002            0   52  Female            East        Married   
3  CUST_00003            0   45    Male           South        Married   
4  CUST_00004            0   31  Female            East       Divorced   

   Annual_Income Membership_Tier   Join_Date Last_Login_Date  \
0       79239.91          Silver  2021-10-16      2023-11-02   
1       67390.43        Platinum  2020-04-24      2023-01-02   
2       96437.88        Platinum  2020-01-26      2023-05-22   
3       31695.81        Platinum  2022-01-29      2023-03-16   
4       47409.48            Gold  2020-10-08      2023-03-08   

   Total_Spend_Lifetime  Items_In_Cart  Avg_Session_Duration_Min  \
0                944.99              6                      12.1   
1                 45.50           

In [5]:
# Record the (rows, columns) shape before any feature engineering is applied.
original_shape = df.shape
print("Original Shape:", original_shape)

Original Shape: (2000, 19)


In [6]:
# STEP 2: Date Engineering
# Parse 'Join_Date' from string (object dtype) into datetime64 so the .dt accessor is available.
df['Join_Date'] = pd.to_datetime(df['Join_Date'])

# Calendar components of the join date.
df['Join_Year'] = df['Join_Date'].dt.year
df['Join_Month'] = df['Join_Date'].dt.month
df['Join_Weekday'] = df['Join_Date'].dt.day_name()

# Tenure = whole days between the join date and a reference date.
# With TENURE_REFERENCE_DATE = None the reference is today's date (the original behaviour),
# which means Tenure_Days -- and every feature derived from it -- changes each day the
# notebook is re-run. Set it to a fixed date string (e.g. the data-extract date) to make
# the results reproducible.
TENURE_REFERENCE_DATE = None
current_date = (
    pd.Timestamp.today()
    if TENURE_REFERENCE_DATE is None
    else pd.Timestamp(TENURE_REFERENCE_DATE)
).normalize()  # midnight, so the day count does not depend on the time of day the cell runs
df['Tenure_Days'] = (current_date - df['Join_Date']).dt.days

In [7]:
# STEP 3: Categorical Encoding
# One-hot encode the nominal (unordered) columns Location_Region and Gender.
# This replaces each of them with 0/1 integer columns such as 'Region_East', 'Gen_Female', etc.
#
# Gender contains missing values (df.info() reports 1959 non-null out of 2000 rows).
# get_dummies skips NaN by default, which would leave those rows with 0 in every Gen_*
# column and make "missing" indistinguishable from "none of the above". Label them
# explicitly so they get their own 'Gen_Unknown' indicator.
df['Gender'] = df['Gender'].fillna('Unknown')

# NOTE: this cell consumes the original columns, so re-running it without re-running the
# cells above raises a KeyError.
df = pd.get_dummies(df, columns=['Location_Region', 'Gender'], prefix=['Region', 'Gen'], dtype=int)

In [8]:
# STEP 4: Numerical Feature Creation
# Average spend per year of tenure: lifetime spend divided by tenure expressed in years.
# The +0.01 offset keeps the denominator non-zero when Tenure_Days is 0
# (presumably a guard against division by zero for brand-new customers).
df['Spend_Per_Year'] = df['Total_Spend_Lifetime'] / ((df['Tenure_Days'] / 365) + 0.01)

# Binning: group 'Age' into categories (Young, Middle, Senior).
# pd.cut uses right-closed intervals: (0, 30] -> Young_Adult, (30, 50] -> Middle_Aged,
# (50, inf) -> Senior. The top bin is open-ended so that ages above 100 are still
# labelled 'Senior' instead of silently becoming NaN.
bins = [0, 30, 50, float('inf')]
labels = ['Young_Adult', 'Middle_Aged', 'Senior']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels)

In [9]:
# Announce completion, then show the engineered columns for the first few customers.
print("Feature Engineering Complete!")
preview_columns = ['Customer_ID', 'Spend_Per_Year', 'Age_Group']
engineered_preview = df[preview_columns].head()
print(engineered_preview)

Feature Engineering Complete!
  Customer_ID  Spend_Per_Year    Age_Group
0  CUST_00000      223.445308  Middle_Aged
1  CUST_00001        7.970389  Young_Adult
2  CUST_00002      234.326422       Senior
3  CUST_00003       20.915859  Middle_Aged
4  CUST_00004      203.479430  Middle_Aged
