In [19]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector as selector

In [20]:
# Load the cleaned customer table and discard its two leading columns
# (selected by position, not by name) in a single expression.
df = pd.read_csv('data/ecommerce_customer_data_cleaned.csv').iloc[:, 2:]
df

Unnamed: 0,Age,Gender,IncomeLevel,Country,City,TotalPurchases,AverageOrderValue,CustomerLifetimeValue,EmailEngagementRate,SocialMediaEngagementRate,...,PremiumMember,HasReturnedItems,FavoriteCategory_Beauty,FavoriteCategory_Books,FavoriteCategory_Clothing,FavoriteCategory_Electronics,FavoriteCategory_Food,FavoriteCategory_Home Goods,FavoriteCategory_Sports,FavoriteCategory_Toys
0,25,Prefer not to say,High,Japan,Tokyo,4,15.886509,327.828625,0.332365,0.379694,...,Yes,No,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0
1,38,Prefer not to say,High,United Kingdom,London,6,27.638853,181.725056,0.344574,0.140988,...,No,No,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5
2,43,Prefer not to say,High,France,Paris,10,161.739425,1810.555150,0.409656,0.323660,...,No,Yes,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,49,Female,High,United States,Los Angeles,5,14.194263,86.219740,0.356765,0.268428,...,Yes,No,0.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0
4,29,Female,Very High,Japan,Tokyo,7,298.953396,2112.575945,0.222703,0.160427,...,No,No,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,39,Female,High,United States,New York,4,60.986389,212.992614,0.105934,0.128575,...,Yes,Yes,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0
9996,58,Prefer not to say,Low,Australia,Sydney,6,75.208556,549.352094,0.424127,0.298521,...,No,Yes,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0
9997,38,Other,Very High,United States,New York,1,37.196899,117.203509,0.487704,0.126018,...,No,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9998,29,Other,High,Germany,Berlin,3,13.191952,-6.655247,0.377627,0.600112,...,No,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
# Every column of `df` except the regression target is used as a feature.
feature_columns = df.columns.tolist()
feature_columns.remove('CustomerLifetimeValue')
feature_columns

['Age',
 'Gender',
 'IncomeLevel',
 'Country',
 'City',
 'TotalPurchases',
 'AverageOrderValue',
 'EmailEngagementRate',
 'SocialMediaEngagementRate',
 'MobileAppUsage',
 'CustomerServiceInteractions',
 'AverageSatisfactionScore',
 'EmailConversionRate',
 'SocialMediaConversionRate',
 'SearchEngineConversionRate',
 'RepeatCustomer',
 'PremiumMember',
 'HasReturnedItems',
 'FavoriteCategory_Beauty',
 'FavoriteCategory_Books',
 'FavoriteCategory_Clothing',
 'FavoriteCategory_Electronics',
 'FavoriteCategory_Food',
 'FavoriteCategory_Home Goods',
 'FavoriteCategory_Sports',
 'FavoriteCategory_Toys']

In [39]:
def create_preprocessor(X):
    """Build an unfitted ColumnTransformer tailored to the columns of ``X``.

    Columns are split by dtype:

    * columns selected by ``np.number`` are standardized and then passed
      through PCA with ``n_components = 0.95``;
    * every other column is one-hot encoded (dense output, with
      ``handle_unknown = 'ignore'``).

    A branch is only registered when it has at least one column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame whose dtypes decide the numeric/categorical split.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer configured with ``remainder = 'drop'``.
    """
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = [
        name for name in X.columns if name not in numeric_features
    ]

    # Collect (name, transformer, columns) triples, numeric branch first.
    branches = []

    if numeric_features:
        scale_then_reduce = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components = 0.95))
        ])
        branches.append(('num', scale_then_reduce, numeric_features))

    if categorical_features:
        one_hot = Pipeline([
            ('encoder', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
        ])
        branches.append(('cat', one_hot, categorical_features))

    return ColumnTransformer(branches, remainder = 'drop')

In [50]:
def clv_regression(X, y, n_estimators = 300, random_state = 42, n_jobs = -1):
    """Fit a preprocessing + random-forest pipeline that predicts ``y`` from ``X``.

    The preprocessor is derived from the columns of ``X`` via
    ``create_preprocessor`` and is fitted together with the regressor, so the
    returned pipeline accepts raw feature frames with the same columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : array-like
        Training target values, aligned with the rows of ``X``.
    n_estimators : int, default 300
        Number of trees in the random forest.
    random_state : int or None, default 42
        Seed forwarded to the forest; keep it fixed for reproducible fits.
    n_jobs : int or None, default -1
        Parallelism forwarded to the forest (-1 uses all available cores).

    Returns
    -------
    Pipeline
        The fitted pipeline with steps ``'preprocess'`` and ``'regressor'``.
    """
    preprocessor = create_preprocessor(X)

    # Regressor that produces the final target estimate.
    model = RandomForestRegressor(
        n_estimators = n_estimators,
        random_state = random_state,
        n_jobs = n_jobs
    )

    # Preprocessing and model are chained so both are fitted on the same data.
    pipe = Pipeline([
        ('preprocess', preprocessor),
        ('regressor', model)
    ])

    pipe.fit(X, y)

    return pipe

In [51]:
# Feature frame and regression target, both taken from the loaded table.
X, y = df[feature_columns], df['CustomerLifetimeValue']

In [52]:
# Hold out 30% of the rows for evaluation. The fixed seed keeps the split, and
# therefore the fitted model and its predictions, reproducible across
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [56]:
# Fit the full preprocessing + regression pipeline on the training split only.
model = clv_regression(X_train, y_train)
model

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [57]:
def infer(model, X):
    """Return ``model``'s predictions for the feature rows in ``X``.

    ``model`` is any object exposing a ``predict`` method; ``X`` is passed
    through to it unchanged.
    """
    predictions = model.predict(X)
    return predictions

In [58]:
# Predict the target for the held-out test rows.
infer(model, X_test)

array([675.64041146,  49.32490878, 103.42065104, ..., 381.17282631,
       516.79587558, 454.71444131], shape=(3000,))