In [5]:
import pandas as pd,numpy as np, os, joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

os.makedirs("models",exist_ok=True)
os.makedirs("outputs",exist_ok=True)

TRAIN_FILE="ltv_train.csv"
MODEL_PATH="models/ltv_xgb.joblib"
PRED_PATH="outputs/ltv_predictions.csv"

In [8]:
df=pd.read_csv(TRAIN_FILE)
print("Loaded:", TRAIN_FILE,"shape:",df.shape)
display(df.head())

Loaded: ltv_train.csv shape: (51, 14)


Unnamed: 0,customer_id,frequency,monetary_sum,monetory_avg,last_order_date,first_order_date,distinct_categories,avg_items,recencey_days,tenure_days,orders_30d,orders_90d,orders_180d,ltv_12m
0,C0001,2,1650,825.0,2023-05-11,2023-02-15,2,1.5,600,685,0.0,0.0,0.0,3620.0
1,C0002,4,2480,620.0,2024-03-29,2022-12-28,4,1.75,277,734,0.0,0.0,0.0,2970.0
2,C0003,4,2790,697.5,2024-07-19,2023-03-22,4,1.25,165,650,0.0,0.0,1.0,3230.0
3,C0004,4,4540,1135.0,2024-11-01,2020-08-12,4,1.5,60,1602,0.0,2.0,2.0,2770.0
4,C0005,3,1775,591.666667,2024-06-05,2021-06-30,3,1.0,209,1280,0.0,0.0,0.0,830.0


In [9]:
print(df['ltv_12m'].describe().to_string())
print("zero-LTV %:",(df['ltv_12m']==0).mean()*100)
USE_LOG=True

count      51.000000
mean     3490.392157
std      1497.652778
min         0.000000
25%      2665.000000
50%      3280.000000
75%      4185.000000
max      8050.000000
zero-LTV %: 1.9607843137254901


In [11]:
candidate=["frequency","monetory_sum","monetory_avg","avg_items","distinct_categories","recency_days","tenure_days","orders_30d",
           "orders_90d","or ders_180d","age","acq_channel_count","signup_tenure_days"]
features=[f for f in candidate if f in df.columns]
cats=[c for c in["region","acquisition_channel"] if c in df.columns]
print("Numeric features:", features)
print("Categorical features:", cats)

Numeric features: ['frequency', 'monetory_avg', 'avg_items', 'distinct_categories', 'tenure_days', 'orders_30d', 'orders_90d']
Categorical features: []


In [13]:
train_df, val_df= train_test_split(df, test_size=0.2, random_state=42)
print("Train:",train_df.shape, "Val:", val_df.shape)

Train: (40, 14) Val: (11, 14)


In [14]:
# create ColumnTransformer (one-hot small cats)
transformers = []
if cats:
    transformers.append(("cat", OneHotEncoder(handle_unknown="ignore", sparse=False), cats))
preprocessor = ColumnTransformer(transformers=transformers, remainder="passthrough", verbose_feature_names_out=False)
preprocessor.fit(train_df[features + cats])
X_train = preprocessor.transform(train_df[features + cats])
X_val   = preprocessor.transform(val_df[features + cats])

if USE_LOG:
    y_train = np.log1p(train_df['ltv_12m'].values)
    y_val = np.log1p(val_df['ltv_12m'].values)
else:
    y_train = train_df['ltv_12m'].values
    y_val = val_df['ltv_12m'].values

print("X_train shape:", X_train.shape, "y_train len:", len(y_train))

X_train shape: (40, 7) y_train len: 40


In [16]:
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=6, subsample=0.8,
                     colsample_bytree=0.8, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
joblib.dump({"model":model, "preprocessor":preprocessor}, MODEL_PATH)
print("Saved model:", MODEL_PATH)

Saved model: models/ltv_xgb.joblib


In [17]:
pred_val = model.predict(X_val)
if USE_LOG:
    pred_val_inv = np.expm1(pred_val)
    y_val_inv = np.expm1(y_val)
else:
    pred_val_inv = pred_val
    y_val_inv = y_val

mae = mean_absolute_error(y_val_inv, pred_val_inv)
rmse = mean_squared_error(y_val_inv, pred_val_inv, squared=False)
print(f"Validation MAE: {mae:.2f}")
print(f"Validation RMSE: {rmse:.2f}")
pd.Series({"MAE":mae,"RMSE":rmse}).to_json("outputs/model_metrics.json")

Validation MAE: 1083.84
Validation RMSE: 1345.63


In [19]:
all_feat = pd.read_csv("ltv_features_raw.csv")
X_all = preprocessor.transform(all_feat[features + cats])
pred_all = model.predict(X_all)
if USE_LOG: pred_all = np.expm1(pred_all)

out = all_feat[['customer_id']].copy()
out['predicted_ltv_12m'] = pred_all
out['ltv_segment'] = pd.qcut(out['predicted_ltv_12m'].rank(method='first'), q=[0,.6,.9,1.0], labels=['Low','Medium','High'])
out.to_csv(PRED_PATH, index=False)
display(out.head(10))
print("Saved:", PRED_PATH)

Unnamed: 0,customer_id,predicted_ltv_12m,ltv_segment
0,C0001,3617.125244,Medium
1,C0002,2971.211914,Low
2,C0003,3227.629395,Low
3,C0004,3603.339111,Low
4,C0005,830.186035,Low
5,C0006,5392.304688,High
6,C0007,2710.61377,Low
7,C0008,3897.77002,Medium
8,C0009,2200.937744,Low
9,C0010,6297.110352,High


Saved: outputs/ltv_predictions.csv


In [20]:
import os
os.getcwd()

'C:\\Users\\somus'

In [21]:
import os
os.listdir("outputs")

['ltv_predictions.csv', 'model_metrics.json']