# LifeSync â€” Production-Grade Cold-Start Personalization Pipeline

In [1]:
import numpy as np
import pandas as pd
import json
import joblib
from pathlib import Path

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

In [2]:
BASE_DIR = Path('artifacts')
BASE_DIR.mkdir(exist_ok=True)

SCALER_PATH = BASE_DIR / 'scaler.joblib'
PCA_PATH = BASE_DIR / 'pca.joblib'
GMM_PATH = BASE_DIR / 'gmm.joblib'
CLUSTER_MODEL_PATH = BASE_DIR / 'cluster_predictor.joblib'
CLUSTER_PROFILES_PATH = BASE_DIR / 'cluster_profiles.json'
INFLUENCE_MAP_PATH = BASE_DIR / 'feature_influence.json'

N_CLUSTERS = 12
RANDOM_STATE = 42

In [3]:
df = pd.read_csv('lifesync_synthetic_dataset_20k.csv')
print('Dataset shape:', df.shape)

Dataset shape: (20000, 46)


In [4]:
COLD_START_FEATURES = [
 'sleep_quality','physical_activity_consistency','diet_quality','daily_energy_level','sedentary_level',
 'stress_level','anxiety_level','mood_stability','mindfulness_habit','social_support',
 'focus_ability','task_completion_reliability','distraction_level',
 'financial_discipline','financial_stress'
]

FULL_FEATURES = [c for c in df.columns if c not in [
 'health_score','mind_score','productivity_score','finance_score','life_score'
]]

In [5]:
df['sleep_quality'] = df['sleep_hours'] / 9 * 100
df['physical_activity_consistency'] = df['exercise_days_per_week'] / 7 * 100
df['diet_quality'] = df['diet_ratio']
df['sedentary_level'] = df['sedentary_hours'] / 12 * 100

df['stress_level'] = df['stress_score']
df['anxiety_level'] = df['anxiety_score']
df['mood_stability'] = df['mood_stability_score']
df['mindfulness_habit'] = df['meditation_completion_ratio']
df['social_support'] = (df['family_support_ratio'] + df['friends_support_ratio']) / 2

df['focus_ability'] = df['focus_level']
df['task_completion_reliability'] = df['task_completion_ratio']
df['distraction_level'] = df['distraction_ratio']

df['financial_discipline'] = df['expense_tracking_score']
df['financial_stress'] = df['debt_pressure_score']

In [6]:
scaler = MinMaxScaler()
X_full = scaler.fit_transform(df[FULL_FEATURES])
joblib.dump(scaler, SCALER_PATH)

['artifacts\\scaler.joblib']

In [7]:
pca = PCA(n_components=0.9, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X_full)
joblib.dump(pca, PCA_PATH)
print('PCA dimensions:', X_pca.shape[1])

PCA dimensions: 18


In [8]:
gmm = GaussianMixture(
 n_components=N_CLUSTERS,
 covariance_type='full',
 random_state=RANDOM_STATE
)
df['cluster_id'] = gmm.fit_predict(X_pca)
joblib.dump(gmm, GMM_PATH)

['artifacts\\gmm.joblib']

In [9]:
cluster_profiles = {}

for cid in range(N_CLUSTERS):
 data = df[df['cluster_id'] == cid][FULL_FEATURES]
 cluster_profiles[cid] = {
  'mean': data.mean().to_dict(),
  'std': data.std().fillna(0).to_dict(),
  'p05': data.quantile(0.05).to_dict(),
  'p95': data.quantile(0.95).to_dict()
 }

with open(CLUSTER_PROFILES_PATH, 'w') as f:
 json.dump(cluster_profiles, f)

cluster_profiles = {int(k): v for k, v in json.load(open(CLUSTER_PROFILES_PATH)).items()}

In [None]:
X = df[COLD_START_FEATURES]
y = df['cluster_id']

X_train, X_val, y_train, y_val = train_test_split(
 X, y, test_size=0.2, random_state=RANDOM_STATE
)

cluster_model = LGBMClassifier(
 n_estimators=300,
 max_depth=6,
 learning_rate=0.05,
 random_state=RANDOM_STATE
)

cluster_model.fit(X_train, y_train)
joblib.dump(cluster_model, CLUSTER_MODEL_PATH)

In [11]:
feature_influence = {
 'sleep_hours': {'sleep_quality': 0.04, 'stress_level': -0.03},
 'steps_count': {'physical_activity_consistency': 80, 'sedentary_level': -60},
 'distraction_ratio': {'stress_level': 0.5},
 'savings_ratio': {'financial_stress': -0.5}
}

with open(INFLUENCE_MAP_PATH, 'w') as f:
 json.dump(feature_influence, f, indent=2)

In [12]:
def normalized_entropy(probs):
 probs = np.clip(probs, 1e-9, 1)
 return -np.sum(probs * np.log(probs)) / np.log(len(probs))

In [13]:
def generate_full_profile(user_15, probs):
 cid = np.random.choice(len(probs), p=probs)
 stats = cluster_profiles[cid]

 entropy = normalized_entropy(probs)
 confidence = 1 - entropy

 profile = {}

 for f in FULL_FEATURES:
  base = stats['mean'].get(f, 0)
  std = stats['std'].get(f, 1)

  noise = np.random.normal(0, std * (0.15 + 0.35 * confidence))

  influence = 0
  for src, targets in feature_influence.items():
   if f in targets:
    influence += confidence * targets[f] * (user_15.get(src, 50) - 50)

  value = base + influence + noise
  value = max(stats['p05'].get(f, value), min(value, stats['p95'].get(f, value)))

  profile[f] = value

 return profile, {'entropy': float(entropy), 'confidence': float(confidence)}

In [14]:
def infer_user_profile_from_json(user_json):
 for f in COLD_START_FEATURES:
  if f not in user_json or not (0 <= user_json[f] <= 100):
   raise ValueError(f'Invalid value for {f}')

 probs = cluster_model.predict_proba(pd.DataFrame([user_json]))[0]
 profile, meta = generate_full_profile(user_json, probs)

 return {
  'cluster_probabilities': probs.tolist(),
  'confidence': meta,
  'generated_features': profile
 }

In [15]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score

y_pred = cluster_model.predict(X_val)
y_prob = cluster_model.predict_proba(X_val)

acc = accuracy_score(y_val, y_pred)
top3 = top_k_accuracy_score(y_val, y_prob, k=3)

print(f"Top-1 Accuracy: {acc:.3f}")
print(f"Top-3 Accuracy: {top3:.3f}")
print(f"Mean Cluster Entropy: {np.mean([normalized_entropy(p) for p in y_prob]):.3f}")


Top-1 Accuracy: 0.406
Top-3 Accuracy: 0.830
Mean Cluster Entropy: 0.457


In [16]:
def normalized_mae(real, pred, low, high):
    if high - low == 0:
        return 0
    return abs(real - pred) / (high - low)

errors = []

for i in range(200):
    real_row = df.iloc[X_val.index[i]]
    user_15 = X_val.iloc[i].to_dict()

    probs = cluster_model.predict_proba(pd.DataFrame([user_15]))[0]
    gen_profile, _ = generate_full_profile(user_15, probs)

    cid = int(real_row["cluster_id"])

    for f in FULL_FEATURES:
        err = normalized_mae(
            real_row[f],
            gen_profile[f],
            cluster_profiles[cid]["p05"][f],
            cluster_profiles[cid]["p95"][f]
        )
        errors.append(err)

print("Mean Normalized MAE:", np.mean(errors))


Mean Normalized MAE: 0.27542771777991787


In [17]:
def correlation_check(feature_x, feature_y, samples=300):
    xs, ys = [], []

    for _ in range(samples):
        row = X_val.sample(1).iloc[0]
        probs = cluster_model.predict_proba(pd.DataFrame([row]))[0]
        profile, _ = generate_full_profile(row.to_dict(), probs)

        xs.append(profile[feature_x])
        ys.append(profile[feature_y])

    return np.corrcoef(xs, ys)[0, 1]

print("Stress vs Distraction:", correlation_check("stress_score", "distraction_ratio"))
print("Sleep vs Energy:", correlation_check("sleep_hours", "daily_energy_level"))
print("Debt vs Savings:", correlation_check("debt_pressure_score", "savings_ratio"))


Stress vs Distraction: 0.49374141323240517
Sleep vs Energy: 0.12999022658098294
Debt vs Savings: -0.577274374028165


In [18]:
def sanity_check(profile):
    issues = []

    if profile["sleep_hours"] < 5 and profile["daily_energy_level"] > 70:
        issues.append("Low sleep but high energy")

    if profile["sedentary_hours"] > 10 and profile["steps_count"] > 15000:
        issues.append("High sedentary but high steps")

    if profile["debt_pressure_score"] > 80 and profile["savings_ratio"] > 60:
        issues.append("High debt but high savings")

    return issues


violations = 0
for _ in range(200):
    row = X_val.sample(1).iloc[0]
    probs = cluster_model.predict_proba(pd.DataFrame([row]))[0]
    profile, _ = generate_full_profile(row.to_dict(), probs)

    if sanity_check(profile):
        violations += 1

print("Sanity Violations:", violations)


Sanity Violations: 0
