In [None]:
import pandas as pd
from IPython.display import display
import uuid
import random
from datetime import datetime, timedelta

OVERWRITE = False

In [22]:
error_metrics = ["MAE", "RMSE"]
other_metrics = ["HR", "cHR", "MRR", "Coverage", "Precision@K", "Recall@K", "F1@K"]
all_metrics = error_metrics + other_metrics

In [23]:
country_codes = ["FR", "LU", "NL", "UK"]
variant_ids = [
    [f"{country_code}{i:04d}" for i in range(1, 4)] for country_code in country_codes
]
versions = ["1.0.0", "1.0.1", "1.0.2"]
model_types = ["content-based", "collaborative", "hybrid"]


def create_model_metrics(factor):
    mae_lower = 4.0 - float(factor)
    other_lower = float(factor)
    other_upper = 10.0 - (4.0 - other_lower)
    mae = round(random.uniform(mae_lower, 10.0) / 10, 5)
    rmse = round(random.uniform(mae_lower + 1.0, 10.0) / 10, 5)
    other_metrics = ["HR", "cHR", "MRR", "Coverage", "Precision@K", "Recall@K", "F1@K"]
    return {
        "MAE": mae,
        "RMSE": rmse,
        **{k: round(random.uniform(other_lower, other_upper) / 10.0, 5) for k in other_metrics},
    }


model_features = []
for country_code, variants in zip(country_codes, variant_ids):
    for variant_idx, (variant, model_type) in enumerate(zip(variants, model_types)):
        for version_idx, version in enumerate(versions):
            uid = str(uuid.uuid4()).split("-")[0]
            new_features = pd.DataFrame(
                {
                    "model_name": ["LeadFinder"],
                    "country_code": [country_code],
                    "variant_id": [variant],
                    "model_type": [model_type],
                    "version": [version],
                    "model_id": [uid],
                    **{
                        k: [v]
                        for k, v in create_model_metrics(
                            (variant_idx + version_idx)
                        ).items()
                    },
                }
            )
            model_features.append(new_features)

model_features_df = pd.concat(model_features).sort_values(["country_code", "variant_id", "version"])

if OVERWRITE:
    model_features_df.to_csv("../data/model_features.csv", index=False)

display(model_features_df.head(3))

Unnamed: 0,model_name,country_code,variant_id,model_type,version,model_id,MAE,RMSE,HR,cHR,MRR,Coverage,Precision@K,Recall@K,F1@K
0,LeadFinder,FR,FR0001,content-based,1.0.0,30472591,0.68957,0.97195,0.4828,0.18846,0.14007,0.35053,0.19152,0.26864,0.48828
0,LeadFinder,FR,FR0001,content-based,1.0.1,11335fdb,0.49247,0.43225,0.44294,0.11499,0.38627,0.38324,0.13777,0.4428,0.13419
0,LeadFinder,FR,FR0001,content-based,1.0.2,74fec078,0.7822,0.71788,0.37655,0.29021,0.24611,0.38166,0.57859,0.35524,0.3588


In [24]:
user_ids = [
    f"U{i:04d}" for i in range(1, 51)
]
user_feedback = []
for user_id in user_ids:
    start_time = datetime.now() - timedelta(days=1, hours=6)
    for country_code in country_codes:
        num_requests = random.randint(5, 21)
        for _ in range(num_requests):
            model = model_features_df[model_features_df["country_code"] == country_code].sample(1)
            model["MAE"] = 1.0 - model["MAE"]
            model["RMSE"] = 1.0 - model["RMSE"]
            model["score"] = model.loc[:, all_metrics].mean(axis=1).round(5)
            model = model[["model_id", "score"]]
            model_props = model.to_dict(orient="records")[0]
            score = model_props["score"]
            feedback = random.choices([0, 1], weights=[1.0 - score, score])[0]
            user_response = {
                "user_id": user_id,
                "model_id": model_props["model_id"],
                "country_code": country_code,
                "timestamp": str(start_time).split(".")[0],
                "feedback": feedback,
            }
            user_feedback.append(user_response)
            start_time += timedelta(minutes=random.randint(1, 5), seconds=random.randint(0, 59))

user_feedback_df = pd.DataFrame(user_feedback).sort_values(["timestamp"])

if OVERWRITE:
    user_feedback_df.to_csv("../data/user_feedback.csv", index=False)

user_feedback_df.head(3)

Unnamed: 0,user_id,model_id,country_code,timestamp,feedback
0,U0001,8e95c798,FR,2025-11-17 07:17:14,1
130,U0004,89fca1aa,FR,2025-11-17 07:17:14,0
174,U0005,8fc5b433,FR,2025-11-17 07:17:14,0
