⭐ CEL 1 — USERS DATA (perfect voor recommender + realistische clusters)

In [9]:
import pandas as pd
import numpy as np

# Fixed seed so the synthetic user table is identical on every run.
rng = np.random.default_rng(42)

N_USERS = 1000

# Category labels and, where applicable, their sampling weights.
AGE_LABELS = ["18-25","26-35","36-45","46-60","60+"]
AGE_WEIGHTS = [0.20,0.30,0.25,0.15,0.10]
RISK_LABELS = ["low","medium","high"]
RISK_WEIGHTS = [0.40,0.40,0.20]
TOPIC_LABELS = ["phishing","whatsapp","ai_voice","bank","shopping"]

# The draws below consume the generator in a fixed order;
# reordering them would change the generated data.
clusters = rng.integers(1, 6, size=N_USERS)      # upper bound exclusive -> 1..5
digital_lit = rng.integers(1, 6, size=N_USERS)   # upper bound exclusive -> 1..5
age_groups = rng.choice(AGE_LABELS, size=N_USERS, p=AGE_WEIGHTS)
risk_profiles = rng.choice(RISK_LABELS, size=N_USERS, p=RISK_WEIGHTS)
preferred_topic = rng.choice(TOPIC_LABELS, size=N_USERS)  # no weights -> uniform

# One row per user; user_id runs from 1 to N_USERS.
user_columns = {
    "user_id": np.arange(1, N_USERS+1),
    "user_cluster": clusters,
    "digital_literacy": digital_lit,
    "age_group": age_groups,
    "risk_profile": risk_profiles,
    "preferred_topic": preferred_topic,
}
users = pd.DataFrame(user_columns)

# Persist without the index so the CSV holds only the six data columns.
users.to_csv("users.csv", index=False)
users.head()


Unnamed: 0,user_id,user_cluster,digital_literacy,age_group,risk_profile,preferred_topic
0,1,1,5,18-25,high,bank
1,2,4,4,26-35,medium,shopping
2,3,4,3,18-25,high,bank
3,4,3,5,18-25,medium,ai_voice
4,5,3,1,36-45,low,ai_voice


⭐ CEL 2 — MODULES DATA (30 modules, elk met duidelijke features)

In [10]:
N_MODULES = 30

# Module ids run from 1 to N_MODULES.
module_ids = np.arange(1, N_MODULES+1)
topics = ["phishing","whatsapp","ai_voice","bank","shopping"]

# `rng` is the generator already in scope in this notebook; this cell keeps
# consuming it, so the generated values depend on the draws made before it.
modules = pd.DataFrame({
    "module_id": module_ids,
    "scam_type": rng.choice(topics, size=N_MODULES),
    "difficulty": rng.integers(1, 6, size=N_MODULES),        # upper bound exclusive -> 1..5
    "target_literacy": rng.integers(1, 6, size=N_MODULES),   # upper bound exclusive -> 1..5
    # endpoint=True makes the upper bound inclusive so the documented
    # 3-12 minute range is actually reachable (it was 3-11 before).
    "duration_min": rng.integers(3, 12, size=N_MODULES, endpoint=True)  # 3–12 minutes
})

# Persist without the index so the CSV holds only the five data columns.
modules.to_csv("modules.csv", index=False)
modules.head()


Unnamed: 0,module_id,scam_type,difficulty,target_literacy,duration_min
0,1,shopping,5,3,8
1,2,ai_voice,4,1,7
2,3,shopping,5,5,6
3,4,ai_voice,4,1,7
4,5,whatsapp,5,1,11


⭐ CEL 3 — RATINGS GENERATOR (géén duplicaten, maximaal 20.000 interacties)

In [11]:
import pandas as pd
import numpy as np

# Fresh generator with a fixed seed so rating generation is reproducible
# regardless of how much randomness was consumed before this cell.
rng = np.random.default_rng(42)

# Reload the user and module tables from disk.
users = pd.read_csv("users.csv")
modules = pd.read_csv("modules.csv")

# Generation parameters: overall cap and per-user rating count bounds.
TARGET_RATINGS = 20000
MIN_PER_USER = 10
MAX_PER_USER = 30

module_ids = modules["module_id"].values


def _lookup_by_module(column):
    """Return a {module_id: value} dict for the given column of `modules`."""
    return dict(zip(modules["module_id"], modules[column]))


# Plain dicts for fast per-module lookups
mod_topic = _lookup_by_module("scam_type")
mod_diff = _lookup_by_module("difficulty")
mod_tar = _lookup_by_module("target_literacy")

def generate_rating(user, module_id):
    """Simulate one user's rating for one module.

    Parameters
    ----------
    user : mapping-like
        Must support ``user["digital_literacy"]``, ``user["preferred_topic"]``
        and ``user["user_cluster"]`` (e.g. a row yielded by ``DataFrame.iterrows``).
    module_id :
        Key into the module-level ``mod_topic``, ``mod_diff`` and ``mod_tar`` dicts.

    Returns
    -------
    float
        Rating rounded to one decimal and clipped to the range [1, 5].

    Notes
    -----
    Draws exactly one normal sample from the module-level ``rng``.
    """
    literacy = user["digital_literacy"]
    favourite = user["preferred_topic"]
    cluster = user["user_cluster"]

    # Binary match indicators: topic preference and target-literacy fit.
    same_topic = int(mod_topic[module_id] == favourite)
    literacy_gap = abs(literacy - mod_diff[module_id])
    on_target = int(mod_tar[module_id] == literacy)

    # Difficulty effect: bonus for a close match, penalty as the gap widens;
    # any gap of 3 or more falls through to the default penalty.
    gap_effect = {0: 1.0, 1: 0.4, 2: -0.4}.get(literacy_gap, -1.1)

    # Baseline 3.0 plus additive effects; the cluster term is centred on cluster 3.
    noise = rng.normal(0, 0.45)
    raw_score = (
        3.0
        + 0.9*same_topic
        + gap_effect
        + 0.3*on_target
        + 0.15*(cluster - 3)
        + noise
    )

    rounded = round(raw_score, 1)
    return float(np.clip(rounded, 1, 5))


# Build the ratings as a list of rows and create the DataFrame once at the end.
ratings_rows = []
for _, user in users.iterrows():
    # Number of modules this user rates (upper bound made inclusive via +1).
    k = rng.integers(MIN_PER_USER, MAX_PER_USER+1)
    # replace=False: a user never gets the same module twice.
    chosen = rng.choice(module_ids, size=k, replace=False)

    for mid in chosen:
        r = generate_rating(user, mid)
        ratings_rows.append([user.user_id, mid, r])

ratings = pd.DataFrame(ratings_rows, columns=["user_id","module_id","rating"])

# Hard guarantee: no duplicates. This runs BEFORE the downsample so that
# removing a duplicate can never push the result below the target size.
ratings = ratings.drop_duplicates(subset=["user_id","module_id"])

# Downsample to exactly TARGET_RATINGS when more rows were generated.
# NOTE: the sample is uniform over all rows, so individual users can end up
# with fewer than MIN_PER_USER ratings after this step.
if len(ratings) > TARGET_RATINGS:
    ratings = ratings.sample(TARGET_RATINGS, random_state=42)
elif len(ratings) < TARGET_RATINGS:
    # Per-user counts are random, so the total can fall short of the target;
    # report it instead of silently writing a smaller file.
    print("Warning: only", len(ratings), "ratings generated; target was", TARGET_RATINGS)
ratings = ratings.reset_index(drop=True)

ratings.to_csv("ratings.csv", index=False)

print("Ratings shape:", ratings.shape)
print("Duplicates:", ratings.duplicated(subset=["user_id","module_id"]).sum())
ratings.head()


Ratings shape: (20000, 3)
Duplicates: 0


Unnamed: 0,user_id,module_id,rating
0,806,23,2.3
1,273,18,2.3
2,332,2,4.0
3,642,7,5.0
4,258,1,3.5


In [12]:
import pandas as pd

# Reload all three tables from disk so the checks cover the files as written.
users, modules, ratings = (
    pd.read_csv(path) for path in ("users.csv", "modules.csv", "ratings.csv")
)

print("=== SHAPES ===")
for label, frame in (("Users:", users), ("Modules:", modules), ("Ratings:", ratings)):
    print(label, frame.shape)

print("\n=== CHECK UNIQUE IDS ===")
for label, id_column in (
    ("Unique users:", users["user_id"]),
    ("Unique modules:", modules["module_id"]),
):
    print(label, id_column.is_unique)

print("\n=== CHECK DUPLICATES IN RATINGS ===")
# A (user_id, module_id) pair should occur at most once.
pair_key = ["user_id","module_id"]
dups = ratings.duplicated(subset=pair_key).sum()
print("Duplicate (user_id, module_id) pairs:", dups)

print("\n=== CHECK SPARSITY ===")
n_users = users["user_id"].nunique()
n_mods = modules["module_id"].nunique()

# Sparsity = share of the user x module grid that has no rating.
possible = n_users * n_mods
actual = len(ratings)
sparsity = 1 - actual/possible

for label, value in (("Possible:", possible), ("Actual:", actual), ("Sparsity:", sparsity)):
    print(label, value)


=== SHAPES ===
Users: (1000, 6)
Modules: (30, 5)
Ratings: (20000, 3)

=== CHECK UNIQUE IDS ===
Unique users: True
Unique modules: True

=== CHECK DUPLICATES IN RATINGS ===
Duplicate (user_id, module_id) pairs: 0

=== CHECK SPARSITY ===
Possible: 30000
Actual: 20000
Sparsity: 0.33333333333333337
