# In [None]:
import numpy as np
import pandas as pd
import random

def generate_clustered_style_dataset(n=800, seed=None):
    """Simulate a learner dataset with traits, styles and an outcome score.

    Each row is one simulated child. Three continuous traits are drawn from
    clipped normal distributions, three categorical traits are drawn from
    fixed probability tables, and an outcome score is derived from the
    continuous traits plus a bonus for intrinsic motivation and Gaussian
    noise.

    Parameters
    ----------
    n : int, default 800
        Number of rows to generate.
    seed : int or None, default None
        When given, all draws come from a dedicated
        ``np.random.default_rng(seed)`` generator, so the same seed always
        yields the same DataFrame. When ``None``, the global ``np.random``
        state is used (so a prior ``np.random.seed(...)`` still applies).

    Returns
    -------
    pandas.DataFrame
        Columns ``Age``, ``Attention_Span``, ``Emotional_Regulation``,
        ``Motivation_Type``, ``Parental_Involvement``, ``Learning_Style``,
        ``Social_Interaction_Style`` and ``Outcome_Score``. Continuous traits
        are rounded to 4 decimals and the outcome score to 2 decimals.
    """
    # Both the ``np.random`` module and a ``Generator`` expose ``choice`` and
    # ``normal``, so one code path serves the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Age distribution: 5, 6, 7 with specific probabilities
    age = rng.choice([5, 6, 7], size=n, p=[0.35, 0.4, 0.25])

    # Continuous variables (normal distributions)
    attention_span = rng.normal(loc=6.02, scale=1.10, size=n)
    emotional_regulation = rng.normal(loc=6.06, scale=1.21, size=n)
    parental_involvement = rng.normal(loc=6.41, scale=1.45, size=n)

    # Clip to realistic bounds
    attention_span = np.clip(attention_span, 2.5, 9.0)
    emotional_regulation = np.clip(emotional_regulation, 2.0, 9.5)
    parental_involvement = np.clip(parental_involvement, 2.0, 10.0)

    # Categorical variables
    motivation_type = rng.choice(["Intrinsic", "Extrinsic"], size=n, p=[0.52, 0.48])
    learning_style = rng.choice(
        ["Mixed", "Visual", "Kinesthetic", "Auditory"],
        size=n,
        p=[0.6, 0.15, 0.15, 0.10],
    )
    social_interaction_style = rng.choice(
        ["Mixed", "Independent", "Cooperative"],
        size=n,
        p=[0.5, 0.25, 0.25],
    )

    # Outcome score: weighted sum + intrinsic motivation bonus + noise.
    # The predictors are clipped to at most 10, so their raw weighted sum is
    # below 10; clipping that to [40, 100] used to flatten every score to
    # exactly 40. Scaling the weighted sum by 10 puts it on the 0-100 scale
    # that the clip bounds, the 3.5 bonus and the sd-2 noise are expressed in.
    weighted_traits = (
        0.35 * attention_span
        + 0.3 * emotional_regulation
        + 0.25 * parental_involvement
    )
    outcome_score = (
        10.0 * weighted_traits
        + np.where(motivation_type == "Intrinsic", 3.5, 0.0)
        + rng.normal(0, 2.0, size=n)
    )
    outcome_score = np.clip(outcome_score, 40, 100)

    # Assemble DataFrame
    df = pd.DataFrame({
        "Age": age,
        "Attention_Span": np.round(attention_span, 4),
        "Emotional_Regulation": np.round(emotional_regulation, 4),
        "Motivation_Type": motivation_type,
        "Parental_Involvement": np.round(parental_involvement, 4),
        "Learning_Style": learning_style,
        "Social_Interaction_Style": social_interaction_style,
        "Outcome_Score": np.round(outcome_score, 2),
    })

    return df

# Example usage: simulate the default 800 learners, preview the first five
# rows, then save the full table to CSV without the index column.
df_simulated = generate_clustered_style_dataset(800)
print(df_simulated.head(5))
df_simulated.to_csv("Simulated_Learner_Dataset.csv", index=False)
