In [None]:
import random
import matplotlib.pyplot as plt
import numpy as np
from tqdm.contrib import itertools
import pandas as pd

In [None]:
# Skill model used throughout the simulation: each key is a skill category and
# each value lists the skill elements belonging to that category.
# The per-learner and per-task structures built below (competencies, bonuses,
# tasks, deltas) are dicts with these same keys and one list entry per element.
DQL_MODEL = {
    "join": ["inner_join", "outer_join", "self_join"],
    "nesting": ["cte", "correlated_subquery", "uncorrelated_subquery"],
    "predicates": ["basic_operators", "logical_operators", "set_operators"]
}

In [None]:
def rgbeta(n: int, mean: float, var: float, min: float = 0, max: float = 1) -> list[float]:
    """Draw ``n`` samples from a beta distribution rescaled to ``[min, max]``.

    The shape parameters of the underlying beta distribution are derived from
    the requested ``mean`` and ``var`` (method of moments) after mapping both
    onto the unit interval.

    Note: the ``min`` / ``max`` parameters shadow the builtins inside this
    function; the names are kept because callers may pass them by keyword.

    Args:
        n: Number of samples to draw.
        mean: Desired mean of the rescaled distribution; must lie strictly
            between ``min`` and ``max``.
        var: Desired variance; must be positive and smaller than
            ``(mean - min) * (max - mean)``.
        min: Lower bound of the support.
        max: Upper bound of the support.

    Returns:
        A list of ``n`` floats in ``[min, max]``.

    Raises:
        ValueError: If ``mean`` or ``var`` is outside the admissible range.
    """
    dmin = mean - min
    dmax = max - mean

    if dmin <= 0 or dmax <= 0:
        raise ValueError(f"mean must be between min = {min} and max = {max}")

    # A zero variance would divide by zero below and a negative one would
    # produce negative shape parameters, so reject both up front.
    if var <= 0:
        raise ValueError(f"var must be positive, got {var}")

    if var >= dmin * dmax:
        raise ValueError(
            f"var must be less than (mean - min) * (max - mean) = {dmin * dmax}")

    # Mean and variance expressed on the unit interval.
    mx = (mean - min) / (max - min)
    vx = var / (max - min) ** 2

    # Method-of-moments estimates of the beta shape parameters.
    a = ((1 - mx) / vx - 1 / mx) * mx ** 2
    b = a * (1 / mx - 1)

    x = np.random.beta(a, b, n)
    y = (max - min) * x + min

    return y.tolist()


In [None]:
def calc_complexity(frequency, regulation: float, weight=1):
    """Convert an element frequency into a complexity value.

    Computes ``y / (1 + y)`` with ``y = (frequency * weight) ** (1 / regulation)``,
    which is 0 for a frequency of 0 and approaches 1 as the frequency grows.

    Args:
        frequency: How often the element occurs in a task.
        regulation: Exponent control; the weighted frequency is raised to
            ``1 / regulation``.
        weight: Multiplier applied to the frequency before exponentiation.

    Returns:
        The complexity value for the given frequency.
    """
    scaled = (frequency * weight) ** (1 / regulation)
    return scaled / (1 + scaled)

def calc_frequency(complexity, regulation: float, weight=1):
    """Invert ``calc_complexity``: find the frequency that yields ``complexity``.

    ``calc_complexity`` maps a frequency ``f`` to ``y / (1 + y)`` with
    ``y = (f * weight) ** (1 / regulation)``; solving for ``f`` gives
    ``(c / (1 - c)) ** regulation / weight``.

    Values outside ``[0, 1)`` are not rejected: the magnitude of
    ``c / (1 - c)`` is used, so negative or >1 complexities still produce a
    non-negative frequency.

    Args:
        complexity: Target complexity (scalar).
        regulation: The same regulation exponent that is passed to
            ``calc_complexity``.
        weight: The same frequency weight that is passed to
            ``calc_complexity``; must be non-zero.

    Returns:
        The frequency as a float; ``inf`` when ``complexity`` is exactly 1.
    """
    # complexity == 1 is only reached in the limit of an infinite frequency;
    # return that limit instead of dividing by zero.
    if complexity == 1:
        return float("inf")

    odds = abs(complexity / (1 - complexity))
    # Undo the weighting applied by calc_complexity (a no-op for weight == 1).
    return odds ** regulation / weight

In [None]:
def create_random_task(dql_model: dict[str, list[str]]):
    """Build a task with a random frequency for every element of the model.

    Args:
        dql_model: Mapping of category name to the elements in that category.

    Returns:
        A dict with the same keys as ``dql_model``; each value is a list with
        one random integer in ``[0, 7]`` per element of the category.
    """
    task = {}
    for category, elements in dql_model.items():
        task[category] = [random.randint(0, 7) for _ in elements]
    return task


def create_optimal_task(dql_model: dict[str, list[str]], learner_competency: dict[str, list[float]], scaffolding_bonus: dict[str, list[float]], regulation: float):
    """Build the task whose complexity targets competency plus scaffolding bonus.

    For every element the target complexity is the learner's competency plus
    the scaffolding bonus; ``calc_frequency`` converts that target into an
    element frequency, which is then clipped to ``[0, 7]``.

    Args:
        dql_model: Mapping of category name to the elements in that category.
        learner_competency: Per-category list of competencies, one per element.
        scaffolding_bonus: Per-category list of bonuses, one per element.
        regulation: Regulation exponent forwarded to ``calc_frequency``.

    Returns:
        A dict keyed like ``dql_model`` with one clipped frequency per element.
    """
    task = {}
    for category, elements in dql_model.items():
        frequencies = []
        for idx in range(len(elements)):
            target = learner_competency[category][idx] + scaffolding_bonus[category][idx]
            frequencies.append(np.clip(calc_frequency(target, regulation), 0, 7))
        task[category] = frequencies
    return task

In [None]:
def calc_task_complexities(task: dict[str, list[int]], regulation: float):
    """Convert every element frequency of a task into a complexity value.

    Args:
        task: Mapping of category name to the list of element frequencies.
        regulation: Regulation exponent forwarded to
            ``calc_complexity_for_category``.

    Returns:
        A dict with the same keys as ``task`` holding the per-element
        complexities of each category.
    """
    complexities = {}
    for key, category in task.items():
        complexities[key] = calc_complexity_for_category(category, regulation)
    return complexities


def calc_complexity_for_category(category: list[int], regulation: float):
    """Return the complexity of each frequency in one category.

    Args:
        category: Element frequencies of a single category.
        regulation: Regulation exponent forwarded to ``calc_complexity``.

    Returns:
        A list with one complexity value per frequency, in input order.
    """
    complexities = []
    for frequency in category:
        complexities.append(calc_complexity(frequency, regulation))
    return complexities

In [None]:
def create_learner_scaffolded_competence_bonuses(dql_model: dict[str, list[str]], bonusDistribution: tuple[float, float, float, float]):
    """Sample one scaffolding bonus per element of the model.

    Args:
        dql_model: Mapping of category name to the elements in that category.
        bonusDistribution: ``(mean, var, min, max)``; unpacked positionally
            into ``rgbeta`` after the sample count.

    Returns:
        A dict keyed like ``dql_model``; each value is the list returned by
        ``rgbeta`` with one sampled bonus per element of the category.
    """
    return {
        key: rgbeta(len(elements), *bonusDistribution)
        for key, elements in dql_model.items()
    }


def sample_from_snd_vectorized_and_normalize(X: list[float], mean=0, sd=1):
    """Sample ``len(X)`` normal values, min-max normalise them and shift by -0.2.

    Only the length of ``X`` is used. The samples are rescaled so the smallest
    becomes 0 and the largest 1, then 0.2 is subtracted, giving values in
    ``[-0.2, 0.8]``.

    Args:
        X: Sequence whose length determines the number of samples.
        mean: Mean of the normal distribution.
        sd: Standard deviation of the normal distribution.

    Returns:
        A list of ``len(X)`` floats; an empty list when ``X`` is empty.
    """
    if len(X) == 0:
        return []

    samples = np.random.normal(mean, sd, len(X))
    lowest = samples.min()
    span = samples.max() - lowest
    # With a single sample (or sd == 0) every value is identical and the
    # min-max range is zero; dividing by it would yield NaN. Map such values
    # to the lower bound instead.
    scale = span if span > 0 else 1.0
    normalized = (samples - lowest) / scale
    return (normalized - 0.2).tolist()


def create_learner_competencies(dql_model: dict[str, list[str]]):
    """Sample an initial competency for every element of the model.

    Args:
        dql_model: Mapping of category name to the elements in that category.

    Returns:
        A dict keyed like ``dql_model``; each value is the list produced by
        ``sample_from_snd_vectorized_and_normalize`` for that category.
    """
    competencies = {}
    for category, elements in dql_model.items():
        competencies[category] = sample_from_snd_vectorized_and_normalize(elements)
    return competencies


def create_learner_population(learner_count: int, task_count: int, dql_model: dict[str, list[str]], bonusDistribution: tuple[float, float, float, float]):
    """Create initial competencies and per-step scaffolding bonuses for all learners.

    Also prints the mean initial competency per category as a quick sanity
    check (skipped when there are no learners).

    Args:
        learner_count: Number of learners to create.
        task_count: Number of task steps; one bonus set is sampled per step
            and learner.
        dql_model: Mapping of category name to the elements in that category.
        bonusDistribution: ``(mean, var, min, max)`` forwarded to
            ``create_learner_scaffolded_competence_bonuses``.

    Returns:
        A dict with two entries:
        ``"learner_competencies"`` - one competency dict per learner, and
        ``"scaffolding_competence_bonus_per_step_and_learner"`` - a list
        indexed ``[step][learner]`` of bonus dicts.
    """
    # Competencies are sampled before bonuses, matching the original
    # evaluation order so the random stream is consumed identically.
    learner_competencies = [
        create_learner_competencies(dql_model) for _ in range(learner_count)]
    bonuses_per_step = [
        [create_learner_scaffolded_competence_bonuses(dql_model, bonusDistribution)
         for _ in range(learner_count)]
        for _ in range(task_count)]
    population = {
        "learner_competencies": learner_competencies,
        "scaffolding_competence_bonus_per_step_and_learner": bonuses_per_step
    }

    # Without learners there is nothing to average (and dividing by
    # learner_count would fail), so the summary is skipped.
    if learner_count > 0:
        mean_competencies = {}
        for key in dql_model:
            per_learner_means = [
                sum(competency[key]) / len(competency[key])
                for competency in learner_competencies]
            mean_competencies[key] = sum(per_learner_means) / learner_count

        print("Mean initial learner competencies:")
        for key, value in mean_competencies.items():
            print(f"  {key}: {value:.4f}")
    return population

In [None]:


def calculate_delta(learner_competency: dict[str, list[float]], task_complexities: dict[str, list[float]], scaffolding_bonus: dict[str, list[float]], tolerance: float = 0.0) -> dict[str, list[float]]:
    """Compute the competency gain per element for one task.

    An element yields a gain only when the task complexity ``c`` lies above
    the current competency ``k`` but not above ``k + t`` (competency plus
    scaffolding bonus ``t``); the gain is then ``c - k``. Otherwise it is 0.

    Args:
        learner_competency: Per-category competencies, one per element.
        task_complexities: Per-category task complexities, one per element.
        scaffolding_bonus: Per-category scaffolding bonuses, one per element.
        tolerance: Slack added to the upper bound ``k + t``. Defaults to 0.0,
            which reproduces the strict comparison.
            NOTE(review): complexities produced by the
            ``calc_frequency`` -> ``calc_complexity`` round trip of ``k + t``
            can exceed ``k + t`` by floating-point round-off and are then
            treated as out of range; pass a small tolerance if that is not
            intended - confirm against the simulation design.

    Returns:
        A dict keyed like ``learner_competency`` with one gain per element.
    """
    result = {}
    for key, competencies in learner_competency.items():
        gains = []
        for i in range(len(competencies)):
            k = competencies[i]
            c = task_complexities[key][i]
            t = scaffolding_bonus[key][i]
            # Too easy (c <= k) or out of reach even with scaffolding.
            if c <= k or c > k + t + tolerance:
                gains.append(0.0)
            else:
                gains.append(c - k)
        result[key] = gains
    return result


def add_delta_to_competency(competency: dict[str, list[float]], delta: dict[str, list[float]]) -> dict[str, list[float]]:
    """Return a new competency dict with the per-element gains added.

    The inputs are not modified.

    Args:
        competency: Per-category competencies, one per element.
        delta: Per-category gains; must provide an entry for every element
            present in ``competency``.

    Returns:
        A dict keyed like ``competency`` holding ``competency + delta`` for
        each element.
    """
    return {
        key: [value + delta[key][i] for i, value in enumerate(values)]
        for key, values in competency.items()
    }

In [None]:
def simulate_task_adaptation(task_count: int, learner_count: int, regulation: float, bonusDistribution: tuple[float, float, float, float]) -> list[dict[str, list]]:
    """Simulate adaptive task assignment for a population of learners.

    For every step and learner an optimal task is generated from the learner's
    current competency and scaffolding bonus, the resulting competency gain is
    computed, and the learner's competency is updated for the next step.

    Args:
        task_count: Number of task steps per learner.
        learner_count: Number of simulated learners.
        regulation: Regulation exponent used for the frequency/complexity
            conversion.
        bonusDistribution: ``(mean, var, min, max)`` of the scaffolding-bonus
            distribution.

    Returns:
        One log dict per learner with the keys ``"tasks"``,
        ``"competencies"``, ``"scaffolding_bonuses"`` and ``"deltas"``; each
        holds one entry per step. The logged competency is the value *before*
        the step's delta is applied.
    """
    learner_population = create_learner_population(
        learner_count, task_count, DQL_MODEL, bonusDistribution)
    competencies = learner_population["learner_competencies"]
    bonuses = learner_population["scaffolding_competence_bonus_per_step_and_learner"]

    simulation_log = [{
        "tasks": [],
        "competencies": [],
        "scaffolding_bonuses": [],
        "deltas": []
    } for _ in range(learner_count)]

    # `itertools` is the module imported at the top of the notebook
    # (tqdm.contrib.itertools); steps form the outer loop, learners the inner.
    for step, learner in itertools.product(range(task_count), range(learner_count)):
        learner_competency = competencies[learner]
        scaffolding_bonus = bonuses[step][learner]

        task = create_optimal_task(
            DQL_MODEL, learner_competency, scaffolding_bonus, regulation)

        task_complexities = calc_task_complexities(task, regulation)
        delta = calculate_delta(
            learner_competency, task_complexities, scaffolding_bonus)

        # Store the updated competency in the population; it becomes this
        # learner's starting competency at the next step.
        competencies[learner] = add_delta_to_competency(
            learner_competency, delta)

        log = simulation_log[learner]
        log["tasks"].append(task)
        log["competencies"].append(learner_competency)
        log["scaffolding_bonuses"].append(scaffolding_bonus)
        log["deltas"].append(delta)

    return simulation_log

In [None]:
def plot_simulation_log(simulationLog: list[dict[str, list]], learnerId: int, regulation: float):
    """Plot one learner's competency, task complexity and scaffolded reach per step.

    For every category in ``DQL_MODEL`` three series are drawn, each averaged
    over the category's elements: the competency (solid), the task complexity
    (dots) and competency plus scaffolding bonus (dashed). The figure is saved
    as ``sql_task_adaptation_<learnerId>.png`` in the working directory.

    Args:
        simulationLog: Per-learner logs as returned by
            ``simulate_task_adaptation``.
        learnerId: Index of the learner to plot.
        regulation: Regulation exponent used to convert the logged task
            frequencies back into complexities.
    """
    learner_log = simulationLog[learnerId]
    task_count = len(learner_log["tasks"])
    steps = range(task_count)
    plt.figure(figsize=(16, 9))
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    # Distinct names for the category index and the step index; the original
    # reused `i` for both loops.
    for category_index, key in enumerate(DQL_MODEL):
        color = colors[category_index % len(colors)]

        aggregated_competency_values = []
        aggregated_task_values = []
        aggregated_competency_plus_bonus_values = []
        for step in steps:
            # Mean competency over the category's elements.
            competency_values = learner_log["competencies"][step][key]
            aggregated_competency_values.append(
                sum(competency_values) / len(competency_values))

            # Mean task complexity; only this category is converted instead
            # of recomputing every category and discarding the rest.
            task_values = calc_complexity_for_category(
                learner_log["tasks"][step][key], regulation)
            aggregated_task_values.append(sum(task_values) / len(task_values))

            # Mean of competency plus scaffolding bonus.
            scaffolding_bonus_values = learner_log["scaffolding_bonuses"][step][key]
            competency_plus_bonus_values = [
                bonus + competency
                for bonus, competency in zip(scaffolding_bonus_values, competency_values)]
            aggregated_competency_plus_bonus_values.append(
                sum(competency_plus_bonus_values) / len(competency_plus_bonus_values))

        plt.plot(steps, aggregated_competency_values,
                 color=color, label=f'{key} competency')
        plt.plot(steps, aggregated_task_values,
                 '.', color=color, label=f'{key} task')
        plt.plot(steps, aggregated_competency_plus_bonus_values,
                 '--', color=color, label=f'{key} competency + scaffolding bonus')

    plt.ylim(0, 1)
    plt.xlim(0, task_count-1)
    plt.ylabel("Competency")
    plt.xlabel("Step")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(
        f"sql_task_adaptation_{learnerId}.png", dpi=300, bbox_inches="tight")
    
    
def plot_mean_simulation_log(simulationLog: list[dict[str, list]], regulation: float, fileName: str = "sql_task_adaptation_mean.png"):
    """Plot competency, task complexity and scaffolded reach averaged over learners.

    For every category in ``DQL_MODEL`` and every step, the per-learner values
    (each already averaged over the category's elements) are averaged across
    all learners and drawn as three series: competency (solid), task
    complexity (dots) and competency plus scaffolding bonus (dashed).

    The number of steps is taken from the first learner's log, so the log must
    contain at least one learner (an empty log raises ``IndexError``) and
    every learner needs at least that many logged steps.

    Args:
        simulationLog: Per-learner logs as returned by
            ``simulate_task_adaptation``.
        regulation: Regulation exponent used to convert the logged task
            frequencies back into complexities.
        fileName: Path the figure is saved to. The default keeps the previous
            fixed file name; pass a different name per call to avoid
            overwriting earlier plots.
    """
    def mean_of(values):
        # Arithmetic mean of a non-empty sequence.
        return sum(values) / len(values)

    task_count = len(simulationLog[0]["tasks"])
    steps = range(task_count)
    plt.figure(figsize=(16, 9))
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

    for category_index, key in enumerate(DQL_MODEL):
        color = colors[category_index % len(colors)]

        mean_competency_values = []
        mean_task_values = []
        mean_competency_plus_bonus_values = []

        for step in steps:
            # Per-learner aggregates for this step and category.
            all_learners_competency = []
            all_learners_task = []
            all_learners_competency_plus_bonus = []

            for learner_log in simulationLog:
                competency_values = learner_log["competencies"][step][key]
                all_learners_competency.append(mean_of(competency_values))

                # Only this category is converted instead of recomputing
                # every category and discarding the rest.
                task_values = calc_complexity_for_category(
                    learner_log["tasks"][step][key], regulation)
                all_learners_task.append(mean_of(task_values))

                scaffolding_bonus_values = learner_log["scaffolding_bonuses"][step][key]
                competency_plus_bonus_values = [
                    bonus + competency
                    for bonus, competency in zip(scaffolding_bonus_values, competency_values)]
                all_learners_competency_plus_bonus.append(
                    mean_of(competency_plus_bonus_values))

            # Average across learners.
            mean_competency_values.append(mean_of(all_learners_competency))
            mean_task_values.append(mean_of(all_learners_task))
            mean_competency_plus_bonus_values.append(
                mean_of(all_learners_competency_plus_bonus))

        plt.plot(steps, mean_competency_values,
                 color=color, label=f'{key} mean competency')
        plt.plot(steps, mean_task_values,
                 '.', color=color, label=f'{key} mean task')
        plt.plot(steps, mean_competency_plus_bonus_values,
                 '--', color=color, label=f'{key} mean competency + scaffolding bonus')

    plt.ylim(0, 1)
    plt.xlim(0, task_count-1)
    plt.ylabel("Mean Competency")
    plt.xlabel("Step")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(fileName, dpi=300, bbox_inches="tight")

In [None]:
def saveData(simulationLog, fileName):
    """Flatten the simulation log into one row per element attempt and save it as CSV.

    Every (learner, task, element) combination becomes one record with a
    zero-padded ``order_id`` (running task counter across all learners),
    ``user_id`` and ``sequence_id`` (both the 1-based learner index), a
    1-based ``skill_id`` and ``correct`` (1 when the logged delta is positive,
    else 0). The file is written to ``"../<fileName>.csv"``.

    Args:
        simulationLog: Per-learner logs with ``"tasks"`` and ``"deltas"``
            entries, as produced by ``simulate_task_adaptation``.
        fileName: Output file name without directory and extension.
    """
    columns = ['order_id', 'user_id', 'sequence_id', 'skill_id', 'correct']

    # Number the skills consecutively across categories. A running counter
    # keeps the ids unique even when categories differ in length; for
    # equally sized categories it matches the previous
    # `category_index * len(category) + element_index` numbering.
    skills = []
    for category_name, category_elements in DQL_MODEL.items():
        for element_index in range(len(category_elements)):
            skills.append((category_name, element_index, len(skills) + 1))

    records = []
    order_id = 0
    for student_index, learner_log in enumerate(simulationLog):
        user = f'{student_index+1:06d}'
        for task_index in range(len(learner_log["tasks"])):
            # One order_id per task, shared by all of its element records.
            order_id += 1
            task_deltas = learner_log["deltas"][task_index]
            for category_name, element_index, skill_id in skills:
                delta = task_deltas[category_name][element_index]
                records.append({
                    'order_id': f'{order_id:08d}',
                    'user_id': user,
                    'sequence_id': user,
                    'skill_id': skill_id,
                    'correct': 1 if delta > 0 else 0
                })

    # Passing the columns explicitly keeps the header row for an empty log.
    df = pd.DataFrame(records, columns=columns)
    df.to_csv("../" + fileName+'.csv', index=False, sep=',')

In [None]:

# TASK_COUNT = 10
# LEARNER_COUNT = 1000
# SIM_PARAM_COMPLEXITY_CONVERGATION_FACTOR = 0.5
# SIM_PARAM_SCAFFOLDING_BONUS_DISTRIBUTION = (0.1, 0.002, 0, 0.2)

def createSimData(task_count, learner_count, regulation, bonusDistribution):
    """Run one simulation, plot its learner-averaged log and return the log.

    Args:
        task_count: Number of task steps per learner.
        learner_count: Number of simulated learners.
        regulation: Regulation exponent for the simulation and the plot.
        bonusDistribution: Scaffolding-bonus distribution parameters forwarded
            to ``simulate_task_adaptation``.

    Returns:
        The per-learner simulation log from ``simulate_task_adaptation``.
    """
    log = simulate_task_adaptation(
        task_count,
        learner_count,
        regulation,
        bonusDistribution,
    )
    plot_mean_simulation_log(log, regulation)
    return log
    
    
# Each variation is (task_count, learner_count, regulation, bonusDistribution);
# bonusDistribution is the (mean, var, min, max) of the scaffolding bonus.
simVariations = [
    (35, 1000, 0.5, (0.05, 0.002, 0, 0.2)),
    (15, 1000, 0.5, (0.1, 0.002, 0, 0.2)),
    (10, 1000, 0.1, (0.2, 0.002, 0.1, 0.3))
]

# Seed the random generators so that Restart & Run All reproduces the same
# dataset. The simulation samples through np.random; `random` is seeded as
# well because create_random_task relies on it.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Concatenate the per-learner logs of all variations into a single dataset.
# NOTE: every createSimData call saves its plot under the same file name, so
# only the plot of the last variation remains on disk.
dataset = []
for sim in simVariations:
    dataset.extend(createSimData(*sim))

saveData(dataset, 'dataset')
