In [None]:
import pandas as pd
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture

load_dotenv();

In [None]:
# Load every teacher trial, then split the table by experimental phase.
data_trials_Teacher = pd.read_csv("SS_trials_TEACH.csv")

data_trials_Teacher_learning = data_trials_Teacher.loc[data_trials_Teacher["block_type"] == "learning"]
data_trials_Teacher_testing = data_trials_Teacher.loc[data_trials_Teacher["block_type"] == "test"]

# The unsplit frame is deliberately kept: the overall-accuracy computation
# further down still reads `data_trials_Teacher`.

In [None]:
def calculate_accuracy_by_trial_type(df, n=None):
    """Compute mean accuracy per participant and trial type.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial-level data with the columns 'participant_id', 'trial_type_id'
        and 'accuracy'.
    n : int or None, default=None
        When a positive integer (a Python ``int`` or a NumPy integer), only the
        last ``n`` rows of each (participant, trial type) group, in the row
        order of ``df``, are averaged. Any other value means "use every trial".

    Returns
    -------
    pandas.DataFrame
        One row per (participant, trial type) with the columns
        'participant_id', 'trial_type_id', 'accuracy' and 'trial_number'.
        'trial_number' is the total number of trials in the group and is not
        affected by ``n``.
    """
    # NumPy integers (e.g. values read from a config array or a DataFrame) are
    # not instances of `int`; accepting them avoids silently falling back to
    # "all trials" when the caller clearly asked for a window.
    use_last_n = isinstance(n, (int, np.integer)) and n > 0

    rows = []
    for participant_id, participant_trials in df.groupby('participant_id'):
        for trial_type_id, trial_group in participant_trials.groupby('trial_type_id'):
            scored_trials = trial_group.tail(int(n)) if use_last_n else trial_group

            # Accuracy is undefined for an empty selection.
            if scored_trials.empty:
                accuracy = np.nan
            else:
                accuracy = scored_trials['accuracy'].mean()

            rows.append({
                'participant_id': participant_id,
                'trial_type_id': trial_type_id,
                'accuracy': accuracy,
                'trial_number': trial_group.shape[0],
            })

    # Explicit columns keep the schema stable even when there are no groups.
    return pd.DataFrame(
        rows,
        columns=['participant_id', 'trial_type_id', 'accuracy', 'trial_number'],
    )


def calculate_learning_rate(df, window=None, by_trial_type=True):
    """Compute a running accuracy for every trial of every participant.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial-level data with the columns 'participant_id', 'trial_type_id'
        and 'accuracy'. Trials are taken in the row order of ``df``.
    window : int or None, default=None
        If given, accuracy is a rolling mean over the last ``window`` trials
        (shorter windows are allowed at the start of a group). If None, it is
        the expanding mean over all trials seen so far.
    by_trial_type : bool, default=True
        If True, curves are computed separately per (participant, trial type).
        If False, one curve per participant is computed across all trial types
        and 'trial_type_id' is set to 'overall'.

    Returns
    -------
    pandas.DataFrame
        Columns 'participant_id', 'trial_type_id', 'trial_number' (1-based
        position within the group) and 'cumulative_accuracy'.
    """
    # The overall case groups on a scalar key, not a one-element list:
    # pandas >= 2.0 yields 1-tuples as group keys for list groupers, which
    # would put `(participant_id,)` tuples into the output column.
    grouper = ['participant_id', 'trial_type_id'] if by_trial_type else 'participant_id'

    participant_ids = []
    trial_type_ids = []
    trial_numbers = []
    running_accuracies = []

    for group_key, group in df.groupby(grouper):
        if by_trial_type:
            participant_id, trial_type_id = group_key
        else:
            participant_id, trial_type_id = group_key, 'overall'

        accuracy = group['accuracy']
        if window is not None:
            running = accuracy.rolling(window=window, min_periods=1).mean()
        else:
            running = accuracy.expanding().mean()

        n_trials = len(group)
        participant_ids.extend([participant_id] * n_trials)
        trial_type_ids.extend([trial_type_id] * n_trials)
        trial_numbers.extend(range(1, n_trials + 1))
        running_accuracies.extend(running.tolist())

    return pd.DataFrame({
        'participant_id': participant_ids,
        'trial_type_id': trial_type_ids,
        'trial_number': trial_numbers,
        'cumulative_accuracy': running_accuracies,
    })

def plot_learning_curves(df, title='Learning Curves by Trial Type'):
    """Draw one mean learning curve (with standard-error band) per trial type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'trial_type_id', 'trial_number' and 'cumulative_accuracy'.
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title(title)

    # Fixed drawing order: plain, hatched, triangle.
    for trial_type in ('plain', 'hatched', 'triangle'):
        sns.lineplot(
            data=df.loc[df['trial_type_id'] == trial_type],
            x='trial_number',
            y='cumulative_accuracy',
            label=trial_type,
            errorbar='se',
            ax=ax,
        )

    ax.set_xlabel('Trial Number')
    ax.set_ylabel('Cumulative Accuracy')
    ax.set_ylim(0, 1)
    ax.legend()
    fig.tight_layout()
    plt.show()


def plot_accuracy_by_trial_type(df, title='Accuracy by Trial Type', cluster=False):
    """Bar plot of mean accuracy per trial type with individual points overlaid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'trial_type_id' and 'accuracy'; also 'cluster' when
        ``cluster`` is True.
    title : str
        Figure title.
    cluster : bool, default=False
        If True, the overlaid points are coloured by the 'cluster' column and a
        legend is added; otherwise all points are dark grey.
    """
    trial_types = ['plain', 'hatched', 'triangle']
    # Arguments shared by the bar layer and the point layer.
    categorical_axes = dict(data=df, x='trial_type_id', y='accuracy', order=trial_types)
    point_style = dict(jitter=0.2, size=7, edgecolor='white', linewidth=0.5)

    # The figure is opened before the style context; the axes are created
    # inside it by the first pyplot call.
    plt.figure(figsize=(12, 8))

    with plt.style.context('seaborn-v0_8-whitegrid'):
        plt.title(title, fontsize=16, fontweight='bold', pad=15)

        # Bars show the mean with standard-error bars.
        ax = sns.barplot(**categorical_axes, errorbar='se', color='#8BBEE8', alpha=0.6)

        if not cluster:
            sns.stripplot(**categorical_axes, color='#333333', alpha=0.6, **point_style)
        else:
            sns.stripplot(
                **categorical_axes,
                hue='cluster',
                palette={'Level0': '#3A6B9F', 'Level1': '#E3675C', 'Level3': '#4CAF50'},
                alpha=0.7,
                **point_style,
            )
            plt.legend(title='Cluster', title_fontsize=12, fontsize=10, framealpha=0.9,
                       edgecolor='lightgray', loc='upper right')

        # Axis labels, limits and tick sizes.
        plt.ylabel('Accuracy', fontsize=14, labelpad=10)
        plt.xlabel('', fontsize=14, labelpad=10)
        plt.ylim(0, 1.05)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=12)

        plt.grid(axis='y', linestyle='--', alpha=0.3)

        # Capitalised trial-type names on the x axis.
        ax.set_xticklabels([name.capitalize() for name in trial_types], fontsize=16)

        sns.despine()
        plt.tight_layout()
        plt.show()

def plot_accuracy_boxplots(df, title='Accuracy Boxplots by Trial Type', cluster=False):
    """Box plots of accuracy per trial type with individual points overlaid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'trial_type_id' and 'accuracy'; also 'cluster' when
        ``cluster`` is True.
    title : str
        Figure title.
    cluster : bool, default=False
        If True, boxes and points are split and coloured by the 'cluster'
        column and the legend is placed to the right of the axes.
    """
    plt.figure(figsize=(10, 6))
    plt.title(title, fontsize=16, fontweight='medium', pad=15)

    trial_types = ['plain', 'hatched', 'triangle']
    cluster_palette = {'Level0': '#3A6B9F', 'Level1': '#E3675C', 'Level3': '#4CAF50'}

    # Arguments common to every layer, and the styling of each layer type.
    shared = dict(data=df, x='trial_type_id', y='accuracy', order=trial_types)
    box_style = dict(
        width=0.7,
        boxprops=dict(alpha=0.7, edgecolor='black', linewidth=0.8),
        whiskerprops=dict(linewidth=0.8),
        medianprops=dict(color='black', linewidth=1.5),
        capprops=dict(linewidth=0.8),
    )
    point_style = dict(jitter=0.2, size=5, edgecolor='white', linewidth=0.5)

    if cluster:
        sns.boxplot(**shared, hue='cluster', palette=cluster_palette, **box_style)
        # The legend is built from the box layer only; the point layer below
        # is drawn with legend=False.
        plt.legend(title='Cluster', title_fontsize=12, fontsize=10, framealpha=0.9,
                   edgecolor='lightgray', bbox_to_anchor=(1.05, 1), loc='upper left')
        sns.stripplot(**shared, hue='cluster', palette=cluster_palette, dodge=True,
                      alpha=0.7, legend=False, **point_style)
    else:
        sns.boxplot(**shared, color='#8BBEE8', **box_style)
        sns.stripplot(**shared, color='#333333', alpha=0.6, **point_style)

    # Axis labels, limits and tick sizes.
    plt.ylabel('Accuracy', fontsize=14, labelpad=10)
    plt.xlabel('', fontsize=14)
    plt.ylim(0, 1.05)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=10)

    plt.grid(axis='y', linestyle='--', alpha=0.3)

    # Capitalised trial-type names on the x axis.
    ax = plt.gca()
    ax.set_xticklabels([name.capitalize() for name in trial_types], fontsize=12)

    sns.despine()

    # Leave room on the right for the outside legend when clusters are shown.
    if cluster:
        plt.tight_layout(rect=[0, 0, 0.85, 1])
    else:
        plt.tight_layout()

    plt.show()


def plot_accuracy_by_cluster_and_trial_type(df, title='Accuracy by Cluster and Trial Type'):
    """Box plots of accuracy per trial type, split by cluster, with points overlaid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'trial_type_id', 'accuracy' and 'cluster'.
    title : str
        Figure title.
    """
    plt.figure(figsize=(12, 7))
    plt.title(title)

    # Everything the box layer and the point layer have in common.
    layer_args = dict(
        data=df,
        x='trial_type_id',
        y='accuracy',
        hue='cluster',
        order=['plain', 'hatched', 'triangle'],
        palette={'Level0': 'blue', 'Level1': 'red', 'Level3': 'green'},
    )

    sns.boxplot(
        **layer_args,
        width=0.7,
        fliersize=3,
        boxprops=dict(alpha=0.5),
        medianprops=dict(color='black'),
    )

    # Individual observations, dodged to sit on top of their cluster's box.
    sns.stripplot(
        **layer_args,
        dodge=True,
        alpha=0.5,
        jitter=0.2,
        size=3,
        legend=False,
    )

    plt.ylabel('Accuracy')
    plt.ylim(0, 1)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Keep only the first three legend entries.
    handles, labels = plt.gca().get_legend_handles_labels()
    plt.legend(handles[:3], labels[:3], title='Cluster', loc='lower right')

    plt.tight_layout()
    plt.show()

def perform_clustering(df, n_clusters=3, random_state=42):
    """
    Perform k-means clustering on accuracy data and return cluster assignments and centers.

    The three accuracy columns are standardized before fitting. The input
    DataFrame is left untouched; the returned DataFrame is a copy with an
    added (or overwritten) integer 'cluster' column.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing accuracy data with columns ['hatched_acc', 'plain_acc', 'triangle_acc']
    n_clusters : int, default=3
        Number of clusters for k-means
    random_state : int, default=42
        Random state for reproducibility

    Returns:
    --------
    tuple
        (copy of df with a 'cluster' column, cluster centers in original scale,
        one row per cluster with columns ordered hatched, plain, triangle)
    """
    feature_columns = ['hatched_acc', 'plain_acc', 'triangle_acc']

    # Work on a copy so the caller's frame is not modified as a side effect
    # (and so assigning a column never targets a slice of another frame).
    df = df.copy()

    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[feature_columns])

    # Perform k-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    df['cluster'] = kmeans.fit_predict(X_scaled)

    # Centers are learned in standardized space; map them back for reporting.
    cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)

    print("\nCluster Centers (original scale):")
    for i, center in enumerate(cluster_centers):
        print(f"Cluster {i}:")
        print(f"  Hatched accuracy: {center[0]:.3f}")
        print(f"  Plain accuracy: {center[1]:.3f}")
        print(f"  Triangle accuracy: {center[2]:.3f}")

    print("\nNumber of participants in each cluster:")
    print(df['cluster'].value_counts().sort_index())

    return df, cluster_centers




def perform_gmm_clustering(df, n_components=3, random_state=42):
    """
    Perform Gaussian Mixture Model (GMM) clustering on accuracy data and return cluster assignments and means.

    The three accuracy columns are standardized before fitting. The input
    DataFrame is left untouched; the returned DataFrame is a copy with an
    added (or overwritten) integer 'cluster' column. BIC and AIC of the fitted
    model (computed on the standardized data) are printed.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing accuracy data with columns ['hatched_acc', 'plain_acc', 'triangle_acc']
    n_components : int, default=3
        Number of mixture components for GMM
    random_state : int, default=42
        Random state for reproducibility

    Returns:
    --------
    tuple
        (copy of df with a 'cluster' column, component means in original scale,
        one row per component with columns ordered hatched, plain, triangle)
    """
    feature_columns = ['hatched_acc', 'plain_acc', 'triangle_acc']

    # Work on a copy so the caller's frame is not modified as a side effect
    # (and so assigning a column never targets a slice of another frame).
    df = df.copy()

    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[feature_columns])

    # Perform GMM clustering
    gmm = GaussianMixture(n_components=n_components, random_state=random_state)
    df['cluster'] = gmm.fit_predict(X_scaled)

    # Means are learned in standardized space; map them back for reporting.
    component_means = scaler.inverse_transform(gmm.means_)

    print("\nComponent Means (original scale):")
    for i, mean_val in enumerate(component_means):
        print(f"Component {i}:")
        print(f"  Hatched accuracy: {mean_val[0]:.3f}")
        print(f"  Plain accuracy: {mean_val[1]:.3f}")
        print(f"  Triangle accuracy: {mean_val[2]:.3f}")

    # Report BIC and AIC
    bic = gmm.bic(X_scaled)
    aic = gmm.aic(X_scaled)

    print(f"\nBIC: {bic:.3f}")
    print(f"AIC: {aic:.3f}")

    print("\nNumber of participants in each cluster (component):")
    print(df['cluster'].value_counts().sort_index())

    return df, component_means



In [None]:
# Per-participant accuracy for each trial type, scored on the last 10 trials of
# every (participant, trial type) group, for the learning phase, the test phase
# and the full teacher data set respectively.
learning_accuracy_df, testing_accuracy_df, teacher_overall_accuracy_df = (
    calculate_accuracy_by_trial_type(trials, n=10)
    for trials in (
        data_trials_Teacher_learning,
        data_trials_Teacher_testing,
        data_trials_Teacher,
    )
)

In [None]:
# Inspect the long-format teacher learning-phase accuracy table
# (one row per participant and trial type).
learning_accuracy_df

In [None]:
# Reshape the long accuracy tables to one row per participant with one accuracy
# column per trial type. Columns are renamed by name rather than by position so
# that a change in the set or sort order of trial types cannot silently attach
# the wrong label to a column.
learning_accuracy_df_restructured = (
    learning_accuracy_df
    .pivot(index='participant_id', columns='trial_type_id', values='accuracy')
    .reset_index()
    .rename(columns={'hatched': 'hatched_acc', 'plain': 'plain_acc', 'triangle': 'triangle_acc'})
    .rename_axis(columns=None)
)

testing_accuracy_df_restructured = (
    testing_accuracy_df
    .pivot(index='participant_id', columns='trial_type_id', values='accuracy')
    .reset_index()
    .rename(columns={'hatched': 'hatched_acc', 'plain': 'plain_acc', 'triangle': 'triangle_acc'})
    .rename_axis(columns=None)
)

# Preview the restructured learning table. A bare expression is only rendered
# when it is the last statement of the cell, so it lives at the end.
learning_accuracy_df_restructured.head()


In [None]:
# Cluster teachers on their learning-phase accuracies with a 3-component GMM
# and attach each participant's component to the long-format accuracy table.
# (perform_clustering is the k-means counterpart with the same return shape.)
learning_accuracy_df_restructured, cluster_centers = perform_gmm_clustering(
    learning_accuracy_df_restructured,
    n_components=3,
)

# Numeric component id -> named level.
# NOTE(review): this assignment is hard-coded; confirm it still matches the
# component means printed by the fit above.
cluster_mapping = {0: 'Level1', 1: 'Level3', 2: 'Level0'}

learning_acc_with_cluster = pd.merge(
    learning_accuracy_df,
    learning_accuracy_df_restructured[['participant_id', 'cluster']],
    how='left',
    on='participant_id',
)

# Replace the numeric ids with the level names in both tables.
learning_acc_with_cluster['cluster'] = learning_acc_with_cluster['cluster'].map(cluster_mapping)
learning_accuracy_df_restructured['cluster'] = learning_accuracy_df_restructured['cluster'].map(cluster_mapping)


In [None]:
# Add cluster information to the original learning_accuracy_df

# learning_accuracy_df_restructured, cluster_centers = perform_clustering(learning_accuracy_df_restructured, n_clusters=3)
testing_accuracy_df_restructured, cluster_centers = perform_clustering(testing_accuracy_df_restructured, n_clusters=3)
testing_acc_with_cluster = testing_accuracy_df.merge(
    testing_accuracy_df_restructured[['participant_id', 'cluster']],
    on='participant_id',
    how='left'
)

# Create a mapping dictionary for cluster renaming
cluster_mapping = {0: 'Level1', 1: 'Level3', 2: 'Level0'}

# Apply the mapping to rename clusters
testing_acc_with_cluster['cluster'] = testing_acc_with_cluster['cluster'].map(cluster_mapping)
testing_accuracy_df_restructured['cluster'] = testing_accuracy_df_restructured['cluster'].map(cluster_mapping)


In [None]:
# Add cluster information to the original learning_accuracy_df

# learning_accuracy_df_restructured, cluster_centers = perform_clustering(learning_accuracy_df_restructured, n_clusters=3)
learning_accuracy_df_restructured, cluster_centers = perform_gmm_clustering(learning_accuracy_df_restructured, n_components=3)
learning_acc_with_cluster = learning_accuracy_df.merge(
    learning_accuracy_df_restructured[['participant_id', 'cluster']],
    on='participant_id',
    how='left'
)

# Create a mapping dictionary for cluster renaming
cluster_mapping = {0: 'Level1', 1: 'Level3', 2: 'Level0'}

# Apply the mapping to rename clusters
learning_acc_with_cluster['cluster'] = learning_acc_with_cluster['cluster'].map(cluster_mapping)
learning_accuracy_df_restructured['cluster'] = learning_accuracy_df_restructured['cluster'].map(cluster_mapping)


In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

def silhouette_analysis_gmm(data, max_clusters=10, perform_gmm_clustering=None, mode='kmeans'):
    """
    Score clusterings with 2..max_clusters clusters by silhouette and plot the result.

    Parameters:
    data: feature matrix passed both to the clustering step and to silhouette_score
    max_clusters: largest number of clusters to try (inclusive)
    perform_gmm_clustering: optional callable ``f(data, n_clusters)`` whose return
        value is used directly as the label vector; when given, ``mode`` only
        affects the plot title
    mode: built-in algorithm used when no callable is given ('gmm' or 'kmeans',
        case-insensitive)

    Returns:
    (cluster_range, silhouette_scores, best_k) where best_k is the cluster
    count with the highest silhouette score.

    Raises:
    ValueError if no callable is given and mode is neither 'gmm' nor 'kmeans'.
    """
    cluster_range = range(2, max_clusters + 1)
    silhouette_scores = []

    for n_clusters in cluster_range:
        if perform_gmm_clustering is not None:
            # Caller-supplied clustering routine.
            labels = perform_gmm_clustering(data, n_clusters)
        else:
            algorithm = mode.lower()
            if algorithm == 'gmm':
                labels = GaussianMixture(n_components=n_clusters, random_state=42).fit_predict(data)
            elif algorithm == 'kmeans':
                labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(data)
            else:
                raise ValueError("Mode must be either 'gmm' or 'kmeans'")

        score = silhouette_score(data, labels)
        silhouette_scores.append(score)
        print(f"For {n_clusters} clusters, silhouette score = {score:.3f}")

    best_k = cluster_range[np.argmax(silhouette_scores)]
    best_score = max(silhouette_scores)

    # White figure and axes, with only the left and bottom spines drawn.
    fig = plt.figure(figsize=(10, 6), facecolor='white')
    ax = fig.gca()
    ax.set_facecolor('white')
    for hidden_side in ('top', 'right'):
        ax.spines[hidden_side].set_visible(False)
    for visible_side in ('left', 'bottom'):
        ax.spines[visible_side].set_color('#555555')

    # Score curve.
    ax.plot(cluster_range, silhouette_scores, marker='o', markersize=10,
            linewidth=2.5, color='#3498db', markerfacecolor='white',
            markeredgecolor='#3498db', markeredgewidth=2)
    ax.grid(axis='y', linestyle='--', alpha=0.3, color='#cccccc')

    # Mark and annotate the best-scoring cluster count.
    ax.axvline(x=best_k, color='#e74c3c', linestyle='--', alpha=0.6, linewidth=1.5)
    ax.text(best_k + 0.15, best_score - 0.02,
            f'Optimal: {best_k} clusters\n(score: {best_score:.3f})',
            ha='left', va='top', fontsize=11, color='#333333',
            bbox=dict(boxstyle="round,pad=0.5", fc="white", ec="#e74c3c", alpha=0.9))

    # Labels, title and ticks.
    ax.set_xlabel('Number of Clusters', fontsize=12, color='#333333', fontweight='bold')
    ax.set_ylabel('Silhouette Score', fontsize=12, color='#333333', fontweight='bold')
    ax.set_title(f'Silhouette Analysis for Optimal {mode.upper()} Clustering',
                 fontsize=14, color='#333333', fontweight='bold', pad=15)
    ax.tick_params(axis='both', colors='#555555', direction='out', length=6, width=1)
    ax.set_xticks(cluster_range)

    fig.tight_layout()
    plt.show()

    return cluster_range, silhouette_scores, best_k



# Silhouette analysis on the teacher learning-phase accuracies for 2..6
# clusters, using the function's built-in clustering (no custom callable).
# NOTE(review): no `mode` is passed, so this runs the default k-means on the
# raw (unstandardized) accuracy columns, while the learning-phase clusters were
# produced by a GMM on standardized features — confirm this is intended.
cluster_range, scores, best_k = silhouette_analysis_gmm(learning_accuracy_df_restructured[['hatched_acc', 'plain_acc', 'triangle_acc']], max_clusters=6)
# Test-phase variant, kept for reference:
# cluster_range, scores, best_k = silhouette_analysis_gmm(testing_accuracy_df_restructured[['hatched_acc', 'plain_acc', 'triangle_acc']], max_clusters=8, mode='kmeans')


In [None]:
# Inspect the teacher learning-phase accuracies with their named cluster level.
learning_acc_with_cluster

In [None]:
# Mean teacher learning-phase accuracy per trial type, across all participants.
(
    learning_acc_with_cluster
    .groupby('trial_type_id')['accuracy']
    .mean()
    .reset_index()
)

In [None]:
# Teacher accuracy per trial type, learning phase then test phase, without
# colouring the points by cluster.
plot_accuracy_by_trial_type(
    df=learning_acc_with_cluster,
    title='Learning Accuracy - Teacher',
    cluster=False,
)
plot_accuracy_by_trial_type(
    df=testing_accuracy_df,
    title='Test Accuracy - Teacher',
    cluster=False,
)

In [None]:
# Grouped bar chart: mean teacher learning accuracy (+/- SEM) per trial type,
# with one bar per cluster level.
plt.figure(figsize=(12, 8))

cluster_colors = {'Level0': '#3A6B9F', 'Level1': '#E3675C', 'Level3': '#4CAF50'}
trial_type_order = ['plain', 'hatched', 'triangle']

# Mean and standard error of accuracy for every (trial type, cluster) pair,
# sorted so that trial types follow trial_type_order.
trial_type_data = (
    learning_acc_with_cluster
    .groupby(['trial_type_id', 'cluster'])['accuracy']
    .agg(['mean', 'sem'])
    .reset_index()
    .sort_values('trial_type_id', key=lambda x: x.map({t: i for i, t in enumerate(trial_type_order)}))
    .reset_index(drop=True)
)

# x positions of the three bars inside each trial-type group.
bar_width = 0.25
r1 = np.arange(len(trial_type_order))
r2 = [x + bar_width for x in r1]
r3 = [x + bar_width for x in r2]

# Per-cluster statistics, explicitly aligned to trial_type_order. Bars are
# matched to x positions by row order, so each cluster's rows are reindexed:
# a missing (trial type, cluster) pair becomes a NaN bar instead of shifting
# the remaining bars or raising a length mismatch.
level0 = (
    trial_type_data[trial_type_data['cluster'] == 'Level0']
    .set_index('trial_type_id').reindex(trial_type_order).reset_index()
)
level1 = (
    trial_type_data[trial_type_data['cluster'] == 'Level1']
    .set_index('trial_type_id').reindex(trial_type_order).reset_index()
)
level3 = (
    trial_type_data[trial_type_data['cluster'] == 'Level3']
    .set_index('trial_type_id').reindex(trial_type_order).reset_index()
)

# One bar series per cluster, all sharing the same styling.
for positions, cluster_stats, cluster_name in (
    (r1, level0, 'Level0'),
    (r2, level1, 'Level1'),
    (r3, level3, 'Level3'),
):
    plt.bar(positions, cluster_stats['mean'],
        width=bar_width, color=cluster_colors[cluster_name], label=cluster_name,
        yerr=cluster_stats['sem'],
        capsize=5, alpha=0.8, edgecolor='black', linewidth=0.8)

# Titles, axis labels and limits.
plt.title('Teachers Performance by Trial Type and Cluster', fontsize=18, pad=15)
plt.xlabel('Trial Type', fontsize=14, labelpad=10)
plt.ylabel('Performance', fontsize=14, labelpad=10)
plt.ylim(0, 1.05)
plt.grid(axis='y', linestyle='--', alpha=0.3)

# Centre each tick under the middle bar of its group.
plt.xticks([r + bar_width for r in range(len(trial_type_order))],
       [t.capitalize() for t in trial_type_order], fontsize=12)

# Single legend, placed outside the axes on the right.
plt.legend(title='Cluster', fontsize=12, title_fontsize=14,
           bbox_to_anchor=(1.05, 1), loc='upper left', framealpha=0.9)

sns.despine()

plt.tight_layout()
plt.show()

In [None]:
# Inspect the per-trial-type mean and SEM rows for the 'Level0' cluster.
level0

In [None]:
# Inspect the mean/SEM summary for every (trial type, cluster) pair.
trial_type_data

In [None]:
# Same summary table as the previous cell.
# NOTE(review): repeated display — confirm whether this cell is still needed.
trial_type_data

In [None]:
# Same summary table again.
# NOTE(review): repeated display — confirm whether this cell is still needed.
trial_type_data

In [None]:
# Teacher learning-phase accuracy per trial type, split by cluster.
# Box-plot alternative, kept for reference:
# plot_accuracy_boxplots(learning_acc_with_cluster, 'Learning Accuracy by Trial Type Teacher', cluster=True)
# NOTE(review): `plot_accuracy_barplots` is not defined in any cell visible
# above this one; unless it is defined elsewhere, this call raises NameError on
# a fresh kernel. `plot_accuracy_by_trial_type(..., cluster=True)` looks like
# the closest defined bar-plot helper — confirm which one was intended.
plot_accuracy_barplots(learning_acc_with_cluster, 'Learning Accuracy by Trial Type Teacher', cluster=True)


In [None]:
# Teacher test-phase accuracy per trial type, split by cluster.
plot_accuracy_boxplots(
    df=testing_acc_with_cluster,
    title='Testing Accuracy by Trial Type Teacher',
    cluster=True,
)

In [None]:
# Rebuild the wide test-phase table: one row per participant, one accuracy
# column per trial type. Columns are renamed by name rather than by position so
# that a change in the set or sort order of trial types cannot silently attach
# the wrong label to a column.
testing_accuracy_df_restructured = (
    testing_accuracy_df
    .pivot(index='participant_id', columns='trial_type_id', values='accuracy')
    .reset_index()
    .rename(columns={'hatched': 'hatched_acc', 'plain': 'plain_acc', 'triangle': 'triangle_acc'})
    .rename_axis(columns=None)
)

# Preview the restructured table.
testing_accuracy_df_restructured.head()


In [None]:


# Cluster teachers on their test-phase accuracies with k-means (default number
# of clusters) and attach each participant's cluster to the long-format table.
testing_accuracy_df_restructured, cluster_centers = perform_clustering(
    testing_accuracy_df_restructured
)

# Numeric cluster id -> named level.
# NOTE(review): this assignment is hard-coded; confirm it still matches the
# cluster centers printed by the fit above.
cluster_mapping = {0: 'Level1', 1: 'Level3', 2: 'Level0'}

testing_acc_with_cluster = pd.merge(
    testing_accuracy_df,
    testing_accuracy_df_restructured[['participant_id', 'cluster']],
    how='left',
    on='participant_id',
)

# Replace the numeric ids with the level names in both tables.
testing_acc_with_cluster['cluster'] = testing_acc_with_cluster['cluster'].map(cluster_mapping)
testing_accuracy_df_restructured['cluster'] = testing_accuracy_df_restructured['cluster'].map(cluster_mapping)


In [None]:
# Rolling accuracy over a 20-trial window, per participant and trial type,
# for the teacher learning phase; then plot the averaged curves.
learning_rate_df = calculate_learning_rate(
    df=data_trials_Teacher_learning,
    window=20,
)
plot_learning_curves(df=learning_rate_df, title='Learning Curves by Trial Type')

In [None]:
# Load pupil trials and the pupil participant table, keep only trials whose
# participant appears in the participant table, then split by phase.
data_trials_Pupil = pd.read_csv("SS_trials_Pupil.csv")
pupil_participants = pd.read_csv("SS_participants_Pupil.csv")

data_trials_Pupil = data_trials_Pupil.loc[
    data_trials_Pupil["participant_id"].isin(pupil_participants["id"])
]
data_trials_Pupil_learning = data_trials_Pupil.loc[data_trials_Pupil["block_type"] == "learning"]
data_trials_Pupil_testing = data_trials_Pupil.loc[data_trials_Pupil["block_type"] == "test"]

# Only the two phase-specific frames are kept from here on.
del data_trials_Pupil

In [None]:
# Per-pupil accuracy for each trial type, averaged over every trial of the phase.
# NOTE(review): the teacher tables are scored on the last 10 trials (n=10)
# while these use all trials — confirm the difference is intended.
learning_accuracy_df_Pupil = calculate_accuracy_by_trial_type(df=data_trials_Pupil_learning)
testing_accuracy_df_Pupil = calculate_accuracy_by_trial_type(df=data_trials_Pupil_testing)

In [None]:
# Pupil test phase: reshape to one row per pupil, cluster with a 3-component
# GMM, and attach the named cluster level to the long-format accuracy table.

# Columns are renamed by name rather than by position so that a change in the
# set or sort order of trial types cannot silently mislabel a column.
testing_accuracy_df_Pupil_restructured = (
    testing_accuracy_df_Pupil
    .pivot(index='participant_id', columns='trial_type_id', values='accuracy')
    .reset_index()
    .rename(columns={'hatched': 'hatched_acc', 'plain': 'plain_acc', 'triangle': 'triangle_acc'})
    .rename_axis(columns=None)
)

# perform_clustering (k-means) is the drop-in alternative to the GMM used here.
testing_accuracy_df_Pupil_restructured, cluster_centers = perform_gmm_clustering(testing_accuracy_df_Pupil_restructured, n_components=3)

testing_acc_Pupil_with_cluster = testing_accuracy_df_Pupil.merge(
    testing_accuracy_df_Pupil_restructured[['participant_id', 'cluster']],
    on='participant_id',
    how='left'
)

# Numeric component id -> named level.
# NOTE(review): this assignment is hard-coded and differs from the one used for
# the other fits; confirm it matches the component means printed above.
cluster_mapping = {2: 'Level1', 0: 'Level3', 1: 'Level0'}

# Replace the numeric ids with the level names in both tables.
testing_acc_Pupil_with_cluster['cluster'] = testing_acc_Pupil_with_cluster['cluster'].map(cluster_mapping)
testing_accuracy_df_Pupil_restructured['cluster'] = testing_accuracy_df_Pupil_restructured['cluster'].map(cluster_mapping)


In [None]:
# Pupil test-phase accuracy per trial type, split by cluster (box plots).
plot_accuracy_boxplots(
    df=testing_acc_Pupil_with_cluster,
    title='Testing Accuracy by Trial Type Pupil',
    cluster=True,
)

In [None]:
# Pupil learning phase: reshape to one row per pupil, cluster with a
# 3-component GMM, and attach the named cluster level to the long-format table.

# Columns are renamed by name rather than by position so that a change in the
# set or sort order of trial types cannot silently mislabel a column.
learning_accuracy_df_Pupil_restructured = (
    learning_accuracy_df_Pupil
    .pivot(index='participant_id', columns='trial_type_id', values='accuracy')
    .reset_index()
    .rename(columns={'hatched': 'hatched_acc', 'plain': 'plain_acc', 'triangle': 'triangle_acc'})
    .rename_axis(columns=None)
)

# perform_clustering (k-means) is the drop-in alternative to the GMM used here.
learning_accuracy_df_Pupil_restructured, cluster_centers = perform_gmm_clustering(learning_accuracy_df_Pupil_restructured, n_components=3)

learning_acc_Pupil_with_cluster = learning_accuracy_df_Pupil.merge(
    learning_accuracy_df_Pupil_restructured[['participant_id', 'cluster']],
    on='participant_id',
    how='left'
)

# Numeric component id -> named level.
# NOTE(review): this assignment is hard-coded and differs from the one used for
# the other fits; confirm it matches the component means printed above.
cluster_mapping = {1: 'Level1', 0: 'Level3', 2: 'Level0'}

# Replace the numeric ids with the level names in both tables.
learning_acc_Pupil_with_cluster['cluster'] = learning_acc_Pupil_with_cluster['cluster'].map(cluster_mapping)
learning_accuracy_df_Pupil_restructured['cluster'] = learning_accuracy_df_Pupil_restructured['cluster'].map(cluster_mapping)


In [None]:
# Pupil learning-phase accuracy per trial type, split by cluster (box plots).
plot_accuracy_boxplots(
    df=learning_acc_Pupil_with_cluster,
    title='Learning Accuracy by Trial Type Pupil',
    cluster=True,
)


In [None]:
# One row per participant with their cluster level, for pupils (test phase)
# and for teachers (test phase).
pupil_clusters = (
    testing_acc_Pupil_with_cluster.loc[:, ['participant_id', 'cluster']]
    .drop_duplicates()
    .reset_index(drop=True)
)
teacher_clusters = (
    testing_acc_with_cluster.loc[:, ['participant_id', 'cluster']]
    .drop_duplicates()
    .reset_index(drop=True)
)
pupil_teacher_relation = pupil_participants[['id', 'teacher_participant_id']]

# Attach the pupil's own cluster, then the cluster of the pupil's teacher, and
# keep only the pairs for which both clusters are known.
pupil_teacher_clusters = (
    pupil_teacher_relation
    .merge(pupil_clusters, how='left', left_on='id', right_on='participant_id')
    .merge(
        teacher_clusters,
        how='left',
        left_on='teacher_participant_id',
        right_on='participant_id',
        suffixes=('_pupil', '_teacher'),
    )
    .loc[:, ['id', 'teacher_participant_id', 'cluster_pupil', 'cluster_teacher']]
    .dropna(subset=['cluster_pupil', 'cluster_teacher'])
)

# Row-normalised cross-tabulation: for each teacher cluster, the share of its
# pupils that fall in each pupil cluster.
cluster_flow_norm = pd.crosstab(
    index=pupil_teacher_clusters['cluster_teacher'],
    columns=pupil_teacher_clusters['cluster_pupil'],
    normalize='index',
)

# Heatmap of the proportions. The format spec '.001%' has precision 1, so
# cells are rendered as percentages with one decimal place.
plt.figure(figsize=(8, 6))
sns.heatmap(
    cluster_flow_norm,
    annot=True,
    fmt='.001%',
    cmap='Blues',
    annot_kws={'size': 20, 'weight': 'bold'},
    cbar_kws={'label': 'Proportion of Pupils'},
)
plt.xlabel('Pupil Cluster', size=20)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(rotation=0, fontsize=12)
plt.ylabel('Teacher Cluster', size=20)
plt.title('Before Learning', size=18)
plt.show()


In [None]:
# One (participant_id, cluster) row per pupil / per teacher. Pupil clusters come
# from the pupil *learning* table here ("After Learning" figure).
pupil_clusters = (
    learning_acc_Pupil_with_cluster[['participant_id', 'cluster']]
    .drop_duplicates()
    .reset_index(drop=True)
)
teacher_clusters = (
    testing_acc_with_cluster[['participant_id', 'cluster']]
    .drop_duplicates()
    .reset_index(drop=True)
)
pupil_teacher_relation = pupil_participants[['id', 'teacher_participant_id']]

# Pupil -> teacher mapping with both cluster labels attached; pairs where either
# side has no cluster label are discarded.
pupil_teacher_clusters = (
    pupil_teacher_relation
    .merge(pupil_clusters, left_on='id', right_on='participant_id', how='left')
    .merge(
        teacher_clusters,
        left_on='teacher_participant_id',
        right_on='participant_id',
        suffixes=('_pupil', '_teacher'),
        how='left',
    )
    [['id', 'teacher_participant_id', 'cluster_pupil', 'cluster_teacher']]
    .dropna(subset=['cluster_pupil', 'cluster_teacher'])
)

# Row-normalised crosstab: each teacher-cluster row sums to 1.
cluster_flow_norm = pd.crosstab(
    index=pupil_teacher_clusters['cluster_teacher'],
    columns=pupil_teacher_clusters['cluster_pupil'],
    normalize='index',
)

# Heatmap of the proportions, annotated as percentages. The y-label is left
# empty in this figure.
plt.figure(figsize=(8, 6))
sns.heatmap(
    cluster_flow_norm,
    annot=True,
    fmt='.001%',
    cmap='Blues',
    annot_kws={'size': 20, 'weight': 'bold'},
    cbar_kws={'label': 'Proportion of Pupils'},
)
plt.xlabel('Pupil Cluster', size=20)
plt.ylabel('', size=20)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(rotation=0, fontsize=12)
plt.title('After Learning', size=18)
plt.show()


In [None]:
# Rebuild the wide accuracy table (one row per participant, one column per
# trial type) and attach cluster labels to the long-format learning table.
learning_accuracy_df_Pupil_restructured = (
    learning_accuracy_df_Pupil
    .pivot(index='participant_id', columns='trial_type_id', values='accuracy')
    .reset_index()
)

# Positional rename: id column first, then one accuracy column per trial type.
learning_accuracy_df_Pupil_restructured.columns = [
    'participant_id',
    'hatched_acc',
    'plain_acc',
    'triangle_acc',
]

# Cluster with the helper's default settings; this replaces the cluster labels
# produced by any earlier run on the same variables.
learning_accuracy_df_Pupil_restructured, cluster_centers = perform_clustering(
    learning_accuracy_df_Pupil_restructured
)
learning_acc_Pupil_with_cluster = learning_accuracy_df_Pupil.merge(
    learning_accuracy_df_Pupil_restructured[['participant_id', 'cluster']],
    on='participant_id',
    how='left',
)

# Numeric cluster id -> level label.
# NOTE(review): this mapping is hard-coded; presumably the ids returned by
# perform_clustering are stable across runs — confirm, otherwise the labels
# can silently swap.
cluster_mapping = {1: 'Level1', 0: 'Level3', 2: 'Level0'}

# Replace the numeric ids with the level labels in both tables.
for labelled_frame in (learning_acc_Pupil_with_cluster, learning_accuracy_df_Pupil_restructured):
    labelled_frame['cluster'] = labelled_frame['cluster'].map(cluster_mapping)


In [None]:
# Plot pupil learning accuracy with the cluster labels assigned in the previous cell.
# The two commented-out lines are earlier alternatives (recomputing the accuracy table /
# a different plotting helper) kept for reference.
# learning_accuracy_df_Pupil = calculate_accuracy_by_trial_type(data_trials_Pupil_learning)
# plot_accuracy_by_trial_type(learning_acc_Pupil_with_cluster, 'Learning Accuracy by Trial Type Pupil', cluster=True)
plot_accuracy_boxplots(learning_acc_Pupil_with_cluster, 'Learning Accuracy by Trial Type Pupil', cluster=True)

In [None]:

# Plot the same labelled pupil learning table with a helper defined elsewhere in the
# notebook; the second argument is presumably used as a title/label — confirm.
plot_accuracy_by_cluster_and_trial_type(learning_acc_Pupil_with_cluster, 'Accuracy Pupil')

In [None]:
# Learning-rate table for the pupil testing trials, using calculate_learning_rate's defaults.
Pupil_learning_rate_df = calculate_learning_rate(data_trials_Pupil_testing)
# Participant 62 is left out of the plot. NOTE(review): the reason for excluding this id
# is not recorded here — document it or move the id into a config constant.
plot_learning_curves(Pupil_learning_rate_df[Pupil_learning_rate_df.participant_id != 62], 'Testing Type Pupil')

In [None]:
# Learning-rate table for the pupil learning trials with window=15 and by_trial_type=False.
# This reassigns Pupil_learning_rate_df, replacing the testing-trial table from the previous cell.
Pupil_learning_rate_df = calculate_learning_rate(data_trials_Pupil_learning, window=15, by_trial_type=False)
# plot_learning_curves(Pupil_learning_rate_df[Pupil_learning_rate_df.participant_id != 62], 'Learning Curves by Trial Type Pupil')
# Drop participant 62. NOTE(review): the id is compared with the 1-tuple (62,) here but with
# the int 62 in the previous cell — presumably calculate_learning_rate returns tuple ids when
# by_trial_type=False; confirm, otherwise this filter removes nothing.
Pupil_learning_rate_df = Pupil_learning_rate_df[Pupil_learning_rate_df.participant_id != (62,)]

# Same settings for the teacher learning trials; the bare name below displays the whole frame.
teacher_learning_rate_df = calculate_learning_rate(data_trials_Teacher_learning, window=15, by_trial_type=False)
teacher_learning_rate_df


In [None]:

# Display the pupil learning-rate table.
# NOTE(review): the only visible assignment of pupil_learning_rate is in a later cell
# (the teacher/pupil learning-rate comparison); unless an earlier cell also defines it,
# this raises NameError on Restart & Run All — confirm and reorder or remove.
pupil_learning_rate

In [None]:
# Long table: number of learning trials for every (participant, trial type) pair.
trial_counts = (
    data_trials_Pupil_learning
    .groupby(['participant_id', 'trial_type_id'])
    .size()
    .rename('trial_count')
    .reset_index()
)

# Wide table: participants as rows, trial types as columns; combinations that
# never occur are filled with 0.
trial_counts_pivot = (
    trial_counts
    .pivot(index='participant_id', columns='trial_type_id', values='trial_count')
    .fillna(0)
)

trial_counts_pivot

In [None]:
# Display the pupil learning-rate table again.
# NOTE(review): the only visible assignment of pupil_learning_rate is in a later cell;
# unless an earlier cell also defines it, this raises NameError on Restart & Run All — confirm.
pupil_learning_rate

In [None]:
# Remove participant 62 from the pupil learning trials. This overwrites the variable, so every
# cell run afterwards sees the filtered data. NOTE(review): the reason for excluding this id is
# not recorded here.
data_trials_Pupil_learning = data_trials_Pupil_learning[data_trials_Pupil_learning.participant_id != 62]

In [None]:
# Learning-rate tables (window=5, by_trial_type=False) for the teacher learning
# trials and for the pupil learning / testing trials.
teacher_learning_rate = calculate_learning_rate(data_trials_Teacher_learning, window=5, by_trial_type=False)

pupil_learning_rate = calculate_learning_rate(data_trials_Pupil_learning,
                                              window=5, by_trial_type=False)
pupil_test_learning_rate = calculate_learning_rate(data_trials_Pupil_testing,
                                              window=5, by_trial_type=False)

# Add identifier columns (these are the keys of the palette used below)
teacher_learning_rate['type'] = 'Teacher'
pupil_learning_rate['type'] = 'Pupil_w/_feedback'
pupil_test_learning_rate['type'] = 'Pupil_w/o_feedback'

# Restrict the pupil curves to the first 60 (learning) / 30 (testing) trials.
# .copy() makes these independent frames: a later cell assigns a new 'type'
# column on pupil_test_learning_rate, which would otherwise write into a slice
# of the unfiltered frame and trigger pandas' SettingWithCopyWarning.
pupil_learning_rate_filtered = pupil_learning_rate[pupil_learning_rate['trial_number'] <= 60].copy()
pupil_test_learning_rate = pupil_test_learning_rate[pupil_test_learning_rate['trial_number'] <= 30].copy()

# Choose which curve(s) to draw; only the teacher curve is active.
combined_learning_rate = pd.concat([teacher_learning_rate])
# combined_learning_rate = pd.concat([pupil_learning_rate_filtered])
# combined_learning_rate = pd.concat([pupil_test_learning_rate ])

# Create plot
plt.figure(figsize=(6, 6))

# Plot learning curves, one line per 'type' with standard-error bands
sns.lineplot(
    data=combined_learning_rate,
    x='trial_number',
    y='cumulative_accuracy',
    hue='type',
    errorbar='se',
    palette={'Teacher': '#3A6B9F', 'Pupil_w/_feedback': '#E3675C', 'Pupil_w/o_feedback': "#E3675C"},
    linewidth=3
)

# Add horizontal line at 0.5 (chance level)
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7, label='Chance level')

# Enhance the plot appearance (the title is intentionally left empty)
plt.title('', fontsize=18, fontweight='bold', pad=15)
plt.xlabel('Trial Number', fontsize=14)
plt.ylabel('Cumulative Accuracy', fontsize=14)
plt.ylim(0.4, 1.0)
plt.grid(True, alpha=0.3, linestyle='--')

# Improve legend
plt.legend(title='Participant Type', title_fontsize=12, fontsize=12, framealpha=0.9)

# Optional tick relabelling, kept for the pupil variants of this figure:
# Create custom x-ticks that show 30-90 instead of 0-60
# new_ticks = np.arange(30, 91, 10)  # [30, 40, 50, 60, 70, 80, 90]
# plt.xticks(np.arange(0, 61, 10), new_ticks)  #

# new_ticks = np.arange(0, 31, 10)  # [0, 10, 20, 30]
# plt.xticks(np.arange(0, 31, 10), new_ticks)  #

# Remove top and right spines
sns.despine()

plt.tight_layout()
plt.show()

In [None]:
# Rebind combined_learning_rate to (a concatenated copy of) the pupil testing curves and display it.
# This replaces the frame that the previous cell plotted.
combined_learning_rate = pd.concat([pupil_test_learning_rate ])
combined_learning_rate

In [None]:
# Relabel the pupil testing curves in place and display them.
# NOTE(review): this overwrites the 'Pupil_w/o_feedback' label set in the comparison cell,
# and 'Pupil_Test' is not a key of that cell's palette.
pupil_test_learning_rate['type'] = 'Pupil_Test'
pupil_test_learning_rate

In [None]:
# Derive a numeric participant id column. Ids may be plain integers or tuples
# such as (62,), so work on their string form: strip the surrounding
# parentheses, remove commas, then parse. Anything that still is not numeric
# becomes NaN (errors='coerce').
teacher_learning_rate['participant_id_clean'] = pd.to_numeric(
    teacher_learning_rate['participant_id']
    .astype(str)
    .str.strip('()')
    .str.replace(',', ''),
    errors='coerce',
)

In [None]:
# Display the teacher learning-rate table, now including the participant_id_clean column.
teacher_learning_rate

In [None]:
# Attach each teacher's cluster label to their learning-curve rows.
# NOTE(review): teacher_clusters is whatever the most recently executed cell
# assigned (it is built from different tables in different cells) — confirm the
# intended source before relying on this figure.
#
# Rows whose id could not be parsed are dropped first: participant_id_clean may
# contain NaN (it is produced with errors='coerce'), and astype(int) raises on
# NaN. Such rows could not be matched to a cluster anyway.
teacher_rows_with_id = teacher_learning_rate.dropna(subset=['participant_id_clean'])
teacher_learning_with_cluster = teacher_rows_with_id.merge(
    teacher_clusters,
    left_on=teacher_rows_with_id['participant_id_clean'].astype(int),
    right_on='participant_id',
    how='left'
).dropna(subset=['cluster'])  # keep only teachers that have a cluster label

# Create the plot
plt.figure(figsize=(6, 6))

# Define cluster colors
cluster_colors = {'Level0': '#3A6B9F', 'Level1': '#E3675C', 'Level3': '#4CAF50'}

# Plot one learning curve (with standard-error band) per cluster, in a fixed order
for cluster in ['Level0', 'Level1', 'Level3']:
    cluster_data = teacher_learning_with_cluster[teacher_learning_with_cluster['cluster'] == cluster]

    sns.lineplot(
        data=cluster_data,
        x='trial_number',
        y='cumulative_accuracy',
        color=cluster_colors[cluster],
        label=f'{cluster}',
        errorbar='se',
        linewidth=3
    )

# Add horizontal line at chance level
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7, label='Chance level')

# Enhance plot appearance (the title is intentionally left empty)
plt.title('', fontsize=18, fontweight='bold', pad=15)
plt.xlabel('Trial Number', fontsize=14)
plt.ylabel('Cumulative Accuracy', fontsize=14)
plt.ylim(0.4, 1.0)
plt.grid(True, alpha=0.3, linestyle='--')

# Improve legend
# plt.legend(title='Performance Cluster', title_fontsize=12, fontsize=12, framealpha=0.9)

# Remove top and right spines
sns.despine()

plt.tight_layout()
plt.show()

In [None]:
# Learning curves of the pupil testing trials, one line per pupil cluster.
plt.figure(figsize=(6, 6))

# Define color palette for clusters
cluster_colors = {'Level0': '#3A6B9F', 'Level1': '#E3675C', 'Level3': '#4CAF50'}

# Get unique clusters. dropna(): a participant without a cluster label shows up
# as NaN here, and NaN has no entry in cluster_colors (KeyError in the loop).
clusters = testing_acc_Pupil_with_cluster['cluster'].dropna().unique()

# Calculate and draw the learning curve for each cluster
for cluster in clusters:
    # Participants that belong to this cluster
    cluster_participants = testing_acc_Pupil_with_cluster.loc[
        testing_acc_Pupil_with_cluster['cluster'] == cluster, 'participant_id'
    ]

    # Their testing trials
    cluster_data = data_trials_Pupil_testing[
        data_trials_Pupil_testing['participant_id'].isin(cluster_participants)
    ]

    # Calculate learning rate for this cluster
    cluster_learning_rate = calculate_learning_rate(cluster_data, window=10, by_trial_type=False)

    # Keep only the first 30 trials. .copy() so the column assignment below
    # writes to an independent frame instead of a slice.
    cluster_learning_rate = cluster_learning_rate[cluster_learning_rate['trial_number'] <= 30].copy()

    # Add cluster identifier
    cluster_learning_rate['cluster'] = cluster

    # Plot learning curve for this cluster
    sns.lineplot(
        data=cluster_learning_rate,
        x='trial_number',
        y='cumulative_accuracy',
        color=cluster_colors[cluster],
        label=f'{cluster}',
        errorbar='se',
        linewidth=3
    )

# Add horizontal line at chance level
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7, label='Chance level')

# Enhance plot appearance (the title is intentionally left empty)
plt.title('', fontsize=18, fontweight='bold', pad=15)
plt.xlabel('Trial Number', fontsize=14)
plt.ylabel('Cumulative Accuracy', fontsize=14)
plt.ylim(0.4, 1.0)
plt.grid(True, alpha=0.3, linestyle='--')

# Improve legend
# plt.legend(title='Performance Cluster', title_fontsize=12, fontsize=12, framealpha=0.9)

# Remove top and right spines
sns.despine()

plt.tight_layout()
plt.show()

In [None]:
# Learning curves of the pupil learning trials, one line per pupil cluster.
plt.figure(figsize=(6, 6))

# Define color palette for clusters
cluster_colors = {'Level0': '#3A6B9F', 'Level1': '#E3675C', 'Level3': '#4CAF50'}

# Get unique clusters. dropna(): a participant without a cluster label shows up
# as NaN here, and NaN has no entry in cluster_colors (KeyError in the loop).
clusters = learning_acc_Pupil_with_cluster['cluster'].dropna().unique()

# Calculate and draw the learning curve for each cluster
for cluster in clusters:
    # Participants that belong to this cluster
    cluster_participants = learning_acc_Pupil_with_cluster.loc[
        learning_acc_Pupil_with_cluster['cluster'] == cluster, 'participant_id'
    ]

    # Their learning trials
    cluster_data = data_trials_Pupil_learning[
        data_trials_Pupil_learning['participant_id'].isin(cluster_participants)
    ]

    # Calculate learning rate for this cluster
    cluster_learning_rate = calculate_learning_rate(cluster_data, window=10, by_trial_type=False)

    # Keep only the first 60 trials. .copy() so the column assignment below
    # writes to an independent frame instead of a slice.
    cluster_learning_rate = cluster_learning_rate[cluster_learning_rate['trial_number'] <= 60].copy()

    # Add cluster identifier
    cluster_learning_rate['cluster'] = cluster

    # Plot learning curve for this cluster
    sns.lineplot(
        data=cluster_learning_rate,
        x='trial_number',
        y='cumulative_accuracy',
        color=cluster_colors[cluster],
        label=f'{cluster}',
        errorbar='se',
        linewidth=3
    )

# Add horizontal line at chance level
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7, label='Chance level')

# Enhance plot appearance (the title is intentionally left empty)
plt.title('', fontsize=18, fontweight='bold', pad=15)
plt.xlabel('Trial Number', fontsize=14)
plt.ylabel('Cumulative Accuracy', fontsize=14)
plt.ylim(0.4, 1.0)
plt.grid(True, alpha=0.3, linestyle='--')

# Improve legend
# plt.legend(title='Performance Cluster', title_fontsize=12, fontsize=12, framealpha=0.9)

# Remove top and right spines
sns.despine()

plt.tight_layout()
plt.show()

In [None]:
# Tag each learning-rate table (in place) with the group it belongs to.
for rate_frame, group_label in (
    (teacher_learning_rate_df, 'teacher'),
    (Pupil_learning_rate_df, 'pupil'),
):
    rate_frame['type'] = group_label

# Stack both groups into one long table for plotting.
learning_rate_avg = pd.concat([teacher_learning_rate_df, Pupil_learning_rate_df])

# One mean curve per group with a standard-error band, no point markers.
plt.figure(figsize=(10, 6))
sns.lineplot(
    data=learning_rate_avg,
    x='trial_number',
    y='cumulative_accuracy',
    hue='type',
    errorbar='se',
    marker='',
)
plt.title('Learning Curves', fontsize=16)
plt.xlabel('Trial Number', fontsize=14)
plt.ylabel('Average Cumulative Accuracy', fontsize=14)
plt.grid(True, alpha=0.3)
# Legend is anchored outside the axes, to the upper right.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# One (participant_id, cluster) row per pupil / per teacher, both taken from the
# learning-accuracy tables in this cell.
pupil_clusters = (
    learning_acc_Pupil_with_cluster[['participant_id', 'cluster']]
    .drop_duplicates()
    .reset_index(drop=True)
)
teacher_clusters = (
    learning_acc_with_cluster[['participant_id', 'cluster']]
    .drop_duplicates()
    .reset_index(drop=True)
)
pupil_teacher_relation = pupil_participants[['id', 'teacher_participant_id']]

# Pupil -> teacher mapping with both cluster labels attached; pairs where either
# side has no cluster label are discarded.
pupil_teacher_clusters = (
    pupil_teacher_relation
    .merge(pupil_clusters, left_on='id', right_on='participant_id', how='left')
    .merge(
        teacher_clusters,
        left_on='teacher_participant_id',
        right_on='participant_id',
        suffixes=('_pupil', '_teacher'),
        how='left',
    )
    [['id', 'teacher_participant_id', 'cluster_pupil', 'cluster_teacher']]
    .dropna(subset=['cluster_pupil', 'cluster_teacher'])
)

pupil_teacher_clusters

In [None]:
# Count how many pupils of each cluster were taught by teachers of each cluster
# (rows: teacher cluster, columns: pupil cluster).
cluster_flow = pd.crosstab(
    index=pupil_teacher_clusters['cluster_teacher'],
    columns=pupil_teacher_clusters['cluster_pupil'],
)

# Heatmap of the raw counts, annotated as integers.
plt.figure(figsize=(8, 6))
sns.heatmap(cluster_flow, cmap='Blues', annot=True, fmt='d')
plt.title('Teacher Cluster to Pupil Cluster Mapping')
plt.xlabel('Pupil Cluster')
plt.ylabel('Teacher Cluster')
plt.show()


In [None]:
# MinMaxScaler is imported here but not used in this cell.
from sklearn.preprocessing import MinMaxScaler

# Ruleset scores, without the free-text 'teaching_text' column.
rule_scores_df = pd.read_csv("TeachFollowUp_Ruleset_Scores.csv").drop(columns=['teaching_text'])

ruleset_cols = ['Ruleset1', 'Ruleset2', 'Ruleset3', 'Ruleset4']

# Min-max scale the four scores within each row: (score - row min) / (row max - row min).
# A row whose scores are all equal has a zero range and comes out as NaN.
ruleset_scores = rule_scores_df[ruleset_cols]
row_min = ruleset_scores.min(axis=1)
row_range = ruleset_scores.max(axis=1) - row_min
normalized_rules = ruleset_scores.sub(row_min, axis=0).div(row_range, axis=0)

# Store the scaled scores next to the raw ones as '<name>_norm' columns.
rule_scores_df[[f'{col}_norm' for col in ruleset_cols]] = normalized_rules

# Attach the teacher cluster label; rows without a matching teacher keep NaN.
rule_scores_df_cluster = rule_scores_df.merge(teacher_clusters, on='participant_id', how='left')

In [None]:


# Rename the columns so pupil and teacher tables can be joined on explicit keys.
# NOTE: these assignments rename the existing frames in place, so earlier cells
# that expect 'participant_id' / 'cluster' on the restructured tables must be
# re-run from their own pivot step before they work again.
pupil_teacher_relation.columns = ['pupil_id', 'teacher_id']

learning_accuracy_df_Pupil_restructured.columns = ['pupil_id', 'hatched_acc_pupil', 'plain_acc_pupil', 'triangle_acc_pupil', 'cluster_pupil']
learning_accuracy_df_restructured.columns = ['teacher_id', 'hatched_acc_teacher', 'plain_acc_teacher', 'triangle_acc_teacher', 'cluster_teacher']

# Merge pupil and teacher data using pupil_teacher_relation as the mapping table.
# First attach the pupil accuracies to the relation table ...
pupil_teacher_data = pupil_teacher_relation.merge(
    learning_accuracy_df_Pupil_restructured,
    on='pupil_id',
    how='inner'
)

# ... then the accuracies of the matching teacher. Inner joins keep only pairs
# for which both sides are present.
combined_data = pupil_teacher_data.merge(
    learning_accuracy_df_restructured,
    on='teacher_id',
    how='inner'
)

# Show a preview of the combined table. A bare expression in the middle of a
# cell is not displayed, so it has to be printed explicitly.
print("Combined pupil-teacher accuracy data:")
print(combined_data.head())

# Create a figure to plot pupil-teacher accuracy connections
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

# Define trial types (one panel each)
trial_types = ['plain', 'hatched', 'triangle']

# Cluster label -> marker colour, shared by teacher and pupil points
cluster_point_colors = {'Level0': 'blue', 'Level1': 'red', 'Level3': 'green'}

# x-positions of the two columns of points inside each panel
teacher_x, pupil_x = 0.8, 1.2

# Loop through each trial type
for i, trial_type in enumerate(trial_types):
    ax = axes[i]

    # Extract pupil and teacher accuracies for this trial type
    pupil_acc = combined_data[f'{trial_type}_acc_pupil']
    teacher_acc = combined_data[f'{trial_type}_acc_teacher']

    # Teacher accuracies: square markers on the left
    ax.scatter(
        np.ones(len(teacher_acc)) * teacher_x,
        teacher_acc,
        c=combined_data['cluster_teacher'].map(cluster_point_colors),
        label='Teacher',
        alpha=0.7,
        s=80,
        marker='s'
    )

    # Pupil accuracies: round markers on the right
    ax.scatter(
        np.ones(len(pupil_acc)) * pupil_x,
        pupil_acc,
        c=combined_data['cluster_pupil'].map(cluster_point_colors),
        label='Pupil',
        alpha=0.7,
        s=80,
        marker='o'
    )

    # Draw one line per pupil-teacher pair (teacher end first, then pupil end)
    for teacher_value, pupil_value in zip(teacher_acc, pupil_acc):
        ax.plot(
            [teacher_x, pupil_x],
            [teacher_value, pupil_value],
            color='gray',
            alpha=0.3,
            linestyle='-',
            linewidth=1
        )

    # Set title and labels
    ax.set_title(f'{trial_type.capitalize()} Trials', fontsize=16)
    ax.set_xlim(0.5, 1.5)
    ax.set_ylim(0, 1.1)
    ax.set_xticks([teacher_x, pupil_x])
    ax.set_xticklabels(['Teacher', 'Pupil'], fontsize=14)

    # The y-axis is shared, so only the first panel gets a label
    if i == 0:
        ax.set_ylabel('Accuracy', fontsize=14)

    # Add a legend for the cluster colours on the last panel only
    if i == 2:
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='blue', markersize=10, label='Level 0'),
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=10, label='Level 1'),
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='green', markersize=10, label='Level 3')
        ]
        ax.legend(handles=legend_elements, loc='upper right', fontsize=12)

plt.tight_layout()
plt.show()










In [None]:
# Three side-by-side panels (shared y-axis), one per trial type.
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

# Panel order
trial_types = ['plain', 'hatched', 'triangle']

for panel_idx, (ax, trial_type) in enumerate(zip(axes, trial_types)):
    # Pupil accuracies for this trial type: testing table vs learning table
    no_feedback_rows = testing_acc_Pupil_with_cluster[
        testing_acc_Pupil_with_cluster['trial_type_id'] == trial_type
    ]
    feedback_rows = learning_acc_Pupil_with_cluster[
        learning_acc_Pupil_with_cluster['trial_type_id'] == trial_type
    ]

    # Inputs shared by the box plot and the strip plot: a condition label per
    # row, the stacked accuracies and the stacked cluster labels.
    condition_labels = (
        ['Pupil_wo_feedback'] * len(no_feedback_rows)
        + ['Pupil_w_feedback'] * len(feedback_rows)
    )
    accuracy_values = pd.concat([no_feedback_rows['accuracy'], feedback_rows['accuracy']])
    cluster_values = pd.concat([no_feedback_rows['cluster'], feedback_rows['cluster']])

    # Box plot per condition
    sns.boxplot(
        x=condition_labels,
        y=accuracy_values,
        ax=ax,
        palette=['skyblue', 'lightgreen'],
        width=0.5,
    )

    # Individual data points with jitter, coloured by cluster
    sns.stripplot(
        x=condition_labels,
        y=accuracy_values,
        ax=ax,
        size=4,
        alpha=0.6,
        jitter=True,
        hue=cluster_values,
        palette={'Level0': 'blue', 'Level1': 'red', 'Level3': 'green'},
        dodge=False,
    )

    # Titles and axis labels; only the first panel carries the y-label
    ax.set_title(f'{trial_type.capitalize()} Trials', fontsize=14)
    ax.set_xlabel('')
    if panel_idx:
        ax.set_ylabel('')
    else:
        ax.set_ylabel('Accuracy', fontsize=12)

    # Set y-axis limits
    ax.set_ylim(0, 1.05)

    # Drop the per-panel legend everywhere except on the last panel
    panel_legend = ax.get_legend()
    if panel_idx < 2 and panel_legend is not None:
        panel_legend.remove()

# Rebuild the last panel's legend with one entry per label (first handle wins,
# original label order kept) and place it outside the axes.
handles, labels = axes[2].get_legend_handles_labels()
first_handle_by_label = {}
for handle, label in zip(handles, labels):
    first_handle_by_label.setdefault(label, handle)
axes[2].legend(
    list(first_handle_by_label.values()),
    list(first_handle_by_label),
    title='Cluster',
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
)

# Add overall title
fig.suptitle('Comparison of pupil without vs with feedback Accuracy by Trial Type', fontsize=16, y=1.05)

# Adjust spacing between subplots
plt.tight_layout()
plt.show()

In [None]:
# Manual annotation of the teaching explanations, with one flag column per level.
teaching_explanation = pd.read_excel('./Teaching explanation annotation.xlsx')

# Row masks for each explanation level.
# NOTE(review): `~` and `&` are only logical operators on boolean columns — this
# assumes 'Level 1'..'Level 3' are read as bool (not 0/1 integers); confirm.
# NOTE(review): level1_ids does not look at 'Level 3', so a row with Level 1 and
# Level 3 set but Level 2 unset is labelled Level1 — confirm this is intended.
level0_ids = ~teaching_explanation['Level 1'] & ~teaching_explanation['Level 2'] & ~teaching_explanation['Level 3']
level1_ids = teaching_explanation['Level 1'] & ~teaching_explanation['Level 2'] 
level2_ids = teaching_explanation['Level 1'] & teaching_explanation['Level 2'] & ~teaching_explanation['Level 3']
level3_ids = teaching_explanation['Level 1'] & teaching_explanation['Level 2'] & teaching_explanation['Level 3']

# Start from an empty (all-NaN) object column aligned to the frame's index.
# The previous initialiser, pd.Series(len(teaching_explanation)), built a
# one-element Series holding the row count: index 0 was seeded with that
# integer, so a first row matching no mask kept a number as its "cluster", and
# the column started out numeric although it is meant to hold strings.
teaching_explanation['cluster_explanation'] = pd.Series(
    np.nan, index=teaching_explanation.index, dtype=object
)

# Rows that match none of the masks stay NaN.
teaching_explanation.loc[level0_ids, 'cluster_explanation'] = 'Level0'
teaching_explanation.loc[level1_ids, 'cluster_explanation'] = 'Level1'
teaching_explanation.loc[level2_ids, 'cluster_explanation'] = 'Level2'
teaching_explanation.loc[level3_ids, 'cluster_explanation'] = 'Level3'




# teaching_explanation['cluster'] = teaching_explanation['Level 1'] +

In [None]:
# Display the explanation-based level assigned to each participant.
teaching_explanation[['participant_id', 'cluster_explanation']]

In [None]:
# Display the full annotation table, including the derived cluster_explanation column.
teaching_explanation

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hard-coded values (in percent) for a four-panel schematic; each inner list
# holds the bar heights for the P, H and T bars of one panel.
datasets = [
    [50, 50, 50],    # first panel
    [100, 0, 100],   # second panel
    [100, 100, 0],   # third panel
    [100, 100, 100]  # fourth panel
]

labels = ['P', 'H', 'T']
x = np.arange(len(labels))

fig, axes = plt.subplots(1, 4, figsize=(12, 3), sharey=True)

for panel_ax, panel_values in zip(axes, datasets):
    panel_bars = panel_ax.bar(x, panel_values, color='navy')

    # Category ticks and a common 0-110 value axis with ticks every 20
    panel_ax.set_xticks(x)
    panel_ax.set_xticklabels(labels)
    panel_ax.set_ylim(0, 110)
    panel_ax.set_yticks(np.arange(0, 111, 20))

    # Percentage label centred just above each bar
    for rect, height in zip(panel_bars, panel_values):
        label_x = rect.get_x() + rect.get_width()/2
        panel_ax.text(label_x, height + 2, f'{height}%',
                      ha='center', va='bottom', fontsize=9)

    # Light horizontal grid lines
    panel_ax.grid(axis='y', linestyle='--', alpha=0.5)

fig.tight_layout()
plt.show()
