# Rescaling After Experiment

This notebook is used to rescale the raw similarity data collected during the experiment to a more interpretable scale (1-7).

## How to use this notebook:
1. Place your participant data Excel files in the `Participantdata` directory
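2. In the configuration cell below, set `use_example_data = False` and set `participant_number` so that `participant_<participant_number>.xlsx` matches your file name (the `participant_file` path is built from these two settings)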
3. Run all cells to rescale the data
4. The rescaled data will be saved to the same directory with a '_rescaled' suffix

Note: This notebook comes with example data in the `ExampleData` directory that you can use to test the functionality.

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration: edit the values below as needed ---
# True  -> load the bundled example data
# False -> load your own participant data
use_example_data = True

# Identifier substituted into the file name: participant_<identifier>.xlsx
participant_number = "example"

# Paths are relative to the notebook. Only the directory depends on the
# example/own-data switch; the file-name pattern is the same in both cases.
data_dir = "ExampleData" if use_example_data else "Participantdata"
participant_file = os.path.join(data_dir, f"participant_{participant_number}.xlsx")

# Load the matrix when the workbook is present, otherwise report the problem.
# Note that `df` is only defined when the file was found.
if os.path.exists(participant_file):
    print(f"Reading data from {participant_file}")
    # The first spreadsheet column holds the row labels
    df = pd.read_excel(participant_file, index_col=0)
    print(f"Data loaded successfully with shape {df.shape}")
else:
    print(f"Error: File {participant_file} not found.")
    print("Please make sure the file exists or set use_example_data=True to use example data.")

## Data Rescaling

The raw data is rescaled from pixel distances to a 1-7 scale where:
- Values below 50 pixels are set to 1 (most similar)
- Values above 570 pixels are set to 7 (least similar)
- Values in between are linearly scaled
- Diagonal elements (self-comparisons) are set to 0

In [None]:
# Keep an untouched copy of the raw values for the comparison plots further down
df_original = df.copy()

# Linearly map the raw values onto the 1-7 scale: 50 -> 1 and 570 -> 7
df = df.apply(lambda x: (x - 50) / (570 - 50) * (7 - 1) + 1)

# Clamp everything that falls outside the 1-7 range
df = df.clip(lower=1, upper=7)

# Zero the diagonal (self-comparisons) only AFTER clipping: clip(lower=1)
# would otherwise lift the zeros straight back up to 1.
# The diagonal is written into an explicit array copy and the frame is rebuilt
# from it, so the result does not depend on `df.values` being a writable view.
scaled_matrix = df.to_numpy(dtype=float, copy=True)
np.fill_diagonal(scaled_matrix, 0)
df = pd.DataFrame(scaled_matrix, index=df.index, columns=df.columns)

print("Data rescaled successfully")

## Sort Data by Video ID

This step sorts the rows and columns based on the video ID numbers for better organization.

In [None]:
# Order columns first, then rows, by the integer that precedes the first "_"
# in each label. Any failure (e.g. a label without a numeric prefix) is
# reported and the notebook carries on with whatever ordering `df` has then.
try:
    ordered_columns = sorted(df.columns, key=lambda label: int(label.split('_')[0]))
    df = df.reindex(columns=ordered_columns)
    ordered_rows = sorted(df.index, key=lambda label: int(label.split('_')[0]))
    df = df.reindex(index=ordered_rows)
    print("Data sorted successfully by video ID")
except Exception as e:
    print(f"Warning: Could not sort data by video ID. Error: {e}")
    print("Continuing with unsorted data...")

## Display the Rescaled Data

This shows a preview of the rescaled similarity matrix.

In [None]:
# Preview the first ten rows of the rescaled matrix (all columns are kept)
df.head(n=10)

## Visualize the Similarity Matrix

This creates a heatmap visualization of the similarity matrix.

In [None]:
# Create the figure the heatmap is drawn into
plt.figure(figsize=(12, 10))

# Render the matrix with square cells; the colour range is pinned to 1-7
heatmap_options = dict(
    cmap="YlGnBu",
    vmin=1,
    vmax=7,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(df, **heatmap_options)

plt.title('Rescaled Similarity Matrix (1-7 scale)')
plt.tight_layout()
plt.show()

## Save the Rescaled Data

This saves the rescaled data to a new Excel file.

In [None]:
# Write the rescaled matrix into the data directory, using the input file
# name with a "_rescaled" suffix
output_name = f"participant_{participant_number}_rescaled.xlsx"
output_file = os.path.join(data_dir, output_name)
df.to_excel(output_file)
print(f"Rescaled data saved to {output_file}")

## Compare Original vs. Rescaled Data

This section shows a comparison between the original and rescaled data distributions.

In [None]:
# Two side-by-side panels: raw values on the left, rescaled values on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
ax1, ax2 = axes

# Left panel: flatten the raw matrix and drop every entry equal to 0
original_values = df_original.to_numpy().ravel()
original_values = original_values[original_values != 0]
sns.histplot(data=original_values, kde=True, ax=ax1)
ax1.set(title='Original Data Distribution', xlabel='Pixel Distance')

# Right panel: same treatment for the rescaled matrix
rescaled_values = df.to_numpy().ravel()
rescaled_values = rescaled_values[rescaled_values != 0]
sns.histplot(data=rescaled_values, kde=True, ax=ax2)
ax2.set(title='Rescaled Data Distribution (1-7)', xlabel='Similarity Score')

plt.tight_layout()
plt.show()

## Batch Processing (Optional)

If you have multiple participant files to process, you can use this section to batch process them all at once.

In [None]:
def rescale_participant_data(file_path):
    """Rescale one participant's Excel file and save the result next to it.

    The matrix is read with its first column as the row index, mapped
    linearly so that 50 -> 1 and 570 -> 7, clipped to the 1-7 range, and its
    diagonal is set to 0. Rows and columns are then sorted (best effort) by
    the integer that precedes the first "_" in each label.

    Args:
        file_path: Path of the Excel file to rescale.

    Returns:
        ``(True, output_path)`` on success, where ``output_path`` is
        ``file_path`` with ``_rescaled`` inserted before the extension, or
        ``(False, error_message)`` if reading, rescaling or writing failed.
    """
    try:
        # Read the data
        df = pd.read_excel(file_path, index_col=0)

        # Linear map onto the 1-7 scale, then clamp out-of-range values
        df = (df - 50) / (570 - 50) * (7 - 1) + 1
        df = df.clip(lower=1, upper=7)

        # Zero the diagonal AFTER clipping; clip(lower=1) would otherwise
        # turn the zeros back into ones. Work on an explicit array copy so
        # the write does not depend on `df.values` being a writable view.
        scaled = df.to_numpy(dtype=float, copy=True)
        np.fill_diagonal(scaled, 0)
        df = pd.DataFrame(scaled, index=df.index, columns=df.columns)

        # Best-effort sort by the numeric prefix of each label; labels that
        # do not follow the "<number>_..." pattern leave the order as it is.
        try:
            df = df.reindex(sorted(df.columns, key=lambda x: int(x.split('_')[0])), axis=1)
            df = df.reindex(sorted(df.index, key=lambda x: int(x.split('_')[0])), axis=0)
        except (AttributeError, TypeError, ValueError):
            pass

        # Insert the suffix before the real extension only. A plain
        # str.replace would touch every ".xlsx" in the path and would return
        # the input path unchanged (overwriting the source file) whenever the
        # extension is spelled differently, e.g. ".XLSX".
        root, ext = os.path.splitext(file_path)
        output_file = f"{root}_rescaled{ext}"
        df.to_excel(output_file)
        return True, output_file
    except Exception as e:
        # Report the failure to the caller instead of aborting a batch run
        return False, str(e)

# Batch-processing template. The code below is wrapped in a triple-quoted
# string, so it does not execute as written. To use it, remove the surrounding
# triple quotes and adjust `data_dir` as needed.
# The loop calls rescale_participant_data() on every '.xlsx' file in the
# directory, skips files whose name already ends with '_rescaled.xlsx', and
# prints one line per file plus a final count of successfully processed files.
"""
# Directory containing participant data files
data_dir = "Participantdata"

# Process all Excel files in the directory
if os.path.exists(data_dir):
    files_processed = 0
    for file in os.listdir(data_dir):
        if file.endswith('.xlsx') and not file.endswith('_rescaled.xlsx'):
            file_path = os.path.join(data_dir, file)
            success, result = rescale_participant_data(file_path)
            if success:
                print(f"Successfully processed {file} -> {os.path.basename(result)}")
                files_processed += 1
            else:
                print(f"Failed to process {file}: {result}")
    
    print(f"\nBatch processing complete. {files_processed} files processed.")
else:
    print(f"Directory {data_dir} not found.")
"""