# Prepare dataset

### Clear Solomon data

Recode all distant (2) and unclear (3) interactions as no interaction (0), so that the outcome is dichotomous (0 = no interaction, 1 = interaction).

In [2]:
import pandas as pd

# Videos whose Solomon codings are dichotomised below
dataset_names = ['DYAD06NF', 'DYAD10NF', 'DYAD11NF', 'DYAD21NF', 'DYAD24NF']

# Raw files are read from, and cleaned files written to, these folders
input_dir = '/Users/ruzenkakaldenbach/Desktop/Solomon_output/'
output_dir = '/Users/ruzenkakaldenbach/Desktop/Solomon_output/'

# The three social-interaction columns that get recoded
interaction_columns = ['si_ry', 'si_by', 'si_rb']

# Handle one video at a time
for dat_name in dataset_names:
    print(f"Processing {dat_name}...")

    # Read the coded interactions for this video
    file_path = f"{input_dir}solomon_{dat_name}.csv"
    df = pd.read_csv(file_path)

    # Collapse `2` (distant) and `3` (unclear) into `0` (no interaction)
    df[interaction_columns] = df[interaction_columns].replace([2, 3], 0)

    # Write the recoded table under a `_dichotomous` suffix
    output_file = f"{output_dir}solomon_{dat_name}_dichotomous.csv"
    df.to_csv(output_file, index=False)
    print(f"Saved processed file to {output_file}")

print("Processing complete.")


Processing DYAD06NF...
Saved processed file to /Users/ruzenkakaldenbach/Desktop//Solomon_output/solomon_DYAD06NF_dichotomous.csv
Processing DYAD10NF...
Saved processed file to /Users/ruzenkakaldenbach/Desktop//Solomon_output/solomon_DYAD10NF_dichotomous.csv
Processing DYAD11NF...
Saved processed file to /Users/ruzenkakaldenbach/Desktop//Solomon_output/solomon_DYAD11NF_dichotomous.csv
Processing DYAD21NF...
Saved processed file to /Users/ruzenkakaldenbach/Desktop//Solomon_output/solomon_DYAD21NF_dichotomous.csv
Processing DYAD24NF...
Saved processed file to /Users/ruzenkakaldenbach/Desktop//Solomon_output/solomon_DYAD24NF_dichotomous.csv
Processing complete.


In [3]:
# Inspect `df` as left behind by the processing loop: it holds the last video in `dataset_names` (DYAD24NF)
df

Unnamed: 0,frame_timestamp,si_ry,si_by,si_rb
0,0.00,0,0,0
1,0.25,0,0,0
2,0.50,0,0,0
3,0.75,0,0,0
4,1.00,0,0,0
...,...,...,...,...
684,171.00,0,0,0
685,171.25,0,0,0
686,171.50,0,0,0
687,171.75,0,0,0


### Create a common dataset for Loopy and Solomon data

The resulting dataset will contain the Loopy data as predictors (distance, angle, facing) and the Solomon data as the outcome (social interaction). Within each video, the three dyads are stacked one below the other (long format, one row per dyad and frame). The tables from all spreadsheets are then stacked one below the other in the same way.

In [14]:
import pandas as pd
import numpy as np

# List of dataset names
dataset_names = ['DYAD06NF', 'DYAD10NF', 'DYAD11NF', 'DYAD21NF', 'DYAD24NF']

# Directories for input and output
solomon_dir = '/Users/ruzenkakaldenbach/Desktop/Solomon_output/'
loopy_dir = '/Users/ruzenkakaldenbach/Desktop/Drive/Loopy_preprocessed_data/'
output_file = '/Users/ruzenkakaldenbach/Desktop/common_dataset.csv'

# Loopy predictor columns per dyad, listed in the same order as `loopy_common_names`
loopy_dyad_columns = [
    ('red-yellow', ['dist_c_ry', 'dist_f_ry', 'deg_ry', 'facing_ry']),
    ('blue-yellow', ['dist_c_by', 'dist_f_by', 'deg_by', 'facing_by']),
    ('red-blue', ['dist_c_rb', 'dist_f_rb', 'deg_rb', 'facing_rb']),
]
loopy_common_names = ['distance_central', 'distance_front', 'angle', 'facing']

# Solomon outcome column per dyad
solomon_dyad_columns = [
    ('red-yellow', 'si_ry'),
    ('blue-yellow', 'si_by'),
    ('red-blue', 'si_rb'),
]

# Collect the per-video tables in a list and concatenate once at the end.
# This avoids growing a DataFrame inside the loop and avoids seeding
# `pd.concat` with an empty DataFrame.
merged_per_video = []

# Loop through each dataset
for dataset in dataset_names:
    print(f"Processing {dataset}...")

    # Load Solomon data
    solomon_file = f'{solomon_dir}solomon_{dataset}_dichotomous.csv'
    solomon_data = pd.read_csv(solomon_file)

    # Load Loopy data
    loopy_file = f'{loopy_dir}Loopy_{dataset}__processed.csv'
    loopy_data = pd.read_csv(loopy_file)

    # Stack the three dyads of the Loopy data (long format).
    # Columns are renamed by name rather than by position, so the mapping
    # stays correct even if the selection changes.
    loopy_parts = []
    for dyad, columns in loopy_dyad_columns:
        dyad_data = (
            loopy_data[columns]
            .rename(columns=dict(zip(columns, loopy_common_names)))
            .assign(
                dyad=dyad,
                frame_timestamp=loopy_data['frame_timestamp'],
                video=dataset,  # Add a column with the dataset name
            )
        )
        loopy_parts.append(dyad_data)
    loopy_data_expanded = pd.concat(loopy_parts, ignore_index=True)

    # Stack the three dyads of the Solomon data in the same way
    solomon_parts = []
    for dyad, column in solomon_dyad_columns:
        dyad_data = (
            solomon_data[['frame_timestamp', column]]
            .rename(columns={column: 'interaction'})
            .assign(dyad=dyad, video=dataset)  # Add a column with the dataset name
        )
        solomon_parts.append(dyad_data)
    solomon_data_expanded = pd.concat(solomon_parts, ignore_index=True)

    # Merge Loopy and Solomon data.
    # NOTE(review): `frame_timestamp` is a float key; an inner join drops any
    # frame whose timestamps do not match exactly in both sources.
    merged_data = pd.merge(loopy_data_expanded, solomon_data_expanded, on=['frame_timestamp', 'dyad', 'video'], how='inner')

    # Make the otherwise silent row loss of the inner join visible
    if len(merged_data) != len(loopy_data_expanded) or len(merged_data) != len(solomon_data_expanded):
        print(
            f"  Note: merge kept {len(merged_data)} rows "
            f"(Loopy: {len(loopy_data_expanded)}, Solomon: {len(solomon_data_expanded)})"
        )

    # Queue this video's table for the common dataset
    merged_per_video.append(merged_data)

# Build the combined dataset in a single concatenation
common_dataset = pd.concat(merged_per_video, ignore_index=True) if merged_per_video else pd.DataFrame()

# Save the combined dataset to a CSV file
common_dataset.to_csv(output_file, index=False)
print(f"Saved combined dataset to {output_file}")

common_dataset


Processing DYAD06NF...
Processing DYAD10NF...
Processing DYAD11NF...
Processing DYAD21NF...
Processing DYAD24NF...
Saved combined dataset to /Users/ruzenkakaldenbach/Desktop/common_dataset.csv


Unnamed: 0,distance_central,distance_front,angle,facing,dyad,frame_timestamp,video,interaction
0,786.034067,780.383917,1.085105,1,red-yellow,0.00,DYAD06NF,0
1,787.961685,784.651162,1.511665,1,red-yellow,0.25,DYAD06NF,0
2,779.495416,774.217968,7.789983,1,red-yellow,0.50,DYAD06NF,0
3,832.415345,764.099601,26.381913,1,red-yellow,0.75,DYAD06NF,0
4,842.576190,763.734086,25.957632,1,red-yellow,1.00,DYAD06NF,0
...,...,...,...,...,...,...,...,...
30943,753.075769,642.652602,65.617127,1,red-blue,171.00,DYAD24NF,0
30944,777.794215,654.928245,56.330383,1,red-blue,171.25,DYAD24NF,0
30945,746.123270,659.720564,39.667990,1,red-blue,171.50,DYAD24NF,0
30946,749.126311,661.904582,22.876373,1,red-blue,171.75,DYAD24NF,0


In [15]:
# Create the new column `distance` as the row-wise mean of `distance_front` and
# `distance_central`, then drop the two source columns.
# The guard keeps this cell safe to re-run: once `distance` exists the source
# columns are already gone, and recomputing would raise a KeyError.
distance_sources = ['distance_front', 'distance_central']
if 'distance' not in common_dataset.columns:
    common_dataset = (
        common_dataset
        .assign(distance=common_dataset[distance_sources].mean(axis=1))
        .drop(columns=distance_sources)
    )

# Move `distance` to the front, where the two source columns used to be
columns_order = ['distance'] + [col for col in common_dataset.columns if col != 'distance']
common_dataset = common_dataset[columns_order]

# Save the updated dataset (read back by the ML section)
common_dataset.to_csv("/Users/ruzenkakaldenbach/Desktop/common_dataset_mean_distance.csv", index=False)

common_dataset

Unnamed: 0,distance,angle,facing,dyad,frame_timestamp,video,interaction
0,783.208992,1.085105,1,red-yellow,0.00,DYAD06NF,0
1,786.306424,1.511665,1,red-yellow,0.25,DYAD06NF,0
2,776.856692,7.789983,1,red-yellow,0.50,DYAD06NF,0
3,798.257473,26.381913,1,red-yellow,0.75,DYAD06NF,0
4,803.155138,25.957632,1,red-yellow,1.00,DYAD06NF,0
...,...,...,...,...,...,...,...
30943,697.864185,65.617127,1,red-blue,171.00,DYAD24NF,0
30944,716.361230,56.330383,1,red-blue,171.25,DYAD24NF,0
30945,702.921917,39.667990,1,red-blue,171.50,DYAD24NF,0
30946,705.515446,22.876373,1,red-blue,171.75,DYAD24NF,0


# Apply ML

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Videos available for the train/test splits below
dataset_names = [
    'DYAD06NF',
    'DYAD10NF',
    'DYAD11NF',
    'DYAD21NF',
    'DYAD24NF',
]

# Read the combined table (with the averaged `distance` column) back from disk
file_path = '/Users/ruzenkakaldenbach/Desktop/common_dataset_mean_distance.csv'
common_dataset = pd.read_csv(file_path)

# Function to balance the dataset by downsampling the larger interaction class to the size of the smaller one
def balance_dataset(data, random_state=42):
    """Return a shuffled copy of `data` with equally many interaction = 0 and interaction = 1 rows.

    The larger of the two classes is randomly downsampled (without
    replacement) to the size of the smaller one. Rows whose `interaction`
    value is neither 0 nor 1 are not included in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with an `interaction` column.
    random_state : int, default 42
        Seed used both for the downsampling and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        Balanced rows in random order with a fresh 0..n-1 index.
    """
    interaction_1 = data[data['interaction'] == 1]
    interaction_0 = data[data['interaction'] == 0]

    # Downsample whichever class is larger. Sampling more rows than a class
    # contains would raise a ValueError, which previously happened whenever
    # interaction = 1 outnumbered interaction = 0.
    if len(interaction_0) >= len(interaction_1):
        interaction_0 = interaction_0.sample(n=len(interaction_1), random_state=random_state)
    else:
        interaction_1 = interaction_1.sample(n=len(interaction_0), random_state=random_state)

    balanced_data = pd.concat([interaction_1, interaction_0], ignore_index=True)
    # Shuffle the combined dataset to mix the rows randomly
    return balanced_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Lists to store accuracies for averaging later
training_accuracies = []
testing_accuracies = []

# Predictors (distance, angle, facing) and outcome (interaction) are the same for every split
predictors = ['distance', 'angle', 'facing']
outcome = 'interaction'


def _report_split_performance(split_name, y_true, y_pred, accuracy):
    """Print the confusion matrix, classification report and accuracy for one split.

    Parameters
    ----------
    split_name : str
        Label used in the printed headings ("Training" or "Test").
    y_true, y_pred : array-like
        True and predicted interaction labels.
    accuracy : float
        Accuracy in percent, as already computed by the caller.

    Returns
    -------
    tuple
        The raw confusion matrix and its labelled DataFrame version.
    """
    print(f"\n{split_name} Set Performance:")
    # `labels=[0, 1]` pins the matrix to 2x2 even if one class is absent from
    # y_true/y_pred; otherwise the labelled DataFrame below would fail to build.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    cm_df = pd.DataFrame(
        cm,
        index=["Actual 0", "Actual 1"],
        columns=["Predicted 0", "Predicted 1"]
    )
    # True Positives (TP): Correctly predicted 1.
    # True Negatives (TN): Correctly predicted 0.
    # False Positives (FP): Predicted 1 when the true value was 0.
    # False Negatives (FN): Predicted 0 when the true value was 1.
    print(f"\nConfusion Matrix ({split_name}):")
    print(cm_df)
    print(f"\nClassification Report ({split_name}):")
    # Precision: Proportion of positive predictions (1) that were correct, TP/(TP+FP)
    # Recall or sensitivity: Proportion of actual positives (1) that were identified, TP/(TP+FN)
    # F1-Score: Harmonic mean of precision and recall, 2x(precision*recall)/(precision+recall)
    # Support: Total number of actual occurrences within the 0 and 1 category
    print(classification_report(y_true, y_pred))
    print(f"\nAccuracy Score ({split_name}): {accuracy:.2f}%")
    return cm, cm_df


# Loop through each possible test video (leave-one-video-out)
for test_video in dataset_names:
    print(f"Processing with test video: {test_video}...")

    # Divide into training and test datasets
    train_videos = [video for video in dataset_names if video != test_video]  # All videos except the current test_video
    train_data = common_dataset[common_dataset['video'].isin(train_videos)]
    test_data = common_dataset[common_dataset['video'] == test_video]

    # Balance training and test datasets
    train_data_balanced = balance_dataset(train_data)
    test_data_balanced = balance_dataset(test_data)

    # Logistic Regression
    X_train = train_data_balanced[predictors]
    y_train = train_data_balanced[outcome]
    X_test = test_data_balanced[predictors]
    y_test = test_data_balanced[outcome]

    # Train the logistic regression model
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the training set: predictions for the rows the model was fitted on
    y_train_pred = model.predict(X_train)
    # accuracy_score compares the true values with the predicted values and
    # returns the proportion of correct predictions (converted to percent here)
    train_accuracy = accuracy_score(y_train, y_train_pred) * 100
    training_accuracies.append(train_accuracy)  # Store for averaging across videos later
    cm_train, cm_train_df = _report_split_performance("Training", y_train, y_train_pred, train_accuracy)

    # Evaluate on the test set (the held-out video)
    y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred) * 100
    testing_accuracies.append(test_accuracy)
    cm_test, cm_test_df = _report_split_performance("Test", y_test, y_test_pred, test_accuracy)

    # Save the balanced datasets for this split
    train_output_path = f'/Users/ruzenkakaldenbach/Desktop/train_dataset_balanced_{test_video}.csv'
    test_output_path = f'/Users/ruzenkakaldenbach/Desktop/test_dataset_balanced_{test_video}.csv'
    train_data_balanced.to_csv(train_output_path, index=False)
    test_data_balanced.to_csv(test_output_path, index=False)

    print(f"Training dataset for test video {test_video} saved to {train_output_path}")
    print(f"Test dataset for test video {test_video} saved to {test_output_path}")

# Calculate average accuracies across all splits
average_train_accuracy = sum(training_accuracies) / len(training_accuracies)
average_test_accuracy = sum(testing_accuracies) / len(testing_accuracies)

print("\nOverall Performance:")
print(f"Average Training Accuracy: {average_train_accuracy:.2f}%")
print(f"Average Testing Accuracy: {average_test_accuracy:.2f}%")
print("Processing complete.")


Processing with test video: DYAD06NF...

Training Set Performance:

Confusion Matrix (Training):
          Predicted 0  Predicted 1
Actual 0         1499          309
Actual 1          159         1649

Classification Report (Training):
              precision    recall  f1-score   support

           0       0.90      0.83      0.86      1808
           1       0.84      0.91      0.88      1808

    accuracy                           0.87      3616
   macro avg       0.87      0.87      0.87      3616
weighted avg       0.87      0.87      0.87      3616


Accuracy Score (Training): 87.06%

Test Set Performance:

Confusion Matrix (Test):
          Predicted 0  Predicted 1
Actual 0          401           21
Actual 1          178          244

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.69      0.95      0.80       422
           1       0.92      0.58      0.71       422

    accuracy                           0.76       84