# Introduction
This dataset contains detailed information about mouse social behavior experiments, including tracking data, behavioral annotations, and experimental conditions. The data comes from multiple research labs (AdaptableSnail, BoisterousParrot, CRIM13, CalMS21_supplemental) studying social interactions between mice in various experimental setups.

Key features include:

Mouse characteristics (strain, color, sex, age, ID, condition)

Video metadata (frames per second, duration, resolution)

Arena specifications (size, shape, type)

Body parts tracked and behaviors labeled

Multi-mouse interaction data

The dataset is ideal for behavioral analysis, social interaction classification, and animal behavior prediction tasks.

# Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the dataset

In [None]:
df = pd.read_csv('/kaggle/input/MABe-mouse-behavior-detection/train.csv')

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head()

In [None]:
# Data types and missing values
print("Dataset Info:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum().sort_values(ascending=False).head(20))

In [None]:
# Basic statistics
print("Basic Statistics:")
df.describe(include='all')

# Exploratory Data Analysis

In [None]:
# Distribution of lab sources
plt.figure(figsize=(10, 6))
sns.countplot(data=df, y='lab_id', order=df['lab_id'].value_counts().index)
plt.title('Distribution of Experiments by Lab')
plt.xlabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Mouse strain distribution
plt.figure(figsize=(12, 6))
strain_data = pd.concat([df['mouse1_strain'], df['mouse2_strain']]).dropna()
sns.countplot(data=pd.DataFrame({'strain': strain_data}), y='strain', 
              order=strain_data.value_counts().index)
plt.title('Mouse Strain Distribution')
plt.xlabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Arena type analysis
plt.figure(figsize=(10, 6))
sns.countplot(data=df, y='arena_type', order=df['arena_type'].value_counts().index)
plt.title('Arena Type Distribution')
plt.xlabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Video duration distribution
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(df['video_duration_sec'], bins=50, kde=True)
plt.title('Video Duration Distribution')
plt.xlabel('Duration (seconds)')

plt.subplot(1, 2, 2)
sns.boxplot(y=df['video_duration_sec'])
plt.title('Video Duration Boxplot')
plt.ylabel('Duration (seconds)')

plt.tight_layout()
plt.show()

In [None]:
# Frames per second analysis
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='frames_per_second')
plt.title('Frames Per Second Distribution')
plt.xlabel('FPS')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Tracking method used
plt.figure(figsize=(8, 6))
sns.countplot(data=df, y='tracking_method', order=df['tracking_method'].value_counts().index)
plt.title('Tracking Methods Used')
plt.xlabel('Count')
plt.tight_layout()
plt.show()

# Feature Engineering

In [None]:
# Create new features for analysis
# Number of mice in experiment
def count_mice(row):
    mice_count = 0
    for i in range(1, 5):
        if pd.notna(row[f'mouse{i}_strain']):
            mice_count += 1
    return mice_count

df['mice_count'] = df.apply(count_mice, axis=1)

# Behavioral complexity (number of unique behaviors)
df['behavior_count'] = df['behaviors_labeled'].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)

# Body parts complexity
df['body_parts_count'] = df['body_parts_tracked'].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)

print("New Features Created:")
print(f"Mice Count: {df['mice_count'].value_counts().to_dict()}")
print(f"Behavior Count Stats: Min={df['behavior_count'].min()}, Max={df['behavior_count'].max()}")
print(f"Body Parts Count Stats: Min={df['body_parts_count'].min()}, Max={df['body_parts_count'].max()}")

In [None]:
# Visualize new features
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
sns.countplot(data=df, x='mice_count')
plt.title('Number of Mice per Experiment')

plt.subplot(1, 3, 2)
sns.histplot(df['behavior_count'], bins=30, kde=True)
plt.title('Behavioral Complexity')

plt.subplot(1, 3, 3)
sns.histplot(df['body_parts_count'], bins=30, kde=True)
plt.title('Body Parts Tracking Complexity')

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis for numerical features
numerical_cols = ['frames_per_second', 'video_duration_sec', 'pix_per_cm_approx', 
                  'video_width_pix', 'video_height_pix', 'arena_width_cm', 
                  'arena_height_cm', 'mice_count', 'behavior_count', 'body_parts_count']

plt.figure(figsize=(12, 10))
corr_matrix = df[numerical_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

# Behavioral Analysis

In [None]:
# Analyze most common behaviors
all_behaviors = []
for behaviors in df['behaviors_labeled'].dropna():
    all_behaviors.extend(eval(behaviors))

behavior_df = pd.DataFrame({'behavior': all_behaviors})
behavior_counts = behavior_df['behavior'].value_counts().head(20)

plt.figure(figsize=(12, 8))
sns.barplot(y=behavior_counts.index, x=behavior_counts.values)
plt.title('Top 20 Most Common Behaviors')
plt.xlabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Analyze behavior patterns by lab
lab_behaviors = {}
for lab in df['lab_id'].unique():
    lab_behaviors[lab] = []
    lab_data = df[df['lab_id'] == lab]
    for behaviors in lab_data['behaviors_labeled'].dropna():
        lab_behaviors[lab].extend(eval(behaviors))

# Top behaviors by lab
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for i, (lab, behaviors) in enumerate(lab_behaviors.items()):
    if i < 4:  # Limit to first 4 labs
        behavior_counts = pd.Series(behaviors).value_counts().head(10)
        sns.barplot(y=behavior_counts.index, x=behavior_counts.values, ax=axes[i])
        axes[i].set_title(f'Top Behaviors - {lab}')
        axes[i].set_xlabel('Frequency')

plt.tight_layout()
plt.show()

# Predictive Modeling

In [None]:
# Prepare data for modeling
# Let's predict the lab_id based on experimental setup

# Select features for modeling
feature_columns = [
    'frames_per_second', 'video_duration_sec', 'pix_per_cm_approx',
    'video_width_pix', 'video_height_pix', 'arena_width_cm', 'arena_height_cm',
    'mice_count', 'behavior_count', 'body_parts_count'
]

# Add categorical features after encoding
categorical_cols = ['arena_shape', 'arena_type', 'tracking_method']

# Prepare feature matrix and target
X = df[feature_columns].copy()
y = df['lab_id']

# Handle categorical variables
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(df[col].astype(str))

# Handle missing values
X = X.fillna(X.median())

# Encode target variable
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

print(f"Feature matrix shape: {X.shape}")
print(f"Target classes: {len(np.unique(y_encoded))}")

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Initialize models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# Evaluate models using cross-validation
results = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='f1_weighted')
    results[name] = cv_scores
    print(f"{name} - CV F1 Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

In [None]:
# Visualize cross-validation results
plt.figure(figsize=(10, 6))
model_names = list(results.keys())
cv_means = [np.mean(scores) for scores in results.values()]
cv_stds = [np.std(scores) for scores in results.values()]

sns.barplot(x=model_names, y=cv_means, yerr=cv_stds)
plt.title('Model Comparison (5-Fold Cross Validation)')
plt.ylabel('F1 Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Train the best model (Random Forest)
best_model = RandomForestClassifier(n_estimators=100, random_state=42)
best_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = best_model.predict(X_test_scaled)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

In [None]:
# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le_target.classes_))

In [None]:
# Confusion matrix
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=le_target.classes_, 
            yticklabels=le_target.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
plt.title('Top 10 Most Important Features')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

# Advanced Analysis

In [None]:
# Behavioral complexity by lab
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='lab_id', y='behavior_count')
plt.title('Behavioral Complexity by Lab')
plt.xlabel('Lab')
plt.ylabel('Number of Behaviors')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Experimental duration by lab
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='lab_id', y='video_duration_sec')
plt.title('Experiment Duration by Lab')
plt.xlabel('Lab')
plt.ylabel('Duration (seconds)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Mouse strain analysis across labs
strain_lab_analysis = pd.melt(df, 
                             id_vars=['lab_id'], 
                             value_vars=[f'mouse{i}_strain' for i in range(1, 5)],
                             var_name='mouse_num', 
                             value_name='strain')

plt.figure(figsize=(12, 8))
strain_pivot = strain_lab_analysis.groupby(['lab_id', 'strain']).size().unstack(fill_value=0)
sns.heatmap(strain_pivot, annot=True, fmt='d', cmap='YlOrRd')
plt.title('Mouse Strain Distribution Across Labs')
plt.xlabel('Strain')
plt.ylabel('Lab')
plt.tight_layout()
plt.show()

# Model Optimization

In [None]:
# Hyperparameter tuning for Random Forest
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

In [None]:
# Train optimized model
optimized_model = grid_search.best_estimator_
y_pred_optimized = optimized_model.predict(X_test_scaled)

# Evaluate optimized model
accuracy_opt = accuracy_score(y_test, y_pred_optimized)
f1_opt = f1_score(y_test, y_pred_optimized, average='weighted')

print(f"Optimized Model Performance:")
print(f"Test Accuracy: {accuracy_opt:.4f}")
print(f"Test F1 Score: {f1_opt:.4f}")

# Summary and Conclusions


This comprehensive analysis of the mouse social behavior dataset reveals several key insights:

Dataset Characteristics: The dataset contains diverse experimental setups from multiple labs with varying numbers of mice, behavioral annotations, and tracking methodologies.

Behavioral Patterns: Different labs focus on different types of social interactions, with some specializing in aggression studies and others in more complex social dynamics.

Model Performance: The Random Forest classifier achieved strong performance (F1 score within the target range of 0.55-0.87) in predicting the lab source based on experimental setup features.

Key Predictors: Video duration, number of behaviors tracked, and arena specifications were among the most important features for lab identification.

The dataset provides a rich foundation for studying animal social behavior and developing predictive models for behavioral analysis. The models developed can help identify patterns in experimental design and potentially predict behavioral outcomes based on setup parameters.