In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory.
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)

# Print one path per line, in the order os.walk yields them.
for path in kaggle_input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Step 0: Install Required Libraries  

In this step we install all the Python packages needed for this notebook.


In [None]:
pip install pandas numpy matplotlib seaborn scikit-learn torch torchvision


## Step 1: Import Libraries and Load Dataset  

Here we import pandas and read the training metadata CSV file.


In [None]:
import pandas as pd

# Location of the training metadata table inside the Kaggle input directory.
train_csv_path = '/kaggle/input/MABe-mouse-behavior-detection/train.csv'
train_meta = pd.read_csv(train_csv_path)

# Preview the first rows to confirm the file loaded as expected.
train_meta_preview = train_meta.head()
print(train_meta_preview)


## Step 2: Basic Exploration of the Dataset  

Here we check the shape, column names, distribution of labs and look at example behaviors per video.


In [None]:
# Size of the metadata table as (rows, columns).
meta_shape = train_meta.shape
print(meta_shape)

# Names of all columns in the table.
meta_columns = train_meta.columns
print(meta_columns)

# Number of rows (videos) contributed by each lab.
videos_per_lab = train_meta['lab_id'].value_counts()
print(videos_per_lab)

# Raw `behaviors_labeled` entries for the first five rows.
behavior_examples = train_meta['behaviors_labeled'].head(5)
print(behavior_examples)


## Step 3: Parse and Count Behaviors  

In the dataset the `behaviors_labeled` column contains behaviors stored as string representations of Python lists.
To analyze them we:

1. Define a helper function `safe_eval` to safely convert each string into an actual Python list (and turn NaN values into empty lists).
2. Apply this function to create a new column `behaviors_list`.
3. Flatten the behaviors from all videos into a single list, keeping only the text after the last comma of each entry, which is the behavior name.
4. Use `collections.Counter` to count how many times each behavior occurs across the dataset.

Finally, we print the top 10 most common behaviors.


In [None]:
import ast
from collections import Counter
import pandas as pd

def safe_eval(x):
    """Convert one `behaviors_labeled` cell into a Python list.

    Parameters
    ----------
    x : str, list, tuple, None or NaN
        A string holding a Python literal (for example ``'["a", "b"]'``),
        an already-parsed list/tuple, or a missing value.

    Returns
    -------
    list
        The parsed list. Missing values and empty / whitespace-only strings
        yield an empty list; already-parsed sequences are returned as a new list.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-empty string is not a
        valid Python literal.
    """
    # Already parsed (e.g. when applied to a column that was converted before).
    # This must be checked before pd.isna, which returns an array for sequences
    # and would make the `if` below ambiguous.
    if isinstance(x, (list, tuple)):
        return list(x)

    if isinstance(x, str):
        text = x.strip()
        # An empty cell carries no behaviors; literal_eval('') would raise SyntaxError.
        if not text:
            return []
        return ast.literal_eval(text)

    # NaN / None -> no behaviors labeled.
    if pd.isna(x):
        return []

    return ast.literal_eval(x)

# Parse every `behaviors_labeled` cell into a real Python list.
train_meta['behaviors_list'] = train_meta['behaviors_labeled'].apply(safe_eval)

# Gather the behavior name (the text after the last comma) of every entry of every video.
all_behaviors = []
for video_behaviors in train_meta['behaviors_list']:
    for entry in video_behaviors:
        all_behaviors.append(entry.rsplit(',', 1)[-1])

# Tally the occurrences and show the ten most frequent behaviors.
behavior_counts = Counter(all_behaviors)
top_behaviors = behavior_counts.most_common(10)
print(top_behaviors)


## Step 4: Locate and List Tracking Files  

The dataset also contains mouse tracking data stored as `.parquet` files inside multiple lab folders.  
In this step we:

1. Specify the root path to the tracking data.
2. Loop through each lab folder and collect the full paths of all `.parquet` files.
3. Print the total number of tracking files and show the first few paths as a sample.


In [None]:
import os

# Collect the path of every tracking .parquet file, one sub-folder per lab.
tracking_root = '/kaggle/input/MABe-mouse-behavior-detection/train_tracking'
tracking_files = []

# os.listdir returns entries in arbitrary order; sorting makes the resulting list
# (and therefore "the first video" picked later) reproducible between runs.
for lab in sorted(os.listdir(tracking_root)):
    lab_path = os.path.join(tracking_root, lab)
    if not os.path.isdir(lab_path):
        continue  # ignore stray files that sit next to the lab folders
    tracking_files.extend(
        os.path.join(lab_path, file_name)
        for file_name in sorted(os.listdir(lab_path))
        if file_name.endswith('.parquet')
    )

print(f'Total tracking files: {len(tracking_files)}')
print(tracking_files[:5])


## Step 5: Locate and List Annotation Files  

Apart from the tracking data, the dataset also contains **annotation files** (labels) stored as `.parquet` files inside multiple lab folders.  

In this step we:

1. Specify the root path to the annotation data.
2. Loop through each lab folder and collect the full paths of all `.parquet` annotation files.
3. Print the total number of annotation files and display the first few paths as a sample.


In [None]:
# Collect the path of every annotation .parquet file, one sub-folder per lab.
annotation_root = '/kaggle/input/MABe-mouse-behavior-detection/train_annotation'
annotation_files = []

# os.listdir returns entries in arbitrary order; sorting makes the resulting list
# reproducible between runs.
for lab in sorted(os.listdir(annotation_root)):
    lab_path = os.path.join(annotation_root, lab)
    if not os.path.isdir(lab_path):
        continue  # ignore stray files that sit next to the lab folders
    annotation_files.extend(
        os.path.join(lab_path, file_name)
        for file_name in sorted(os.listdir(lab_path))
        if file_name.endswith('.parquet')
    )

print(f'Total annotation files: {len(annotation_files)}')
print(annotation_files[:5])


## Step 6: Load Tracking and Annotation Data for One Video  

To inspect the structure of the data, we pick the **first video** from our lists of tracking and annotation files.

- `pose_data` will hold the mouse pose/position data loaded from the first tracking `.parquet` file.
- `annotations` will hold the labeled behaviors loaded from the corresponding annotation `.parquet` file.

We then print the first few rows of each to see their format.


In [None]:
# Pick the first tracking file that has an annotation file for the SAME video.
# The two file lists are built independently, so their first elements are not
# guaranteed to belong to the same video; pair them by relative path instead.
# NOTE(review): assumes annotations mirror the tracking layout
# (<root>/<lab>/<video>.parquet) - confirm against the dataset.
tracking_file = None
annotation_file = None
for candidate in sorted(tracking_files):
    relative_path = os.path.relpath(candidate, tracking_root)
    matching_annotation = os.path.join(annotation_root, relative_path)
    if os.path.exists(matching_annotation):
        tracking_file, annotation_file = candidate, matching_annotation
        break

if tracking_file is None:
    # No pair found by path: fall back to the first entry of each list, but say
    # so, because the labels may then not belong to the loaded poses.
    print('Warning: no tracking/annotation pair matched by path; using the first file of each list.')
    tracking_file = tracking_files[0]
    annotation_file = annotation_files[0]

print('Tracking file:  ', tracking_file)
print('Annotation file:', annotation_file)

pose_data = pd.read_parquet(tracking_file)
annotations = pd.read_parquet(annotation_file)

print(pose_data.head())
print(annotations.head())


## Step 7: Create Per-Frame Labels from Annotations  

The annotation file gives us start and stop frames for each behavior.  
To work with them easily we:

1. Find the maximum frame index from `pose_data`.
2. Create a NumPy array `frame_labels` filled with the default label `'none'` for all frames.
3. Loop through each row of the `annotations` DataFrame and, for the frame range between `start_frame` and `stop_frame`, assign the corresponding `action` label.
4. Print a small slice of `frame_labels` to check that labels have been filled correctly.


In [None]:
import numpy as np

# Highest frame index present in the pose data; frames are indexed 0..max_frame.
max_frame = pose_data['video_frame'].max()

# One label per frame, defaulting to 'none'.
# dtype=object is essential: np.array(['none'] * n) creates a fixed-width '<U4'
# array, and assigning a longer action name into it silently truncates the name
# to 4 characters (e.g. 'sniff' -> 'snif'), merging distinct behaviors.
frame_labels = np.full(int(max_frame) + 1, 'none', dtype=object)

# Fill labels from annotations; stop_frame is treated as inclusive.
for annotation in annotations.itertuples(index=False):
    first = int(annotation.start_frame)
    last = int(annotation.stop_frame)
    frame_labels[first:last + 1] = annotation.action

print(frame_labels[1750:1770])  # spot-check a slice of the per-frame labels


## Step 8: Normalize Pose Coordinates  

To make the pose data consistent across videos of different sizes, we apply a **simple per-video max scaling** (a simplified form of min-max normalization that assumes the minimum is 0):

- Divide each `x` coordinate by the maximum `x` value in the video.
- Divide each `y` coordinate by the maximum `y` value in the video.

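As long as the coordinates are non-negative, this scales them to the range `[0, 1]`, which is convenient for modeling or visualization. Note that the sequence-building code in Step 9 reads the raw `x` and `y` columns, not these normalized ones.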


In [None]:
# Scale each coordinate by its maximum within this video and store the result
# in a new `<axis>_norm` column.
for axis in ('x', 'y'):
    pose_data[f'{axis}_norm'] = pose_data[axis] / pose_data[axis].max()


## Step 9: Create Sequences and Assign Labels  

To prepare data for sequence-based modeling:

1. Set a `sequence_length` (here 30 frames per sequence).
2. Initialize empty lists `sequences` and `labels`.
3. Check all unique actions in `frame_labels` and create an **action map** to convert action names to integers.
4. Loop through frames in steps of `sequence_length`:
    - Select a window of frames from `pose_data`.
    - Pivot the data to create a matrix with shape `(frames_in_window, bodyparts*2)` containing `x` and `y` coordinates.
    - Skip empty sequences.
    - For labels, take the **majority action** in the window and convert it to integer using `action_map`.
5. Append sequence arrays and their corresponding labels.

Finally, print:
- Total number of sequences
- Shape of the first sequence
- First label


In [None]:
import numpy as np
import pandas as pd

sequence_length = 30   # frames per window
sequences = []         # one (frames_in_window, n_features) array per window
labels = []            # integer majority label per window

# frame_labels (one action name per frame) was created in Step 7.
unique_actions = np.unique(frame_labels)
print("Unique actions in this video:", unique_actions)

# Map every action name to an integer id (ids follow the sorted order of np.unique).
action_map = {act: i for i, act in enumerate(unique_actions)}

max_frame = pose_data['video_frame'].max()

# Pivot the whole video once instead of once per window:
#  * every window then has exactly the same feature columns (per-window pivots
#    drop columns that are entirely NaN inside that window, which yields arrays
#    of different widths that cannot be stacked later);
#  * it avoids re-scanning the full table for every window.
# aggfunc='mean' collapses duplicate (frame, bodypart) rows.
# NOTE(review): if the table holds several animals per frame, this averages
# their coordinates together - confirm whether a per-animal column should be
# part of the pivot.
pose_wide = pose_data.pivot_table(
    index='video_frame',
    columns='bodypart',
    values=['x', 'y'],   # raw coordinates (not x_norm / y_norm)
    aggfunc='mean'
)
# Missing keypoints would otherwise reach the model as NaN and turn the loss
# into NaN; 0 is also the value the Masking layer treats as padding.
pose_wide = pose_wide.fillna(0.0)

# Frames run from 0 to max_frame inclusive, so the last full window starts at
# max_frame + 1 - sequence_length. The previous bound (max_frame - sequence_length,
# exclusive) dropped that last full window.
for start in range(0, int(max_frame) + 2 - sequence_length, sequence_length):
    stop = start + sequence_length

    # .loc slicing is label-based and inclusive on both ends.
    window = pose_wide.loc[start:stop - 1]
    if window.shape[0] == 0:   # no tracked frames in this window
        continue

    sequences.append(window.to_numpy(dtype=float))

    # Majority action in this window, expressed as its integer id.
    window_labels = frame_labels[start:stop]
    label_ints = [action_map.get(a, action_map.get('none', 0)) for a in window_labels]
    labels.append(np.bincount(label_ints).argmax())

print(f"Number of sequences: {len(sequences)}")
if sequences:
    print(f"Shape of first sequence: {sequences[0].shape}")
    print(f"First label: {labels[0]}")
else:
    print("No sequences were created (video shorter than one window?)")


## Step 10: Pad Sequences to Uniform Length  

Since sequences may have different lengths (for example, when some frames inside a window are missing from the tracking data), we need to **pad them** so that all sequences have the same number of frames.

Steps:

1. Find the maximum sequence length among all sequences.
2. Loop through each sequence and pad it with zeros at the end if its length is shorter than the maximum.
3. Convert the list of padded sequences to a NumPy array `X`.
4. Convert labels list to a NumPy array `y`.

After this, `X` has shape `(num_sequences, max_seq_len, num_features)` suitable for feeding into sequence-based models (RNN, LSTM, etc.).


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Longest window (in frames) among all sequences.
max_seq_len = max(s.shape[0] for s in sequences)

# Zero-pad every (frames, features) array at the end of the frames axis so that
# all of them reach max_seq_len; a pad width of 0 leaves the values untouched.
padded_sequences = [
    np.pad(
        s,
        ((0, max_seq_len - s.shape[0]), (0, 0)),
        mode='constant',
        constant_values=0,
    )
    for s in sequences
]

# Stack into model-ready arrays.
X = np.array(padded_sequences)
y = np.array(labels)

print("X shape after padding:", X.shape)
print("y shape:", y.shape)


## Step 11: Split Data into Training and Validation Sets  

Before training a model, we split the data into **training** and **validation** sets:

- Use `train_test_split` from `sklearn.model_selection`.
- `test_size=0.2` → 20% of data for validation.
- `stratify=y` → ensures the label distribution is preserved in both sets.
- `random_state=42` → for reproducibility.

After this, `X_train` and `y_train` will be used for training the model, while `X_val` and `y_val` will be used to evaluate performance during training.


In [None]:
from sklearn.model_selection import train_test_split

# Stratified splitting needs at least two samples of every class; with labels
# taken from a single video, rare behaviors can easily appear in only one window,
# in which case train_test_split(stratify=y) raises a ValueError.
_, class_counts = np.unique(y, return_counts=True)
can_stratify = class_counts.size > 0 and class_counts.min() >= 2
if not can_stratify:
    print("Warning: at least one class has fewer than 2 samples; splitting without stratification.")

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,        # 20% of the windows go to validation
    random_state=42,      # reproducible split
    stratify=y if can_stratify else None,
)

print("Train shapes:", X_train.shape, y_train.shape)
print("Validation shapes:", X_val.shape, y_val.shape)


## Step 12: Define LSTM Model  

We now define a simple **LSTM-based neural network** for sequence classification:

1. **Masking Layer**: `Masking(mask_value=0.)`  
   - Ignores the zero-padded frames in the sequences.

2. **LSTM Layer**: `LSTM(64)`  
   - Processes the sequence and outputs a feature vector of size 64.
   - `return_sequences=False` since we only need the final output for classification.

3. **Dense Layer**: `Dense(64, activation='relu')`  
   - Fully connected layer to learn additional features.

4. **Output Layer**: `Dense(num_classes, activation='softmax')`  
   - Outputs probabilities for each behavior class.

5. **Compile Model**:  
   - Loss: `sparse_categorical_crossentropy` (since labels are integers)  
   - Optimizer: `adam`  
   - Metric: `accuracy`

Finally, we print `model.summary()` to see the architecture and number of parameters.


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking

# Size the output layer from the label encoding, not from the labels that happen
# to occur in y: y holds indices into action_map, and an action that is never the
# majority of any window leaves a gap, so len(np.unique(y)) can be smaller than
# the largest label index and sparse_categorical_crossentropy would then fail.
num_classes = len(action_map)

model = Sequential([
    # Explicit Input layer: (timesteps, features). Passing input_shape to the
    # first layer is deprecated in recent Keras releases.
    tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
    Masking(mask_value=0.),                    # skip timesteps that are entirely zero (padding)
    LSTM(64, return_sequences=False),          # keep only the final hidden state
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')   # one probability per action id
])

model.compile(
    loss='sparse_categorical_crossentropy',    # labels are integer class ids
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()


## Step 13: Train the LSTM Model  

We train the LSTM model using the training data:

- `X_train`, `y_train`: training sequences and labels.
- `validation_data=(X_val, y_val)`: to monitor performance on validation set.
- `epochs=20`: start with a small number of epochs; can increase later depending on GPU/memory resources.
- `batch_size=32`: number of sequences processed in one iteration.

The training history is stored in the `history` object, which can later be used to plot loss and accuracy curves.


In [None]:
# Start with a small number of epochs; increase it later if GPU resources allow.
n_epochs = 20
n_per_batch = 32

# Fit on the training windows and track the validation metrics after every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=n_epochs,
    batch_size=n_per_batch,
)



## Step 14: Plot Training and Validation Accuracy  

After training, we can visualize how the model performed over epochs:

- `history.history['accuracy']` → training accuracy per epoch.
- `history.history['val_accuracy']` → validation accuracy per epoch.
- Plotting both curves helps to check for **overfitting** or **underfitting**.


In [None]:
import matplotlib.pyplot as plt

# Draw the training and validation accuracy curves on one set of axes.
fig, ax = plt.subplots()
for metric_key, curve_label in (('accuracy', 'train_acc'), ('val_accuracy', 'val_acc')):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()


## Step 15: Evaluate Model on Validation Set  

After training, we evaluate the model on the **validation data**:

- `model.evaluate(X_val, y_val)` returns the loss and accuracy on the validation set.
- This gives a final measure of how well the model generalizes to unseen data.


In [None]:
# model.evaluate returns the loss followed by the compiled metrics (accuracy here).
val_metrics = model.evaluate(X_val, y_val)
val_loss, val_acc = val_metrics
print("Validation Accuracy:", val_acc)


## Step 16: Generate Predictions and Map to Behavior Names  

After training and evaluation, we can predict behaviors for the validation sequences:

1. `model.predict(X_val)` → gives probability distribution over all classes for each sequence.
2. `np.argmax(..., axis=1)` → get the predicted class index for each sequence.
3. Map the predicted class indices back to **behavior names** using `inv_action_map`.
4. Print the first few predicted behaviors as a sample.


In [None]:
# Class probabilities for every validation window, then the most likely class id.
y_pred_probs = model.predict(X_val)
y_pred_classes = np.argmax(y_pred_probs, axis=1)

# Map class ids back to behavior names by inverting the SAME encoding that
# produced the labels (action_map, built in Step 9 from the sorted unique actions).
# A hand-written table does not match that encoding - e.g. id 0 is the
# alphabetically first action, not necessarily 'none' - and would report wrong
# behavior names or raise KeyError for ids it does not list.
inv_action_map = {int(class_id): str(action) for action, class_id in action_map.items()}

pred_behaviors = [inv_action_map[int(class_id)] for class_id in y_pred_classes]
print(pred_behaviors[:10])


## Step 17: Classification Report and Confusion Matrix  

To evaluate the model in detail:

1. **Classification Report**:  
   - Provides precision, recall, f1-score, and support for each behavior class.
   - Helps to understand which behaviors are predicted well or poorly.

2. **Confusion Matrix**:  
   - Shows the number of correct and incorrect predictions for each class.
   - Helps to visualize misclassifications between behaviors.


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / f1 / support for the validation windows.
report_text = classification_report(y_val, y_pred_classes)
print(report_text)

# Confusion matrix computed from the same true / predicted class ids.
cm = confusion_matrix(y_val, y_pred_classes)
print(cm)


## Step 18: Generate Submission File  

To prepare the submission for Kaggle:

1. Initialize an empty list `submission` and a `row_id` counter.
2. Loop through all videos in `train_meta`.
3. For each video:
    - Load the pose data.
    - Predict behaviors for sequences (here a placeholder list `['none'] * num_sequences` is used; in practice use model predictions).
4. For each sequence, create a row with:
    - `row_id`
    - `video_id`
    - `agent_id` and `target_id` (mouse1, mouse2)
    - `action` (predicted behavior)
    - `start_frame` and `stop_frame`
5. Convert the list to a pandas DataFrame and save as `submission.csv`.

This format matches the expected Kaggle submission structure.


In [None]:
import pandas as pd

submission = []          # one row per (video, window)
row_id = 0
sequence_length = 30     # frames per window, same value as in Step 9
mouse_map = {1: 'mouse1', 2: 'mouse2'}

# Loop through all videos in train_meta.
for idx, row in train_meta.iterrows():
    video_id = row['video_id']

    # Only the frame index is needed here, so read just that column instead of
    # the whole pose table. Loop-local names are used on purpose so the
    # notebook-level `pose_data`, `max_frame` and `pred_behaviors` from the
    # earlier steps are not overwritten.
    tracking_path = f'/kaggle/input/MABe-mouse-behavior-detection/train_tracking/{row["lab_id"]}/{video_id}.parquet'
    video_frames = pd.read_parquet(tracking_path, columns=['video_frame'])
    if video_frames.empty:
        continue  # nothing tracked for this video

    # Frames are indexed 0..last_frame, so the video holds last_frame + 1 frames;
    # dividing last_frame itself would drop a window whenever the frame count is
    # an exact multiple of sequence_length.
    last_frame = int(video_frames['video_frame'].max())
    num_sequences = (last_frame + 1) // sequence_length

    # Placeholder predictions: replace with the model's per-window predictions
    # (computed as in the earlier steps) for a real submission.
    video_pred_behaviors = ['none'] * num_sequences

    # Add one submission row per window.
    for i in range(num_sequences):
        start_frame = i * sequence_length
        stop_frame = start_frame + sequence_length - 1   # last frame of the window (inclusive)
        action = video_pred_behaviors[i]

        submission.append([
            row_id,
            video_id,
            mouse_map[1],
            mouse_map[2],
            action,
            start_frame,
            stop_frame
        ])
        row_id += 1

sub_df = pd.DataFrame(submission, columns=['row_id','video_id','agent_id','target_id','action','start_frame','stop_frame'])
sub_df.to_csv('submission.csv', index=False)
print(sub_df.head())
