# 【AAPlot for Animal Behavior (Analyze)】
## Extract speed and trajectory from a **Spike2**-exported .txt file

This notebook analyzes animal movement data before and after specific events, including:
- Data cleaning (removal of artifacts)
- Time rescaling around events
- Speed analysis in different time windows
- Distance calculations
- Statistical analysis and data export


Run under `PLOT` environment
    
The `PLOT` environment:
- Python 3.12.7
- pandas
- numpy
- scipy
- matplotlib
- seaborn
- ipykernel

*Warning*

*! Make sure you have installed `Anaconda` and added it to PATH (refer to online instructions).*

*! Make sure you have already configured the `PLOT` environment. If not, run this command: `conda create -n PLOT python=3.12.7 pandas numpy scipy matplotlib seaborn ipykernel` (if you are using an ARM64 CPU, use Python 3.13.3 instead and add `-c conda-forge` at the end of the command).*

*Apply `PLOT` in VS Code: select it in the kernel picker.*

*Apply `PLOT` in a terminal: `conda activate PLOT`*

## 1. Import Libraries and Load Data

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy import stats

# File selection: path of the exported text file to analyze.
Filename = r'D:\Temp\DrugIntake behavior\THC\male\C16_THC_0_1mpk_LocationOutput_TimeOverSpeed.txt'

# Preview the raw file (up to 5 lines) so the layout can be checked by eye.
# zip() stops at end-of-file, so a file shorter than 5 lines no longer raises.
with open(Filename, 'r') as file:
    first_few_lines = [line for _, line in zip(range(5), file)]
print("First few lines of the file:")
for line in first_few_lines:
    print(line.strip())

# Read the file without a header (the title row is skipped) and name the
# columns by position afterwards.
try:
    df = None
    read_errors = {}
    # Try the supported separators in turn; the first one that yields at least
    # two columns (time and speed) wins. r'\s+' must be a raw string: '\s' is
    # not a valid escape sequence in a normal string literal.
    for separator in ['\t', ',', r'\s+']:
        try:
            candidate = pd.read_csv(Filename, sep=separator, header=None, skiprows=1)
        except Exception as read_error:
            # Keep the reason so it can be reported if every separator fails,
            # instead of silently discarding it.
            read_errors[separator] = read_error
            continue
        if len(candidate.columns) >= 2:  # We need at least time and speed columns
            df = candidate
            print(f"\nSuccessfully read file with '{separator}' separator")
            break

    if df is None:
        for failed_separator, read_error in read_errors.items():
            print(f"Could not parse with separator {failed_separator!r}: {read_error}")
        raise ValueError("File must have at least 2 columns (Time and Speed)")
    if df.empty:
        raise ValueError("File contains no data rows")

    print("\nInitial data shape:", df.shape)
    print("Sample of raw data:")
    print(df.head())

    # Rename columns by position
    if len(df.columns) >= 3:
        # Take only the first three columns if there are more.
        df = df.iloc[:, :3].copy()
        df.columns = ['Time', 'Speed', 'Marker']
    else:
        # Exactly two columns: time and speed only.
        df = df.iloc[:, :2].copy()
        df.columns = ['Time', 'Speed']
        # No marker channel in the file: place a single default event at the
        # middle sample so the downstream cells still have an event to align to.
        df['Marker'] = 0
        middle_idx = len(df) // 2
        df.loc[middle_idx, 'Marker'] = 1
        print("\nNo marker column found. Added marker at middle point.")

    # Convert the time column to numeric. Assigning by column name replaces the
    # column, so the numeric dtype is adopted even if it was parsed as text.
    df['Time'] = pd.to_numeric(df['Time'], errors='coerce')

except Exception as e:
    print(f"Error reading file: {str(e)}")
    raise

print("\nFinal data structure:")
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
df.info()
print("\nSample of processed data:")
print(df.head())

  for separator in ['\t', ',', '\s+']:


First few lines of the file:
"Time","2 Speed(cm/s)","1 Channel 1"
0.0,12.39,0
0.1,10.70,0
0.2,9.42,0
0.3,8.57,0

Successfully read file with ',' separator

Initial data shape: (417008, 3)
Sample of raw data:
     0      1  2
0  0.0  12.39  0
1  0.1  10.70  0
2  0.2   9.42  0
3  0.3   8.57  0
4  0.4   8.12  0

Final data structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 417008 entries, 0 to 417007
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    417008 non-null  float64
 1   Speed   417008 non-null  float64
 2   Marker  417008 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 9.5 MB
None

Sample of processed data:
   Time  Speed  Marker
0   0.0  12.39       0
1   0.1  10.70       0
2   0.2   9.42       0
3   0.3   8.57       0
4   0.4   8.12       0

Successfully read file with ',' separator

Initial data shape: (417008, 3)
Sample of raw data:
     0      1  2
0  0.0  12.39  0
1  0.1  10.70  0


## 2. Data Cleaning and Preprocessing

In [32]:
# Data cleaning: coerce speed to numeric, blank out implausibly high speeds,
# fill the gaps by linear interpolation, and report the event markers.
print("Original data shape:", df.shape)

# Speeds above this value (cm/s) are treated as tracking artifacts.
MAX_PLAUSIBLE_SPEED = 100

# Work on a copy so the loaded frame `df` stays untouched.
df_cleaned = df.copy()

# Convert speed column to numeric, handling any non-numeric values.
# Assigning by column name replaces the column, so the numeric dtype is
# adopted even if the column was parsed as text.
df_cleaned['Speed'] = pd.to_numeric(df_cleaned['Speed'], errors='coerce')
print("\nInitial speed statistics:")
print(df_cleaned['Speed'].describe())

# Remove speed artifacts (> MAX_PLAUSIBLE_SPEED) by replacing them with NaN.
# NOTE(review): negative speeds are left untouched here (the sample output
# shows a negative minimum) - confirm whether they should also be treated as
# artifacts.
artifacts_mask = df_cleaned['Speed'] > MAX_PLAUSIBLE_SPEED
n_artifacts = int(artifacts_mask.sum())
df_cleaned.loc[artifacts_mask, 'Speed'] = np.nan
print(f"\nNumber of artifacts removed (speed > {MAX_PLAUSIBLE_SPEED} cm/s): {n_artifacts}")
print("\nSpeed statistics after removing artifacts:")
print(df_cleaned['Speed'].describe())

# Interpolate NaN values. limit_direction='both' also fills NaNs at the very
# start of the recording, which the default (forward-only) would leave as NaN.
df_cleaned['Speed'] = df_cleaned['Speed'].interpolate(method='linear', limit_direction='both')
print("\nSpeed statistics after interpolation:")
print(df_cleaned['Speed'].describe())

# Verify event markers: times of all rows whose Marker equals 1.
event_times = df_cleaned.loc[df_cleaned['Marker'] == 1, 'Time']
print(f"\nNumber of events found: {len(event_times)}")
print("Event time points:", event_times.values)

# Calculate percentiles for speed distribution
percentiles = [5, 25, 50, 75, 95]
speed_percentiles = np.percentile(df_cleaned['Speed'].dropna(), percentiles)
print("\nSpeed distribution percentiles:")
for p, v in zip(percentiles, speed_percentiles):
    print(f"{p}th percentile: {v:.2f} cm/s")

Original data shape: (417008, 3)

Initial speed statistics:
count    417008.000000
mean          6.617100
std           8.006922
min         -31.820000
25%           2.830000
50%           5.160000
75%           8.300000
max         588.160000
Name: Speed, dtype: float64

Number of artifacts removed (speed > 100 cm/s): 186

Speed statistics after removing artifacts:
count    416822.000000
mean          6.509506
std           5.751905
min         -31.820000
25%           2.830000
50%           5.160000
75%           8.300000
max          98.950000
Name: Speed, dtype: float64

Speed statistics after interpolation:
count    417008.000000
mean          6.539925
std           5.943909
min         -31.820000
25%           2.830000
50%           5.160000
75%           8.300000
max          98.950000
Name: Speed, dtype: float64

Number of events found: 1
Event time points: [7109.1]

Speed distribution percentiles:
5th percentile: 0.90 cm/s
25th percentile: 2.83 cm/s
50th percentile: 5.16 cm/s


## 3. Time Rescaling

In [33]:
def _midpoint_of_time_range(frame):
    """Return the midpoint between the smallest and largest value of column 0 (time)."""
    time_max = frame.iloc[:, 0].max()
    time_min = frame.iloc[:, 0].min()
    return (time_max + time_min) / 2


# Locate the event: the first row whose marker (column 2) equals 1.
# If there is no such row, or the lookup fails, fall back to the midpoint of
# the recorded time range.
try:
    event_markers = df_cleaned.loc[df_cleaned.iloc[:, 2] == 1]
    if len(event_markers) == 0:
        event_time = _midpoint_of_time_range(df_cleaned)
        print(f"No event markers found. Using middle point as event time: {event_time:.2f} seconds")
    else:
        event_time = event_markers.iloc[0, 0]  # time value of the first marker row
        print(f"Found event marker at time: {event_time:.2f} seconds")
except Exception as e:
    print(f"Error finding event time: {str(e)}")
    event_time = _midpoint_of_time_range(df_cleaned)
    print(f"Using middle point as event time: {event_time:.2f} seconds")

# Express time relative to the event, converted from seconds to hours.
seconds_from_event = df_cleaned.iloc[:, 0] - event_time
df_cleaned['Time_hours'] = seconds_from_event / 3600

# Analysis windows as (start, end) in hours relative to the event;
# start is inclusive, end is exclusive.
time_windows = {
    'pre_1h': (-1, 0),
    'post_1h': (0, 1),
    'post_2h': (1, 2),
    'post_3h': (2, 3),
}

# Report the overall time span of the recording.
first_hour = df_cleaned['Time_hours'].min()
last_hour = df_cleaned['Time_hours'].max()
print("\nTime range in dataset:")
print(f"Start: {first_hour:.2f} hours")
print(f"End: {last_hour:.2f} hours")
print(f"Total duration: {last_hour - first_hour:.2f} hours")

# Report how many samples fall inside each window.
print("\nData coverage in time windows:")
for window_name, (start, end) in time_windows.items():
    in_window = (df_cleaned['Time_hours'] >= start) & (df_cleaned['Time_hours'] < end)
    print(f"{window_name}: {int(in_window.sum())} data points")

Found event marker at time: 7109.10 seconds

Time range in dataset:
Start: -1.97 hours
End: 9.61 hours
Total duration: 11.58 hours

Data coverage in time windows:
pre_1h: 35999 data points
post_1h: 36000 data points
post_2h: 36000 data points
post_3h: 36001 data points


## 4. Speed Analysis

In [34]:
def _summarize_window_speed(frame, start_h, end_h):
    """Summarize speed for rows with start_h <= Time_hours < end_h.

    Returns a tuple (summary, completeness): `summary` holds the mean speed,
    the standard deviation and the number of rows; `completeness` is the time
    span covered by those rows as a percentage of one hour, capped at 100.
    An empty window yields NaN statistics and a completeness of 0.
    """
    hours = frame['Time_hours']
    subset = frame[(hours >= start_h) & (hours < end_h)]

    if len(subset) == 0:
        return {'mean_speed': np.nan, 'std_speed': np.nan, 'n_points': 0}, 0

    summary = {
        'mean_speed': subset['Speed'].mean(),
        'std_speed': subset['Speed'].std(),
        'n_points': len(subset),
    }
    # Span between first and last sample in the window, in minutes.
    span_minutes = (subset['Time_hours'].max() - subset['Time_hours'].min()) * 60
    return summary, min(100, (span_minutes / 60) * 100)


# Per-window speed statistics and data completeness, keyed by window name.
speed_results = {}
data_completeness = {}
for window_name, (start, end) in time_windows.items():
    speed_results[window_name], data_completeness[window_name] = _summarize_window_speed(
        df_cleaned, start, end
    )

# Print one report section per window.
print("Speed Analysis Results:")
for window, results in speed_results.items():
    report_lines = [
        f"\n{window}:",
        f"Mean Speed: {results['mean_speed']:.2f} cm/s",
        f"Std Dev: {results['std_speed']:.2f} cm/s",
        f"Data Points: {results['n_points']}",
        f"Data Completeness: {data_completeness[window]:.1f}%",
    ]
    print("\n".join(report_lines))

Speed Analysis Results:

pre_1h:
Mean Speed: 7.32 cm/s
Std Dev: 5.14 cm/s
Data Points: 35999
Data Completeness: 100.0%

post_1h:
Mean Speed: 7.35 cm/s
Std Dev: 4.96 cm/s
Data Points: 36000
Data Completeness: 100.0%

post_2h:
Mean Speed: 7.36 cm/s
Std Dev: 5.39 cm/s
Data Points: 36000
Data Completeness: 100.0%

post_3h:
Mean Speed: 7.68 cm/s
Std Dev: 6.17 cm/s
Data Points: 36001
Data Completeness: 100.0%


## 5. Distance Calculation

In [35]:
# Calculate the distance travelled in each time window by integrating speed
# over time (each sample's speed multiplied by the interval to the next sample).
distance_results = {}

for window_name, (start, end) in time_windows.items():
    mask = (df_cleaned['Time_hours'] >= start) & (df_cleaned['Time_hours'] < end)
    window_data = df_cleaned[mask]

    if len(window_data) > 0:
        # Convert speed from cm/s to cm/h so it matches the time axis (hours).
        speeds_cmh = window_data['Speed'] * 3600
        # Interval between consecutive samples, in hours (one fewer than samples).
        time_diff_h = np.diff(window_data['Time_hours'])
        # Pair every interval with the speed at its start; the last sample has
        # no following interval, so it is dropped.
        speeds_for_calc = speeds_cmh.iloc[:-1]

        # Calculate distance (cm)
        distance = np.sum(speeds_for_calc * time_diff_h)

        distance_results[window_name] = {
            'distance_cm': distance,
            'distance_m': distance / 100,  # convert to meters
            # Look up completeness for THIS window. The previous code indexed
            # with a loop variable left over from an earlier cell, so every
            # window reported the completeness of the last window.
            'completeness': data_completeness[window_name]
        }
    else:
        distance_results[window_name] = {
            'distance_cm': np.nan,
            'distance_m': np.nan,
            'completeness': 0
        }

# Print results
print("Distance Analysis Results:")
for window, results in distance_results.items():
    print(f"\n{window}:")
    print(f"Distance: {results['distance_m']:.2f} meters")
    print(f"Data Completeness: {results['completeness']:.1f}%")

Distance Analysis Results:

pre_1h:
Distance: 263.35 meters
Data Completeness: 100.0%

post_1h:
Distance: 264.55 meters
Data Completeness: 100.0%

post_2h:
Distance: 265.00 meters
Data Completeness: 100.0%

post_3h:
Distance: 276.48 meters
Data Completeness: 100.0%


## 6. Data Export

In [36]:
# Build the summary table row by row as (metric label, value, completeness %).
hours_min = df_cleaned['Time_hours'].min()
hours_max = df_cleaned['Time_hours'].max()
pre_event_hours = abs(hours_min)

# (window key, label used in the metric names), in reporting order.
window_labels = (
    ('pre_1h', 'Pre 1h'),
    ('post_1h', 'Post 1h'),
    ('post_2h', 'Post 2h'),
    ('post_3h', 'Post 3h'),
)

summary_rows = [
    ('Total Duration (hours)', hours_max - hours_min, 100),
    (
        'Pre-event Duration (hours)',
        pre_event_hours,
        # 100% once at least one hour precedes the event, otherwise the
        # fraction of an hour that is available.
        100 if pre_event_hours >= 1 else pre_event_hours * 100,
    ),
    ('Post-event Duration (hours)', hours_max, 100),
]
summary_rows += [
    (f'Mean Speed - {label} (cm/s)', speed_results[key]['mean_speed'], data_completeness[key])
    for key, label in window_labels
]
summary_rows += [
    (f'Distance - {label} (m)', distance_results[key]['distance_m'], data_completeness[key])
    for key, label in window_labels
]

metric_col, value_col, completeness_col = (list(column) for column in zip(*summary_rows))
combined_summary = pd.DataFrame({
    'Metric': metric_col,
    'Value': value_col,
    'Completeness_%': completeness_col,
})

# Output files are written next to the input file, named after it.
output_dir, input_file_name = os.path.split(Filename)
base_name = os.path.splitext(input_file_name)[0]
data_csv = os.path.join(output_dir, f"{base_name}_data.csv")
summary_csv = os.path.join(output_dir, f"{base_name}_summary.csv")

try:
    # Write the cleaned per-sample data, then the summary table.
    df_cleaned.to_csv(data_csv, index=False)
    combined_summary.to_csv(summary_csv, index=False)

    print("\nAnalysis exported to:")
    print(f"1. Raw Data: {data_csv}")
    print(f"2. Summary: {summary_csv}")

except Exception as e:
    # Export is best-effort: report the problem and still show the summary below.
    print(f"\nError exporting CSV files: {str(e)}")

# Display the summary
print("\nAnalysis Summary:")
print(combined_summary.to_string(index=False))


Analysis exported to:
1. Raw Data: D:\Temp\DrugIntake behavior\THC\male\C16_THC_0_1mpk_LocationOutput_TimeOverSpeed_data.csv
2. Summary: D:\Temp\DrugIntake behavior\THC\male\C16_THC_0_1mpk_LocationOutput_TimeOverSpeed_summary.csv

Analysis Summary:
                     Metric      Value  Completeness_%
     Total Duration (hours)  11.583528      100.000000
 Pre-event Duration (hours)   1.974750      100.000000
Post-event Duration (hours)   9.608778      100.000000
 Mean Speed - Pre 1h (cm/s)   7.315757       99.994444
Mean Speed - Post 1h (cm/s)   7.348808       99.997222
Mean Speed - Post 2h (cm/s)   7.361364       99.997222
Mean Speed - Post 3h (cm/s)   7.679872      100.000000
      Distance - Pre 1h (m) 263.350475       99.994444
     Distance - Post 1h (m) 264.547630       99.997222
     Distance - Post 2h (m) 264.998260       99.997222
     Distance - Post 3h (m) 276.478620      100.000000
