In [34]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from models import *   
import load_data
import nan_imputation
import helpers
from helpers import find_repo_root

In [None]:
# Re-import the project modules so that edits made to load_data.py,
# nan_imputation.py and helpers.py are picked up without restarting the kernel.
import importlib
importlib.reload(load_data)
importlib.reload(nan_imputation)
importlib.reload(helpers)

## Step 0 : Load the Data

In [36]:
# Resolve the repository root with the project helper.
repo_root = find_repo_root()
# NOTE: Jupyter only echoes a bare expression when it is the last statement of
# the cell, so this line produces no output here.
repo_root

# Folder holding the lifespan recordings, located under the repository root.
data_path = os.path.join(repo_root, 'Data/Lifespan')

In [37]:
# Load the lifespan recordings found under data_path.
# Later cells use the result as a dict (worms.items(), worms[worm_name]) whose
# values are 2-D arrays.
worms = load_data.load_lifespan(data_path)
# Optional: exclude a specific worm before processing. The second argument of
# pop() avoids a KeyError if the key doesn't exist.
#worms.pop("worm_1_companyDrug", None)

In [None]:
# Sanity check: print the raw recording of worm 3 (companyDrug) as a table.
worm_name = 'worm_3'  # Change this to the name of the worm you want to print
print(f"Worm: {worm_name}")
worm_data = worms[worm_name]
# The array is transposed so that each row of the table is one frame; the six
# column names require the array to have exactly six feature rows.
df = pd.DataFrame(worm_data.T, columns=['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category'])
print(df)

In [None]:
# Print a summary of the loaded dictionary of worm recordings (project helper).
helpers.print_fdict_summary(worms)  

## Step 1 : NaN imputation
> NaNs are handled only in the X and Y columns, since these are the only columns that contain NaNs

In [None]:
# Report every run of consecutive missing values found in the X/Y rows of each worm.
for name, lifespan_array in worms.items(): 
    print(f"Processing {name}")
    lifespan_arrayxy = lifespan_array[2:4,:]  # Rows 2 and 3 (X and Y), all frames
    missing_sequences = nan_imputation.count_successive_missing(lifespan_arrayxy)
    # Each entry unpacks as (start, end, length). `end - 1` is printed as the last
    # missing column, so `end` is presumably exclusive — confirm in nan_imputation.
    for start, end, length in missing_sequences:
        print(f"  Missing sequence starts at column {start}, ends at column {end - 1}, length: {length}")

In [None]:
#print(lifespan_arrayxy)
# NOTE: `name` and `missing_sequences` are left over from the loop in the
# previous cell, so this only shows the result for the last worm it processed.
print(f"Missing sequences for {name}: {missing_sequences}")

In [42]:
# Rows inspected for missing values: slice(2, 4) selects rows 2 and 3 (X and Y);
# the upper bound 4 is exclusive.
rows_to_check = slice(2, 4)  # Rows 2 and 3 not row 4

# Apply cut_array to each worm in the dataset.
# Judging by the frame-count checks in the following cells, cut_array drops the
# frames that have NaNs in the inspected rows — confirm in nan_imputation.
cut_nan_dict = {name: nan_imputation.cut_array(array, rows_to_check) for name, array in worms.items()}

In [None]:
# Print the shape of each worm's array after the NaN cut.
for name, item in cut_nan_dict.items():
    print(f'{name} : {item.shape}')

In [None]:
# Sanity check on one worm: after the NaN cut the number of frames should have
# decreased (because the NaNs were removed) --> example with worm_3
worm_name = 'worm_3'  # Change this to the name of the worm you want to print
print(f"Worm: {worm_name}")
worm_data = cut_nan_dict[worm_name]
df = pd.DataFrame(worm_data.T, columns=['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category'])

# Check for NaN values anywhere in the DataFrame (all six columns, not only X and Y)
if df.isna().sum().sum() == 0:
    print(f"Worm {worm_name} has no NaN values after NaN imputation.")
else:
    print(f"Worm {worm_name} still contains NaN values.")

df

# Compared with the raw table, the total number of frames has decreased.

# Step 2 : Figure out when the worms die
> Once we find the frame at which a worm dies, drop the frames after its death

In [None]:
# Import the death-detection module and reload it so that recent edits are picked up.
# NOTE: relies on `importlib` having been imported in an earlier cell.
import isdead
importlib.reload(isdead)

In [None]:
# Estimate the dying time of every worm and drop the frames recorded after it.
movement_threshold = 1.0 # Threshold for inactivity detection (passed to isdead.estimate_dying_time)
processed_worms = {} # Maps worm name -> array kept after truncation at the estimated death

dying_times = []  # Dying times in hours, only for worms where a death was detected

# Use the NaN-free arrays produced in Step 1
cleaned_worms = cut_nan_dict  # Replace with the variable holding your cleaned data

# Iterate through each worm in the dataset
for worm_name, worm_data in cleaned_worms.items():
    print(f"Processing {worm_name}...")
    # Transpose worm_data so that each row is a frame and each column a feature
    df_worm = pd.DataFrame(worm_data.T,columns=['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category']) # Transpose the array

    result = isdead.estimate_dying_time(df_worm, movement_threshold) # Use the estimate_dying_time function to find the dying frame
    # A None first element means no death was found: the worm is kept untouched
    # and contributes nothing to dying_times.
    if result[0] is None:
        print(f"  {worm_name}: No inactivity detected. Retaining all data.")
        processed_worms[worm_name] = worm_data
        continue

    dying_frame, absolute_frame, dying_time_hours, segment_number = result
  
    dying_times.append(dying_time_hours) # Append dying time in hours to the list

    print(f"  {worm_name}: Dying frame = {dying_frame} of Segment = {segment_number}, Absolute frame = {absolute_frame}, Dying time = {dying_time_hours:.2f} hours") # Print details

    # Truncate the data up to the dying frame: keep the columns whose value in
    # row 0 (the 'Frame' row) is <= dying_frame.
    # NOTE(review): the log message above describes dying_frame as a frame "of
    # Segment", while absolute_frame is not used for the truncation. Confirm that
    # the 'Frame' row and dying_frame use the same numbering; otherwise the mask
    # keeps the wrong frames.
    truncated_data = worm_data[:, worm_data[0, :] <= dying_frame]
    processed_worms[worm_name] = truncated_data

# Print summary of processed worms.
# "Original frames" is read from `worms` (before the NaN cut), so the difference
# includes both the frames removed for NaNs and the frames removed after death.
print("\nSummary of processed worms:")
for name, data in processed_worms.items():
    print(f"{name}: Original frames = {worms[name].shape[1]}, After truncation = {data.shape[1]}")

In [None]:
# Sanity check on one worm after truncation at the estimated death --> worm 3
worm_name = 'worm_3'  # Change this to the name of the worm you want to print
print(f"Worm: {worm_name}")
worm_data = processed_worms[worm_name]
df = pd.DataFrame(worm_data.T, columns=['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category'])
df

# The numbers below were obtained with a movement threshold of 1.0.
# Worm 3: loading data = 64794 frames --> removing NaNs = 64533 frames --> removing dead frames = 62175 frames

In [None]:
# Plot the empirical survival curve of the worms for which a death was detected.
# Worms without a detected death are absent from dying_times and are therefore
# not represented in this curve.
dying_times_sorted = sorted(dying_times) # Sort the dying times in ascending order
n_deaths = len(dying_times_sorted)

# Fraction of worms still alive right after each death: after the k-th death
# (k = 1..n) it is 1 - k/n, so the curve ends at 0 once every worm has died.
# (Using k - 1 instead would show 100% survival at the time of the first death
# and would never reach 0.)
survival_rate = [1 - (k / n_deaths) for k in range(1, n_deaths + 1)]

# Prepend the starting point (0 h, everybody alive) for the plot.
curve_times = [0.0] + dying_times_sorted
curve_rates = [1.0] + survival_rate

# Survival is constant between two deaths, so draw it as a step function
# rather than interpolating linearly between the death times.
plt.figure(figsize=(8, 5))
plt.step(curve_times, curve_rates, where='post', marker='o', linestyle='-', color='blue')
plt.xlabel('Dying Time (Hours)')
plt.ylabel('Survival Rate')
plt.title('Survival Curve')
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
# Visualization: for every worm, compare the number of frames in the raw
# recording with the number of frames kept after NaN removal and truncation
# at the estimated death.

# Prepare data for the plot.
# Both series are looked up through the same list of worm names, so each pair
# of bars is matched by name rather than by the iteration order of two
# different dictionaries. A worm missing from processed_worms raises a KeyError
# instead of silently shifting the bars.
worm_ids = list(worms.keys())
original_lengths = [worms[worm_id].shape[1] for worm_id in worm_ids]
truncated_lengths = [processed_worms[worm_id].shape[1] for worm_id in worm_ids]

# Plot the data: the truncated bars are drawn on top of the original ones.
plt.figure(figsize=(10, 6))
plt.bar(worm_ids, original_lengths, label='Original Lifespan', alpha=0.7)
plt.bar(worm_ids, truncated_lengths, label='Truncated Lifespan', alpha=0.7)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Worms')
plt.ylabel('Number of Frames')
plt.title('Original vs Truncated Lifespan for Each Worm')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Reload helpers so that recent edits to helpers.py are picked up.
importlib.reload(helpers)

In [51]:
# Indices of the feature rows to standardize:
# 1 = Speed, 2 = X, 3 = Y, 4 = Changed Pixels.
# The frame number (0) and the category (5) are not standardized.
feature_columns = [1, 2, 3, 4]

# Apply per-worm standardization
# NOTE(review): this runs on the death-truncated recordings, before the
# early-lifespan truncation of Step 6.2. If helpers.standardization computes its
# statistics over each worm's whole recording, the early frames used as model
# input carry information from later in life — confirm in helpers.
standardized_worms = helpers.standardization(processed_worms, feature_columns)

In [None]:
# Sanity check: inspect the standardized recording of worm_3
worm_3_data = standardized_worms['worm_3']  # standardized_worms is indexed by worm name

# Convert to a DataFrame for easier inspection
columns = ['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category']
df_worm_3 = pd.DataFrame(worm_3_data.T, columns=columns)  # Transpose so that each row is one frame

df_worm_3

# Step 4 : Splitting the Data
> Split the worms in train worms and test worms

In [None]:
# Split the standardized worms into a training and a test dictionary, both
# keyed by worm name (see the key lists printed below).
# test_size=0.2 presumably sends about 20% of the worms to the test set —
# confirm in load_data.
# NOTE(review): no seed is passed here; check whether load_data.split_worms is
# deterministic, otherwise the split changes between runs.
train_worms, test_worms = load_data.split_worms(standardized_worms, test_size=0.2)

print(f"Training Worms: {list(train_worms.keys())}")
print(f"Testing Worms: {list(test_worms.keys())}")

# Step 5 : Load only early Lifespan for train set and keep the whole lifespan for test (validation set)
> Now we will load only a portion of the worms

In [None]:
#importlib.reload(load_data)

In [None]:
#data_fraction = 0.4

#early_train_worms = load_data.load_earlylifespan(train_worms, data_fraction)

In [None]:
# Inspect the recording of one training worm.
# The cell that used to build `early_train_worms` is commented out, so that name
# does not exist on a fresh kernel and referencing it raised a NameError. The
# early-lifespan truncation is now applied to the feature arrays in Step 6.2,
# so the (untruncated) training dictionary is inspected here instead.
# 'worm_3' may have been assigned to the test set by the split, so fall back to
# the first training worm when it is absent.
inspected_worm = 'worm_3' if 'worm_3' in train_worms else next(iter(train_worms))
print(f"Worm: {inspected_worm}")
worm_3_data = train_worms[inspected_worm]

# Convert to a DataFrame for easier inspection
columns = ['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category']
df_worm_3 = pd.DataFrame(worm_3_data.T, columns=columns)  # Transpose so that each row is one frame

df_worm_3

# Step 6 : Prepare data for Models

# Step 6.1 : Separate features X and target Y

In [None]:
def _build_features_and_targets(worm_dict):
    """Turn a dictionary of worm recordings into model inputs and targets.

    Parameters
    ----------
    worm_dict : dict
        Maps worm name -> 2-D array with one row per feature, in the order
        Frame, Speed, X, Y, Changed Pixels, Category, and one column per frame.

    Returns
    -------
    features : numpy.ndarray
        1-D object array with one entry per worm; each entry is an
        (n_frames, 4) array holding Speed, X, Y and Changed Pixels.
    targets : numpy.ndarray
        Number of frames of each worm's recording as it reaches this cell,
        used as the lifespan target.
    """
    column_names = ['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category']
    feature_names = ['Speed', 'X', 'Y', 'Changed Pixels']

    sequences = []
    frame_counts = []
    for recording in worm_dict.values():
        # Transpose so that each row is one frame and each column one feature.
        frame_table = pd.DataFrame(recording.T, columns=column_names)
        sequences.append(frame_table[feature_names].values)
        # Total lifespan (number of frames) is the target.
        frame_counts.append(len(frame_table))

    # Fill a pre-allocated object array element by element.
    # np.array(sequences, dtype=object) would build a 3-D array instead of a 1-D
    # array of sequences whenever all sequences share the same shape (for
    # example when the set contains a single worm).
    features = np.empty(len(sequences), dtype=object)
    for position, sequence in enumerate(sequences):
        features[position] = sequence

    return features, np.array(frame_counts)


# Prepare training data.
# The sequences built here cover the full recordings; the early-lifespan
# truncation is applied afterwards, in Step 6.2.
X_train, y_train = _build_features_and_targets(train_worms)

print(f"Training data prepared: {len(X_train)} worms with variable-length sequences.")

# Prepare test data (same process)
X_test, y_test = _build_features_and_targets(test_worms)

print(f"Test data prepared: {len(X_test)} worms with variable-length sequences.")

# Step 6.2 : Truncate early lifespan on X_train,X_test but keep Y_train,Y_test full length

In [None]:
# Reload load_data so that recent edits to load_data.py are picked up.
importlib.reload(load_data)

In [66]:
# Fraction passed to truncate_lifespan; presumably the leading share of each
# worm's sequence that is kept as model input — confirm in load_data.
data_fraction = 0.4
# Only the features are truncated; y_train / y_test are left untouched.
early_X_train = load_data.truncate_lifespan(X_train, data_fraction)
early_X_test = load_data.truncate_lifespan(X_test, data_fraction)

In [None]:
# Dimension check of the full and of the truncated (early-lifespan) feature sets.
# The truncated arrays are what Step 6.2 produces, so they are validated here
# alongside the full ones instead of only checking X_train / X_test.
# Assumes truncate_lifespan returns one sequence per worm, in the same order as
# its input — confirm in load_data.

# Number of worms after truncation
print(f"Number of worms in X_train: {len(early_X_train)}")
print(f"Number of worms in X_test: {len(early_X_test)}")

# Lengths of the full sequences for a few worms
print("Lengths of sequences in X_train (first 5 worms):")
print([len(worm) for worm in X_train[:5]])

print("Lengths of sequences in X_test (first 5 worms):")
print([len(worm) for worm in X_test[:5]])

# Lengths of the truncated sequences for the same worms
print("Lengths of sequences in early_X_train (first 5 worms):")
print([len(worm) for worm in list(early_X_train)[:5]])

print("Lengths of sequences in early_X_test (first 5 worms):")
print([len(worm) for worm in list(early_X_test)[:5]])

# Check dimensions of y_train and y_test
print(f"Number of worms in y_train: {len(y_train)}")
print(f"Number of worms in y_test: {len(y_test)}")

# Verify alignment between features and targets
assert len(X_train) == len(y_train), "Mismatch: X_train and y_train do not align!"
assert len(X_test) == len(y_test), "Mismatch: X_test and y_test do not align!"
assert len(early_X_train) == len(y_train), "Mismatch: early_X_train and y_train do not align!"
assert len(early_X_test) == len(y_test), "Mismatch: early_X_test and y_test do not align!"

# Truncation must never make a sequence longer than the full one
assert all(len(early) <= len(full) for early, full in zip(early_X_train, X_train)), \
    "Mismatch: a truncated training sequence is longer than the full sequence!"
assert all(len(early) <= len(full) for early, full in zip(early_X_test, X_test)), \
    "Mismatch: a truncated test sequence is longer than the full sequence!"

# Step 6.2 : Flatten data for simple models

In [None]:
# Flatten the early-lifespan sequences for models requiring fixed-length input:
# every frame becomes one sample, labelled with the total lifespan of its worm.
# The truncated sequences (early_X_train / early_X_test) are used here; flattening
# the full X_train / X_test would feed the whole lifespan to the models and leave
# the truncation of the previous step without effect.
# Assumes truncate_lifespan returns one (n_frames, n_features) array per worm, in
# the same order as y_train / y_test — confirm in load_data.
X_train_flat = np.concatenate(list(early_X_train))
y_train_flat = np.repeat(y_train, [len(x) for x in early_X_train])

X_test_flat = np.concatenate(list(early_X_test))
y_test_flat = np.repeat(y_test, [len(x) for x in early_X_test])

print(f"Flattened X_train shape: {X_train_flat.shape}")
print(f"Flattened y_train shape: {y_train_flat.shape}")
print(f"Flattened X_test shape: {X_test_flat.shape}")
print(f"Flattened y_test shape: {y_test_flat.shape}")

# Step 6.3 : Train and evaluate the model