In [20]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from models import *   
import load_data
import nan_imputation
import helpers
from helpers import find_repo_root

In [21]:
# Reload the project-local modules imported above (load_data, nan_imputation, helpers)
# so that edits made to their source files are picked up without restarting the kernel.
import importlib

# Reloading inside a loop (rather than ending the cell on a bare reload call) keeps the
# cell from echoing the module repr, which contains the absolute local path.
for _module in (load_data, nan_imputation, helpers):
    importlib.reload(_module)

<module 'helpers' from '/Users/louistschanz/Documents/EPFL-Cours/MA1/ML/Project-2/ML-Project-2/helpers.py'>

## Step 0 : Load the Data

In [22]:
# Resolve the repository root with the project helper and point data_path at its
# Data/Lifespan directory. The path is joined component by component so that the
# separator is always the platform's own.
repo_root = find_repo_root()
data_path = os.path.join(repo_root, 'Data', 'Lifespan')

In [23]:
# Load every lifespan recording found under data_path.
# `worms` is used below as a dict: worm name (e.g. 'worm_3') -> 2-D array with 6 rows
# (Frame, Speed, X, Y, Changed Pixels, Category) and one column per recorded frame.
worms = load_data.load_lifespan(data_path)
# Optional toggle: uncomment the next line to exclude one recording before processing.
#worms.pop("worm_1_companyDrug", None)  # The second argument avoids KeyError if the key doesn't exist

In [26]:
# Sanity check: print the raw recording of a single worm as a table
worm_name = 'worm_3'  # Name of the worm to inspect
print(f"Worm: {worm_name}")
worm_data = worms[worm_name]
# The array is stored as (feature, frame); transpose it so that each row is one frame
df = pd.DataFrame(
    data=worm_data.T,
    columns=['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category'],
)
print(df)

Worm: worm_3
         Frame     Speed           X           Y  Changed Pixels  Category
0          1.0  0.000000  270.078261  110.278261             0.0       0.0
1          2.0  0.088129  270.078261  110.278261             1.0       0.0
2          3.0  0.100739  270.112069  110.250000             1.0       0.0
3          4.0  0.000000  270.162393  110.247863             0.0       0.0
4          5.0  0.334962  270.162393  110.247863             3.0       0.0
...        ...       ...         ...         ...             ...       ...
53990  10795.0  0.023434  557.853448  215.936782             1.0       0.0
53991  10796.0  0.107492  557.853026  215.925072             5.0       0.0
53992  10797.0  0.386756  557.807471  215.896552            11.0       0.0
53993  10798.0  0.590065  557.630372  215.974212            32.0       0.0
53994  10799.0  1.183972  557.374640  215.827089            66.0       0.0

[53995 rows x 6 columns]


In [27]:
helpers.print_fdict_summary(worms)  


Summary of Loaded Worm Data:
Worm: worm_1
  Shape: (6, 75593)
----------------------------------------
Worm: worm_2
  Shape: (6, 53995)
----------------------------------------
Worm: worm_3
  Shape: (6, 53995)
----------------------------------------
Worm: worm_4
  Shape: (6, 75593)
----------------------------------------
Worm: worm_5
  Shape: (6, 75593)
----------------------------------------
Worm: worm_6
  Shape: (6, 64794)
----------------------------------------
Worm: worm_7
  Shape: (6, 75593)
----------------------------------------
Worm: worm_8
  Shape: (6, 75593)
----------------------------------------
Worm: worm_9
  Shape: (6, 75593)
----------------------------------------
Worm: worm_10
  Shape: (6, 53995)
----------------------------------------
Worm: worm_11
  Shape: (6, 75593)
----------------------------------------
Worm: worm_12
  Shape: (6, 75593)
----------------------------------------
Worm: worm_13
  Shape: (6, 75593)
----------------------------------------
Worm

## Step 1 : NaN imputation
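> NaNs only occur in the X and Y rows, so missing values are searched for in those two rows only. Frames with a missing X or Y are then removed rather than filled in.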

In [28]:
# For every worm, list each run of consecutive frames whose X/Y position is missing.
for name, lifespan_array in worms.items():
    print(f"Processing {name}")
    # Rows 2 and 3 of the (feature, frame) array hold X and Y
    lifespan_arrayxy = lifespan_array[2:4, :]
    missing_sequences = nan_imputation.count_successive_missing(lifespan_arrayxy)
    # Each entry is (start, end, length) with an exclusive `end`, hence the `end - 1`
    for start, end, length in missing_sequences:
        last_missing = end - 1
        print(f"  Missing sequence starts at column {start}, ends at column {last_missing}, length: {length}")

Processing worm_1
  Missing sequence starts at column 361, ends at column 368, length: 8
  Missing sequence starts at column 609, ends at column 610, length: 2
  Missing sequence starts at column 859, ends at column 899, length: 41
  Missing sequence starts at column 1018, ends at column 1076, length: 59
  Missing sequence starts at column 1127, ends at column 1154, length: 28
  Missing sequence starts at column 1178, ends at column 1208, length: 31
  Missing sequence starts at column 4077, ends at column 4093, length: 17
  Missing sequence starts at column 4127, ends at column 4189, length: 63
  Missing sequence starts at column 4744, ends at column 4768, length: 25
  Missing sequence starts at column 5469, ends at column 5562, length: 94
  Missing sequence starts at column 5693, ends at column 5719, length: 27
  Missing sequence starts at column 5727, ends at column 5920, length: 194
  Missing sequence starts at column 5922, ends at column 5924, length: 3
  Missing sequence starts at

In [29]:
# Show the raw (start, end, length) tuples for one worm.
# The worm is selected and its sequences are recomputed here instead of reusing the
# `name` / `missing_sequences` variables left over from the loop in the previous cell,
# so this cell no longer depends on which cell happened to run last.
name = list(worms)[-1]  # last worm in the dict, i.e. the one the loop above ends on
missing_sequences = nan_imputation.count_successive_missing(worms[name][2:4, :])
print(f"Missing sequences for {name}: {missing_sequences}")

Missing sequences for worm_48: [(4919, 4921, 2), (5399, 5401, 2), (11407, 11429, 22), (20959, 20984, 25), (21283, 21292, 9), (31413, 31435, 22), (31472, 31474, 2), (31475, 31499, 24), (31678, 31733, 55), (31774, 31781, 7), (38363, 38374, 11), (44096, 44243, 147)]


In [30]:
# Only the X and Y rows are inspected for missing values: indices 2 and 3
# (the slice end, 4, is exclusive)
rows_to_check = slice(2, 4)

# Run cut_array on every worm and collect the results under the same worm names
cut_nan_dict = {}
for name, array in worms.items():
    cut_nan_dict[name] = nan_imputation.cut_array(array, rows_to_check)

In [31]:
# Report the array shape left for each worm after the NaN frames were cut
for name in cut_nan_dict:
    item = cut_nan_dict[name]
    print(f'{name} : {item.shape}')

worm_1 : (6, 74414)
worm_2 : (6, 53396)
worm_3 : (6, 52787)
worm_4 : (6, 74062)
worm_5 : (6, 72595)
worm_6 : (6, 64759)
worm_7 : (6, 70217)
worm_8 : (6, 72216)
worm_9 : (6, 71822)
worm_10 : (6, 52217)
worm_11 : (6, 75159)
worm_12 : (6, 72932)
worm_13 : (6, 75343)
worm_14 : (6, 74855)
worm_15 : (6, 64534)
worm_16 : (6, 74307)
worm_17 : (6, 75395)
worm_18 : (6, 75593)
worm_19 : (6, 74496)
worm_20 : (6, 72844)
worm_21 : (6, 73638)
worm_22 : (6, 53811)
worm_23 : (6, 74689)
worm_24 : (6, 73516)
worm_25 : (6, 63319)
worm_26 : (6, 42253)
worm_27 : (6, 71833)
worm_28 : (6, 57462)
worm_29 : (6, 64033)
worm_30 : (6, 72883)
worm_31 : (6, 64658)
worm_32 : (6, 75349)
worm_33 : (6, 60114)
worm_34 : (6, 48238)
worm_35 : (6, 50937)
worm_36 : (6, 58947)
worm_37 : (6, 41570)
worm_38 : (6, 69204)
worm_39 : (6, 52676)
worm_40 : (6, 49463)
worm_41 : (6, 64015)
worm_42 : (6, 58571)
worm_43 : (6, 52738)
worm_44 : (6, 73137)
worm_45 : (6, 73183)
worm_46 : (6, 64461)
worm_47 : (6, 51694)
worm_48 : (6, 53667)


In [32]:
# Sanity check on a single worm: after the NaN step the table must contain no NaN,
# and it has fewer frames than the raw recording because the NaN frames were removed.
worm_name = 'worm_3'  # Name of the worm to inspect
print(f"Worm: {worm_name}")
worm_data = cut_nan_dict[worm_name]
df = pd.DataFrame(
    data=worm_data.T,
    columns=['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category'],
)

# A single remaining NaN anywhere in the table counts as a failure
has_nan = df.isna().any().any()
if has_nan:
    print(f"Worm {worm_name} still contains NaN values.")
else:
    print(f"Worm {worm_name} has no NaN values after NaN imputation.")

# Displayed as the cell output: the total number of frames is lower than in the raw table
df

Worm: worm_3
Worm worm_3 has no NaN values after NaN imputation.


Unnamed: 0,Frame,Speed,X,Y,Changed Pixels,Category
0,1.0,0.000000,270.078261,110.278261,0.0,0.0
1,2.0,0.088129,270.078261,110.278261,1.0,0.0
2,3.0,0.100739,270.112069,110.250000,1.0,0.0
3,4.0,0.000000,270.162393,110.247863,0.0,0.0
4,5.0,0.334962,270.162393,110.247863,3.0,0.0
...,...,...,...,...,...,...
52782,10795.0,0.023434,557.853448,215.936782,1.0,0.0
52783,10796.0,0.107492,557.853026,215.925072,5.0,0.0
52784,10797.0,0.386756,557.807471,215.896552,11.0,0.0
52785,10798.0,0.590065,557.630372,215.974212,32.0,0.0


# Step 2 : Figure out when the worms die
> Once the frame at which a worm dies has been found, drop all frames recorded after its death.

In [33]:
# Import the project's death-detection module and reload it so that edits to isdead.py
# are picked up without restarting the kernel.
# The trailing ';' stops the notebook from echoing the module repr, which would
# otherwise store the absolute local path in the cell output.
import isdead
importlib.reload(isdead);

<module 'isdead' from '/Users/louistschanz/Documents/EPFL-Cours/MA1/ML/Project-2/ML-Project-2/isdead.py'>

In [34]:
# Estimate when each worm dies and keep only the frames recorded up to its death.
movement_threshold = 1.0  # Threshold for inactivity detection
processed_worms = {}  # worm name -> array truncated at the estimated death

# Dying time (hours) of every worm for which a death was detected; worms without a
# detected death contribute nothing to this list.
dying_times = []

# Use the cleaned data produced by the NaN step
cleaned_worms = cut_nan_dict

# Iterate through each worm in the dataset
for worm_name, worm_data in cleaned_worms.items():
    print(f"Processing {worm_name}...")
    # Arrays are stored as (feature, frame); transpose so that each row is one frame
    df_worm = pd.DataFrame(
        worm_data.T,
        columns=['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category'],
    )

    # estimate_dying_time returns a 4-tuple whose first element is None when no
    # inactivity was detected
    result = isdead.estimate_dying_time(df_worm, movement_threshold)
    if result[0] is None:
        print(f"  {worm_name}: No inactivity detected. Retaining all data.")
        processed_worms[worm_name] = worm_data
        continue

    dying_frame, absolute_frame, dying_time_hours, segment_number = result

    dying_times.append(dying_time_hours)  # Append dying time in hours to the list

    print(f"  {worm_name}: Dying frame = {dying_frame} of Segment = {segment_number}, Absolute frame = {absolute_frame}, Dying time = {dying_time_hours:.2f} hours") # Print details

    # Truncate by position, not by the value stored in the Frame row.
    # `dying_frame` is relative to its segment and the Frame counter restarts in every
    # segment (the raw tables run up to 10799 several times per worm), so the former mask
    # `worm_data[0, :] <= dying_frame` removed the end of every earlier segment, i.e.
    # frames recorded while the worm was still alive.
    # NOTE(review): assumes absolute_frame is the 1-based position of the dying frame in
    # the cleaned array (this matches the values printed for this dataset) - confirm
    # against isdead.estimate_dying_time.
    truncated_data = worm_data[:, :int(absolute_frame)]
    processed_worms[worm_name] = truncated_data

# Print summary of processed worms ("Original" is the raw recording, before NaN removal)
print("\nSummary of processed worms:")
for name, data in processed_worms.items():
    print(f"{name}: Original frames = {worms[name].shape[1]}, After truncation = {data.shape[1]}")

Processing worm_1...
  worm_1: Dying frame = 9293.0 of Segment = 6, Absolute frame = 72908, Dying time = 486.00 hours
Processing worm_2...
  worm_2: Dying frame = 9853.0 of Segment = 4, Absolute frame = 52450, Dying time = 348.14 hours
Processing worm_3...
  worm_3: Dying frame = 9902.0 of Segment = 4, Absolute frame = 51890, Dying time = 342.33 hours
Processing worm_4...
  worm_4: Dying frame = 9002.0 of Segment = 6, Absolute frame = 72265, Dying time = 480.15 hours
Processing worm_5...
  worm_5: Dying frame = 10142.0 of Segment = 6, Absolute frame = 71938, Dying time = 474.47 hours
Processing worm_6...
  worm_6: Dying frame = 10568.0 of Segment = 5, Absolute frame = 64528, Dying time = 426.35 hours
Processing worm_7...
  worm_7: Dying frame = 10537.0 of Segment = 6, Absolute frame = 69955, Dying time = 462.36 hours
Processing worm_8...
  worm_8: Dying frame = 9002.0 of Segment = 6, Absolute frame = 70419, Dying time = 468.12 hours
Processing worm_9...
  worm_9: Dying frame = 9293.0 o

In [None]:
# Sanity check: display one worm after truncation at its estimated death
worm_name = 'worm_3'  # Name of the worm to inspect
print(f"Worm: {worm_name}")
worm_data = processed_worms[worm_name]
df = pd.DataFrame(
    data=worm_data.T,
    columns=['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category'],
)

# Frame counts noted by the author for a movement threshold of 1.0:
# loading data = 64794 --> after removing NaNs = 64533 --> after removing dead frames = 62175
# NOTE(review): these counts do not match the worm_3 sizes printed earlier in this
# notebook (53995 frames loaded, 52787 after NaN removal) - re-run and update them.
df

In [None]:
# Plot the empirical survival curve of the worms for which a death was detected.
# Worms without a detected death are not in `dying_times` and are therefore not counted.
dying_times_sorted = sorted(dying_times)  # Sort the dying times in ascending order
n_deaths = len(dying_times_sorted)

# Fraction of worms still alive right after each death. Ranks start at 1 so the curve
# reaches 0 when the last worm dies; counting from 0 reported 100 % survival at the
# first death and never went below 1/n.
survival_rate = [1 - (rank / n_deaths) for rank in range(1, n_deaths + 1)]

# Start at (0 h, 100 %) and draw a step function: survival only changes at the instant
# of a death, it does not decrease linearly between two deaths.
plt.figure(figsize=(8, 5))
plt.step(
    [0] + dying_times_sorted,
    [1.0] + survival_rate,
    where='post',
    marker='o',
    linestyle='-',
    color='blue',
)
plt.xlabel('Dying Time (Hours)')
plt.ylabel('Survival Rate')
plt.title('Survival Curve')
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
# Visualization: number of frames per worm before and after truncation at death.

# Build both series from the same list of worm names so that every pair of bars is
# guaranteed to describe the same worm. Iterating the two dicts independently only
# lined up as long as both happened to share the same keys in the same order.
worm_ids = list(worms.keys())
original_lengths = [worms[name].shape[1] for name in worm_ids]
truncated_lengths = [processed_worms[name].shape[1] for name in worm_ids]

# Plot the data: the truncated bars are drawn over the original ones
plt.figure(figsize=(10, 6))
plt.bar(worm_ids, original_lengths, label='Original Lifespan', alpha=0.7)
plt.bar(worm_ids, truncated_lengths, label='Truncated Lifespan', alpha=0.7)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Worms')
plt.ylabel('Number of Frames')
plt.title('Original vs Truncated Lifespan for Each Worm')
plt.legend()
plt.tight_layout()
plt.show()

# Step 3 : Standardization
> Standardization ensures that all features are on the same scale, typically with a mean of 0 and a standard deviation of 1. This helps models converge faster and makes them less sensitive to the scale of input features.
> We should avoid standardizing categorical columns.

In [None]:
# Pick up any edits made to helpers.py before helpers.standardization is called below
importlib.reload(helpers)

In [18]:
# Indices of the features to standardize: 1 = Speed, 2 = X, 3 = Y, 4 = Changed Pixels.
# The frame number (index 0) and the category (index 5) are deliberately left out.
feature_columns = list(range(1, 5))

# Apply the standardization helper (described by the author as per-worm)
standardized_worms = helpers.standardization(
    processed_worms,
    feature_columns,
)

In [None]:
# Inspect one worm after standardization
worm_3_data = standardized_worms['worm_3']

# One table row per frame: the stored array is (feature, frame), hence the transpose
columns = ['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category']
df_worm_3 = pd.DataFrame(
    data=worm_3_data.T,
    columns=columns,
)

df_worm_3

# Step 4 : Splitting the Data
> Split the worms into a training set and a test set (whole worms are assigned to one set or the other).

In [None]:
# Assign whole worms to either the training or the test set (20 % requested for test).
# NOTE(review): no random seed is visible here; if split_worms shuffles, the split may
# differ from run to run - confirm in load_data.
train_worms, test_worms = load_data.split_worms(standardized_worms, test_size=0.2)

# List which worms ended up in each set
for label, subset in (("Training", train_worms), ("Testing", test_worms)):
    print(f"{label} Worms: {list(subset.keys())}")

# Step 5 : Load only early Lifespan for train set and keep the whole lifespan for test (validation set)
> Now we keep only the early portion of each training worm's lifespan (set by `data_fraction`); the test worms keep their whole lifespan.

In [None]:
# Pick up any edits made to load_data.py before load_earlylifespan is called below
importlib.reload(load_data)

In [None]:
# Fraction handed to load_earlylifespan for the training worms
# (presumably the share of each lifespan, counted from the start, that is kept - confirm in load_data)
data_fraction = 0.4

# Only the training worms are reduced; test_worms is left exactly as returned by the split
early_train_worms = load_data.load_earlylifespan(train_worms, data_fraction)

In [None]:
# Inspect one training worm after the early-lifespan cut.
# 'worm_3' is only available when the split placed it in the training set, so fall back
# to the first training worm instead of failing with a KeyError.
preferred_worm = 'worm_3'
if preferred_worm in early_train_worms:
    inspected_worm = preferred_worm
else:
    inspected_worm = next(iter(early_train_worms))
print(f"Worm: {inspected_worm}")
worm_3_data = early_train_worms[inspected_worm]  # variable name kept from the original cell

# Convert to a DataFrame for easier inspection
columns = ['Frame', 'Speed', 'X', 'Y', 'Changed Pixels', 'Category']
df_worm_3 = pd.DataFrame(worm_3_data.T, columns=columns)  # Transpose for proper orientation

df_worm_3

# Step 6 : Prepare data for Linear Models