## Initialization

In [None]:
### Set user-defined parameters for analysis ###
# Path to the JSON settings file. It is loaded further down to obtain the
# per-condition track folders, the output folders and the MSD analysis settings.
settings_file = '../data/processed/vVB_210528_TrackMateOut/Analysis_settings/20210528_msd_analysis.json'

# Plotting and figure saving params
# NOTE(review): plot_figs, plot_all_tracks and manual_correlation_override are
# not read anywhere in the visible part of this notebook - confirm they are still needed.
plot_figs = True
plot_all_tracks = False # Not recommended: leave this disabled
# When True, the figure output folders are created and the 'save_large' plotting
# preset is loaded; the figures themselves are not written automatically yet.
save_figs = False # Code for saving data still needs work. I've just been manually saving them as they pop up.
# When True, the filtered-data output folder is created.
save_data = False
manual_correlation_override = False

# Script executed with %run to load the plotting style
plot_settings = '../src/plotting_settings.py'
# Root folder for figures; the 'save_dir_reports' entry of the settings file is appended to it
save_dir_reports = '../reports/figures/'

In [None]:
# Load modules
%load_ext autoreload
%autoreload 2

import os, sys, inspect

# Add source code directory to path to enable module import
module_dir = '../src'
os.sys.path.insert(0, module_dir)

import matplotlib
import matplotlib.pylab as plt
import numpy as np
import math
from scipy import stats
import random
import glob
import json
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import parse_trackmate as pt
import diffusion as dif



In [None]:
# Load the json settings file
with open(settings_file) as settings_fh:
    json_data = json.load(settings_fh)

# Folder entries in the settings are relative to the settings file itself,
# so resolve everything against its (absolute) parent directory.
base_dir = os.path.abspath(os.path.split(settings_file)[0])

# Build lists of folders containing tracks for each condition
conditions = {}
for condition, folders in json_data['conditions'].items():
    if type(folders) is not list:
        folders = [folders]  # a condition may list a single folder as a bare value
    conditions[condition] = [os.path.normpath(os.path.join(base_dir, folder))
                             for folder in folders]

# Build paths for output files
save_dir = os.path.normpath(
    os.path.join(save_dir_reports, json_data['save_dir_reports']))
save_dir_data = os.path.normpath(
    os.path.join(base_dir, json_data['save_dir_filt_data']))

# Load the analysis settings
diff_dim = json_data['diff_dim']
min_averages_for_msd = json_data['min_averages_for_msd']
dc_fit_nframes = json_data['dc_fit_nframes']

print(diff_dim, min_averages_for_msd, dc_fit_nframes)

In [None]:
# Set up figure save dirs and load plotting style
if save_figs:
    %matplotlib
    %run $plot_settings save_large
    
    # Make directories for saving figures
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    dir_sum_figs = os.path.join(save_dir, 'summary_figures')
    if not os.path.exists(dir_sum_figs):
        os.makedirs(dir_sum_figs)
    
    dir_examples = os.path.join(save_dir, 'examples')
    if not os.path.exists(dir_examples):
        os.makedirs(dir_examples)

else:
    %matplotlib
    %run $plot_settings plot_only
    
if save_data:
    if not os.path.exists(save_dir_data):
        os.makedirs(save_dir_data)

## Set up directories/locations for your movies

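<b>Note:</b> If the cell below raises an error that mentions <code>.str</code> (for example, pandas complaining about the <code>.str</code> accessor), the most likely cause is that no TrackMate <code>.xml</code> files were found. Check the directories specified in the .json settings file for typos.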

In [None]:
### Parse track data from TrackMate xml files into Pandas dataframe ###

def read_data(data_dirs):
    """Find and parse all TrackMate XML files under one or more directories.

    Parameters
    ----------
    data_dirs : str, os.PathLike, or iterable of those
        A single directory or a collection of directories. Each one is
        searched recursively for ``*.xml`` files.

    Returns
    -------
    pandas.DataFrame
        One row per XML file, with columns:
        ``file_name`` (base name without extension), ``file_path``,
        ``color`` (first two characters of ``file_name``),
        ``movie_ID`` (``file_name`` from the fourth character onwards) and
        ``parsed`` (the first of the three values returned by
        ``pt.parse_trackmate_file`` for that file).
    """
    # Accept a single directory as well as any iterable of directories
    if isinstance(data_dirs, (str, os.PathLike)):
        data_dirs = [data_dirs]
    else:
        data_dirs = list(data_dirs)

    data_files = []
    for data_dir in data_dirs:
        pattern = os.path.join(data_dir, '**/*.xml')
        data_files.extend(sorted(glob.glob(pattern, recursive=True)))
    file_names = [os.path.splitext(os.path.basename(path))[0]
                  for path in data_files]

    if not data_files:
        # An empty result usually points at a typo in the configured folders
        print("Warning: no TrackMate .xml files found in:", data_dirs)

    # dtype=object keeps the .str accessor usable even when no files were found
    data = pd.DataFrame({'file_name' : file_names, 'file_path' : data_files},
                        dtype=object)
    data['color'] = data['file_name'].str.slice(0,2)
    data['movie_ID'] = data['file_name'].str.slice(3,)

    # Parse data
    # No extra numeric spot attributes are requested from the parser.
    spot_num_attr = []
    # Explicit object dtype so each cell can hold an arbitrary parsed object
    data['parsed'] = pd.Series(index=data.index, dtype=object)
    for idx in data.index:
        parsed, _, _ = pt.parse_trackmate_file(data.at[idx, 'file_path'],
                                               spot_num_attr=spot_num_attr)
        data.at[idx, 'parsed'] = parsed

    return data

# Parse the track files of every condition into its own dataframe
data_parsed = {}
for condition, condition_dirs in conditions.items():
    print("Now processing condition: " + condition)
    data_parsed[condition] = read_data(condition_dirs)
print('Done')

In [None]:
# Defines a function that can take in the movies and return track trajectories in the form of coordinates within
# pandas dataframe corresponding to each channel.

def get_movie_data_both_channels(data, movie_ID):
    """Return the parsed track data of both color channels for one movie.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with the columns 'movie_ID', 'color', 'parsed', 'file_name'
        and 'file_path' (one row per track file).
    movie_ID : object
        Value matched against the 'movie_ID' column.

    Returns
    -------
    tuple
        ``(df_C1, df_C2, names_paths)``: the 'parsed' entries of the first
        matching 'C1' and 'C2' rows, and a dict with the keys 'name_C1',
        'path_C1', 'name_C2' and 'path_C2'.

    Raises
    ------
    IndexError
        If the movie has no row for one of the two channels.
    """
    print("Processing movie:", movie_ID)

    is_movie = data['movie_ID'] == movie_ID

    parsed = {}
    names_paths = {}
    for color in ('C1', 'C2'):
        rows = data[is_movie & (data['color'] == color)]
        if rows.empty:
            raise IndexError("No '%s' track file found for movie %r"
                             % (color, movie_ID))
        # If several files match, the first one is used
        parsed[color] = rows['parsed'].iat[0]
        names_paths['name_' + color] = rows['file_name'].iat[0]
        names_paths['path_' + color] = rows['file_path'].iat[0]

    return parsed['C1'], parsed['C2'], names_paths

In [None]:
# Define a function to pool the tracks in a channel
frame_interval = 0.05 # Change this according to needs
def pool_tracks(channel_dat):
    """Split one channel's spot table into a list of per-track coordinate arrays.

    Parameters
    ----------
    channel_dat : pandas.DataFrame
        Table with the columns 'track_ID', 'x', 'y' and 't'.

    Returns
    -------
    list of numpy.ndarray
        Element ``i`` holds the ``[x, y, t]`` rows of the track whose
        'track_ID' equals ``i``, for ``i`` from 0 up to and including the
        largest track ID. IDs with no rows give an array with zero rows.
        An empty table gives an empty list.
    """
    cols = ['x', 'y', 't']
    track_ids = channel_dat['track_ID']
    if len(track_ids) == 0:
        return []

    # +1 so that the track with the highest ID is included as well
    n_slots = int(track_ids.max()) + 1
    return [channel_dat.loc[track_ids == track_id, cols].to_numpy()
            for track_id in range(n_slots)]

# Define function to calculate MSD for each track and fit to a
# diffusion coefficient.

def calculate_msd(tracks, frame_interval):
    """Fit a diffusion constant to every sufficiently long track.

    Each track is cut into consecutive, non-overlapping chunks of
    ``dc_fit_nframes`` rows. The chunks are passed to ``dif.calc_msd`` and its
    MSD output is fitted with ``dif.fit_diffusion_const``. Tracks with fewer
    than ``dc_fit_nframes * min_averages_for_msd`` rows are skipped.

    Reads the notebook-level settings ``dc_fit_nframes``,
    ``min_averages_for_msd`` and ``diff_dim``.

    Parameters
    ----------
    tracks : sequence of numpy.ndarray
        One 2-D array per track, with one row per time point.
    frame_interval : float
        Passed through to ``dif.calc_msd``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Track_ID' (position of the track within ``tracks``) and
        'Diff_const' (the 'dc' entry of the fit result). The columns are
        present even when no track passes the length filter.
    """
    records = []
    for track_idx, track in enumerate(tracks):
        len_track = np.shape(track)[0]
        if len_track < (dc_fit_nframes * min_averages_for_msd):
            continue

        # Break up the track into sub-trajectories for MSD calc'n;
        # leftover rows that do not fill a whole chunk are dropped.
        n_chunks = len_track // dc_fit_nframes
        track_chunks = [
            track[k * dc_fit_nframes:(k + 1) * dc_fit_nframes, :]
            for k in range(n_chunks)
        ]

        # Get the data
        _, msd_data = dif.calc_msd(track_chunks, frame_interval)

        # Fit to get the diffusion coefficient
        fit_params = dif.fit_diffusion_const(msd_data, dim = diff_dim,
                                            nframes = dc_fit_nframes)

        # Label with the track's own index, not the chunk-loop index
        records.append({'Track_ID': track_idx,
                        'Diff_const': fit_params['dc']})

    data = pd.DataFrame(records, columns=['Track_ID', 'Diff_const'])
    if data.empty:
        # Give the empty frame numeric dtypes so it concatenates cleanly
        data = data.astype({'Track_ID': 'int64', 'Diff_const': 'float64'})

    return data

In [None]:
# Define a function for calculating the MSD for particles in 
# each channel for each movie in each condition

def get_condition_msd(all_data, condition):
    """Compute per-track diffusion results for every channel of every movie.

    Parameters
    ----------
    all_data : dict
        Maps condition names to a list of movies; each movie is an iterable
        of per-channel spot tables.
    condition : hashable
        Key of the condition to process.

    Returns
    -------
    list of list
        One inner list per movie, holding the ``calculate_msd`` result of
        each channel in order.
    """
    condition_msd = []
    for movie_number, movie in enumerate(all_data[condition], start=1):
        print('Now processing movie:', str(movie_number))
        movie_msd = []
        for channel_number, channel in enumerate(movie, start=1):
            print('Now processing channel:', str(channel_number))
            pooled = pool_tracks(channel)
            movie_msd.append(calculate_msd(pooled, frame_interval))
        condition_msd.append(movie_msd)
    print('Processing finished!')
    return condition_msd

## Data formatting and MSD calculation

In [None]:
# Reorganise the parsed data into a dictionary keyed by condition. Each value is a
# list with one [C1 data, C2 data] pair per movie of that condition.
all_dat = {}
for condition in conditions:
    condition_dat = data_parsed[condition]
    all_dat[condition] = []
    for movie_id in condition_dat['movie_ID'].unique():
        # names_paths (file names/paths of both channels) is not used further here
        df_C1, df_C2, names_paths = get_movie_data_both_channels(
            condition_dat, movie_id)
        all_dat[condition] += [[df_C1, df_C2]]
        

In [None]:
# As in the previous cell, the result is a dictionary keyed by condition. Each value
# is the nested list (movies -> channels) returned by get_condition_msd for that
# condition. This cell can take a while, especially when datasets are pooled.

condition_msds = {}
for condition in conditions:
    print('Now processing:', condition)
    msds_for_condition = get_condition_msd(all_dat, condition)
    condition_msds[condition] = msds_for_condition
print('Really finished')

In [None]:
# Appends the "condition" column to the current dataframe to allow for subsequent subsetting/slicing of data
def append_condition(condition_channel_df, condition, channel):
    """Label every row of a dataframe with its condition and channel color.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    condition_channel_df : pandas.DataFrame
        Table to label.
    condition : str
        Value written to the new 'Condition' column.
    channel : int
        Channel index: 0 is labelled 'Green', 1 is labelled 'Red'.

    Returns
    -------
    pandas.DataFrame
        The same object as ``condition_channel_df``, with the 'Condition' and
        'Channel' columns set.

    Raises
    ------
    ValueError
        If ``channel`` is neither 0 nor 1.
    """
    channel_names = {0: 'Green', 1: 'Red'}
    if channel not in channel_names:
        raise ValueError("Unknown channel index %r; expected 0 (Green) or 1 (Red)"
                         % (channel,))

    # Scalar assignment broadcasts the label to every row
    condition_channel_df['Condition'] = condition
    condition_channel_df['Channel'] = channel_names[channel]
    return condition_channel_df

In [None]:
# Optional code block for cleaner labels. Used for plotting
# These labels are matched to the entries of `conditions` purely by position
# (in iteration order) when the plotting dataframe is built, so the list must
# have at least as many entries as there are conditions, in the same order.
conditions_1 = ['1x Halo', '2x Halo', 'No Stress', '0.025 mM PA', '0.10 mM PA', '0.50 mM PA', '1.00 mM PA']

## Data formatting

In [None]:
# Code block for making a Pandas dataframe suitable for plotting violin plots.
# The labelled per-channel frames are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies all accumulated rows on every iteration.
labelled_frames = []
for condition_idx, condition in enumerate(conditions):
    for movie in condition_msds[condition]:
        for channel, channel_df in enumerate(movie):
            # Plot labels are taken from conditions_1 by position
            labelled_frames.append(append_condition(channel_df,
                                                    conditions_1[condition_idx],
                                                    channel))

# pd.concat raises on an empty list, so fall back to an empty frame
master_df = pd.concat(labelled_frames) if labelled_frames else pd.DataFrame()

In [None]:
# Optional code block for seeing the diffusion constants in natural log scale.
# Computed in one vectorized call instead of a Python loop over iterrows().
# The values are taken as a plain array so the result is assigned by position,
# which is safe even though master_df may contain repeated index labels.
master_df['log_DC'] = np.log(master_df['Diff_const'].to_numpy(dtype=float))

## Data Visualization

In [None]:
# Violin plot of the diffusion constants per condition, with each violin
# split into its green- and red-channel halves
split_violin_kwargs = dict(
    x = 'Condition',
    y = 'Diff_const',
    data = master_df,
    hue = master_df.Channel,
    split = True,
    cut = 0,
    palette = ['green', 'red'],
)
ax = sns.violinplot(**split_violin_kwargs)
ax.set(title = 'Diffusion Constants of IRE1 Molecules in CL-VB-69.4 Cells',
       ylabel = 'Diffusion Constant',
       xlabel = 'Conditions')
# Dashed horizontal reference line at 0.1
ax.axhline(0.1, linestyle = '--', color = 'black')

plt.show()

In [None]:
# Differences across conditions, regardless of spot channel

fig2 = plt.figure()

# NOTE(review): the title mentions the green channel only, but the plot uses
# every row of master_df - confirm which one is intended.
ax = sns.violinplot(data = master_df,
                    x = 'Condition',
                    y = 'Diff_const',
                    color = 'steelblue',
                    cut = 0)
ax.set(title = 'MSD of Correlated Tracks in Green Channel',
       ylabel = 'Diffusion Constant',
       xlabel = 'Conditions')
# Dashed horizontal reference line at 0.1
ax.axhline(0.1, linestyle = '--', color = 'black')


plt.show()

### To-do: write a script somewhere that can output filtered TrackMate files depending on some minimal or maximal value of MSD.