<center> <H1> Functions for Granule distribution analysis </H1> </center>



                        *Code written by Timo Rey. Laboratory of Experimental Biophysics, EPFL*

                                            *Created during revisions in 2019/20*



#### Aims:
    Provide most of the functions needed to analyse the distribution of granules inside mitochondria and to compare it with a random distribution. Collecting them here keeps the analysis workbook clean.
    
#### Use:
    This code can be called from other (jupyter notebook) python scripts to use the functions in further analysis.
#### Libraries:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
import pprint
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import itertools
from scipy import stats

### 1) Functions for finding & opening the files

In [None]:
# find all csv-files whose name contains a specific tag:
def FindData(directory, extension, random = False):
    """Recursively collect the csv-files below `directory` whose name contains `extension`.

    Parameters
    ----------
    directory : str or Path
        Root folder; all of its sub-folders are searched as well.
    extension : str
        Text that has to occur in the file name in front of the '.csv' ending
        (glob pattern: '**/*' + extension + '*.csv').
    random : optional
        Verbosity switch (despite its name): any value that does not compare
        equal to False makes the function print the number of files found and
        every path together with its position in the returned list.

    Returns
    -------
    list of Path
        The matching files, sorted.
    """
    pattern = '**/*' + extension + '*.csv'
    csvList = sorted(Path(directory).glob(pattern))                  # sorted() also turns the glob-generator into a list

    verbose = random != False
    if verbose:
        print("There are %r files with the extension %r." %(len(csvList), extension))
        print("\nThey are:\n")
        numbered = [(path, position) for position, path in enumerate(csvList)]
        pprint.PrettyPrinter(indent = 1).pprint(numbered)

    return csvList
#MitoData = FindData('MRGs', '_Mito', True)                          # unhash to call this function

In [None]:
# Read one csv-file out of a list of files:
def OpenCsvFromList(csvList, Parser):
    """Load a single comma-separated file into a dataframe.

    Parameters
    ----------
    csvList : sequence of str or Path
        Candidate files, e.g. the list returned by FindData.
    Parser : int
        Position within `csvList` of the file to open.

    Returns
    -------
    pandas.DataFrame
        Content of the selected file.
    """
    csv_path = str(csvList[Parser])
    with open(csv_path, 'r') as handle:
        return pd.read_csv(handle, sep = ',')
#mito_df = OpenCsvFromList(MitoData, Parser)                         # unhash to call this function

In [None]:
# Combine the content of all listed csv-files into one dataframe:
def CombineData(csvList):
    """Read every file in `csvList` and stack the rows into a single dataframe.

    Parameters
    ----------
    csvList : sequence of str or Path
        Files to read, e.g. the list returned by FindData. Each one is opened
        with OpenCsvFromList.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, in list order, with a fresh 0..n-1 index.
        An empty dataframe if `csvList` is empty.
    """
    # Read everything first and concatenate once: calling pd.concat inside the
    # loop copies the growing dataframe again for every file.
    frames = [OpenCsvFromList(csvList, Parser) for Parser in range(len(csvList))]

    if not frames:                                                   # pd.concat refuses an empty list
        return pd.DataFrame()

    return pd.concat(frames, ignore_index = True)                    # returns data-frame
#mito_df = CombineData(MitoData)                                     # unhash to call this function

In [None]:
# Combine the content of all listed csv-files into one dataframe:
def CombineData(csvList):
    """Read every file in `csvList` and stack the rows into a single dataframe.

    NOTE(review): this cell defines CombineData a second time (the previous
    cell holds the same function). When the notebook is run top to bottom this
    later definition is the one in effect; consider removing one of the two.

    Parameters
    ----------
    csvList : sequence of str or Path
        Files to read; each one is opened with OpenCsvFromList.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, in list order, with a fresh 0..n-1 index.
        An empty dataframe if `csvList` is empty.
    """
    frames = []
    for Parser in range(len(csvList)):
        frames.append(OpenCsvFromList(csvList, Parser))

    # A single concatenation avoids re-copying the accumulated rows per file.
    if frames:
        return pd.concat(frames, ignore_index = True)               # returns data-frame
    return pd.DataFrame()                                           # nothing to combine
#mito_df = CombineData(MitoData)                                    # unhash to call this function

In [None]:
# keep the rows in which a column equals a value & reduce them to four fixed columns:
def FilterIsValue(df, filter_by ,value):
    """Select the rows where `df[filter_by] == value`.

    Only the columns 'IMAGE', 'NAME', 'SHAPE.length' and 'MAXIMA' are kept, in
    that order; the original row index is preserved.
    """
    kept_columns = ['IMAGE', 'NAME', 'SHAPE.length', 'MAXIMA']
    is_match = df[filter_by] == value
    return df.loc[is_match, kept_columns]

In [None]:
# keep the rows in which a column is strictly bigger than a value (all columns are kept):
def FilterBiggerThan(df, filter_by ,value):
    """Return the rows of `df` where `df[filter_by] > value`; columns and index are untouched."""
    is_bigger = df[filter_by] > value
    return df.loc[is_bigger]

In [None]:
# keep the rows in which a column is strictly smaller than a value (all columns are kept):
def FilterSmallerThan(df, filter_by ,value):
    """Return the rows of `df` where `df[filter_by] < value`; columns and index are untouched."""
    is_smaller = df[filter_by] < value
    return df.loc[is_smaller]

In [None]:
# find children & extract their positions:
def FindPositions(df_box, df_particles):
    """ Warning: this function is very sensitive to format of input files (the order of columns)!

    For every row of `df_box` (one mitochondrion) collect the positions of the
    particles that belong to it.

    Columns are addressed by position, not by name:
      * df_box       - column 0: image, column 1: name of the box,
                       column 2: length, column 3: number of particles.
      * df_particles - column 1 is compared with the box image, column 7 with
                       the box name (PARENT.Name according to the original
                       comments), column 9 is collected as the position
                       (parent.distance.pole1 according to the original comments).

    Returns
    -------
    pandas.DataFrame
        One row per box with the columns 'IMAGE', 'Parent', 'Length',
        'Particles' and 'positions' (a list). Every row carries the index
        label 0, so reset the index before using label-based access.
        An empty dataframe without columns if `df_box` has no rows.
    """
    if len(df_box) == 0:
        return pd.DataFrame()                                       # nothing to look up

    # Walk through the particle table once and remember, for each particle, its
    # row *position* together with its parent name and image. Using the position
    # (instead of the index label) keeps the .iloc lookup below correct even if
    # df_particles does not have a 0..n-1 index.
    particle_keys = [(position, particle[8], particle[2])
                     for position, particle in enumerate(df_particles.itertuples())]

    rows = []
    for count in range(len(df_box)):                                # for each straight mitochondrion
        image  = df_box.iloc[count, 0]
        parent = df_box.iloc[count, 1]

        # positions of all particles whose parent name & image match this box:
        positions = [df_particles.iloc[position, 9]
                     for position, particle_parent, particle_image in particle_keys
                     if particle_parent == parent and particle_image == image]

        rows.append(pd.DataFrame([[image, parent, df_box.iloc[count,2], df_box.iloc[count,3], positions]], columns = ('IMAGE','Parent','Length','Particles', 'positions')))

    return pd.concat(rows)                                          # one concatenation instead of one per box

In [None]:
def LoadObservations(keys = ('_Mito', '_MRGs'), withMRGs = True, twoPoles = True):         # note: default values are overruled if alternatives are provided
    """Load the observed mitochondria and attach the positions of their granules.

    The files are searched below the module-level variable `input_directory`,
    which is not defined in this function and has to exist before the call.

    Parameters
    ----------
    keys : sequence of two str
        File-name tags handed to FindData: keys[0] selects the mitochondria
        files, keys[1] the granule files.
    withMRGs : bool
        If true, only mitochondria with 'MAXIMA' > 0 are kept.
    twoPoles : bool
        Has to be true: only mitochondria with 'SHAPE.pole' == 2 are supported.

    Returns
    -------
    pandas.DataFrame
        Result of FindPositions with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `twoPoles` is false. Previously the message was only printed and the
        function then failed on the undefined `straightMito`.
    """
    if not twoPoles:
        # Checked before any file is read, so the unsupported request fails fast.
        raise ValueError("ERROR: If you want to analyse mitochondria with more than two poles, please revise workflow.")

    # Find data with particular ending given as keys:
    mitoData          = FindData(input_directory, str(keys[0]))            # can add additional argument 'True' to see list
    GranuleData       = FindData(input_directory, str(keys[1]))

    # Return 1 dataframe for each type of input-data:
    Mito_df           = CombineData(mitoData)
    Granule_df        = CombineData(GranuleData)

    # Filter for mitochondria with only 2 poles:
    straightMito      = FilterIsValue(Mito_df, 'SHAPE.pole', 2)            # variables needed: input df, criterion (column name), value to filter by

    if withMRGs:
        # Filter for mitochondria with MRGs:
        candidates    = FilterBiggerThan(straightMito, 'MAXIMA' , 0)
    else:
        candidates    = straightMito

    # Find positions of MRGs within the selected mitochondria:
    Observed_dist     = FindPositions(candidates, Granule_df)              # need box_df & particle_df
    Observed_dist     = Observed_dist.reset_index(drop = True)

    return Observed_dist

In [None]:
def RelevantObservations(keys = ['_Mito', '_MRGs'], withMRGs = True,):
    """Load the observations and discard every mitochondrion with a negative position.

    `keys` and `withMRGs` are passed on to LoadObservations. A row is removed
    as soon as one entry of its 'positions' list is below zero. The returned
    dataframe has a fresh 0..n-1 index.
    """
    # Find & load experimental data:
    observations = LoadObservations(keys, withMRGs)

    # Rows that contain at least one negative position:
    flagged = [row for row in range(len(observations))
               if any(position < 0 for position in observations.loc[row, 'positions'])]

    # Remove those mitochondria & re-number the remaining ones:
    return observations.drop(flagged).reset_index(drop = True)

### 2) Functions to create simulations

In [None]:
def sim_creator(observed_object):
    """Create one RandomSimulation per observation object.

    Each simulation is named 'test_ID_<position>' and receives the `.Box` and
    `.Particles` attributes of the observation at the same position.

    NOTE(review): `.Box` is wrapped in a list here, while observation_creator
    already passes Box as a one-element list - confirm that RandomSimulation
    expects the nested form.

    Returns the list of simulation objects, in input order.
    """
    return [
        RandomSimulation(Name='test_ID_'+str(idx),
                         Box=[observed_object[idx].Box],
                         Particles=observed_object[idx].Particles)
        for idx in range(len(observed_object))
    ]

In [None]:
# Potentially retired function below:

In [None]:
# build one simulation-object per observed mitochondrion (the observation provides the constraints):
def simulation_creator(Observations_df):
    """Create one RandomSimulation per row of `Observations_df`.

    Rows are addressed by the labels 0..len-1, so the dataframe needs a default
    index. The 'Length' value (wrapped in a list) becomes the Box and the
    'Particles' value the number of particles of the simulation named
    'test_ID_<row>'.

    Returns the list of simulation objects, in row order.
    """
    return [
        RandomSimulation(Name='test_ID_'+str(row),
                         Box=[Observations_df.loc[row, 'Length']],          # mitochondrial length
                         Particles=Observations_df.loc[row, 'Particles'])   # number of particles
        for row in range(len(Observations_df))
    ]

### 3) Functions to create observed objects

In [None]:
# wrap every observed mitochondrion into its own observation-object:
def observation_creator(Observations_df):
    """Create one SingleObservation per row of `Observations_df`.

    Rows are addressed by the labels 0..len-1, so the dataframe needs a default
    index. The columns 'Length', 'IMAGE', 'Parent', 'Particles' and 'positions'
    are read; 'Length' is wrapped in a list and handed over as Box.

    Returns the list of observation objects, in row order.
    """
    collected = []

    for row in range(len(Observations_df)):
        fields = {
            'Name':        'mito_ID_'+str(row),
            'Box':         [Observations_df.loc[row, 'Length']],
            'Particles':   Observations_df.loc[row, 'Particles'],
            'Positions':   Observations_df.loc[row, 'positions'],
            'Image_ID':    Observations_df.loc[row, 'IMAGE'],
            'Parent_Name': Observations_df.loc[row, 'Parent'],
        }
        collected.append(SingleObservation(**fields))

    return collected

### 4) Functions to compare observations with simulations

In [1]:
# Fraction of distances that lie below the cut-off:
def NN_ratio(data, cut_off):
    """Return the share of entries in `data` that are strictly smaller than `cut_off`.

    `data` has to support iteration and len(); an empty `data` raises a
    ZeroDivisionError.
    """
    below = sum(1 for distance in data if distance < cut_off)
    return below/len(data)

In [None]:
# histogram of all particle positions:
def particle_distribution(data, data_title, bins=None , c='magenta'):
    """Draw a histogram of `data` in a new figure.

    `data` is flattened to one dimension first. `data_title` is appended to the
    figure title, `bins` and `c` (colour) are passed on to the plot call.
    Nothing is returned.

    NOTE(review): `sns` is not imported in the visible import cell - presumably
    seaborn; confirm it is imported before this function is called. Also,
    `distplot` is deprecated in recent seaborn releases - verify against the
    installed version.
    """
    plt.figure()
    plt.title("Particle distribution of " + data_title)

    flattened = np.reshape(data, -1)                                # 1-D view of all values
    sns.distplot(flattened, color = c, bins=bins, kde=False)       # kde=False: bars only

### 5) Functions to make plots

https://stackoverflow.com/questions/36578458/how-does-one-insert-statistical-annotations-stars-or-p-values-into-matplotlib

In [None]:
def plotCorr(df, value = 'Length'):
    """Plot the number of particles against a (rounded) mitochondrial property.

    Draws a box-plot of df['Particles'] grouped by the rounded df[value],
    overlays a first-degree polynomial fit evaluated at 10 points between 0
    and 13, writes the correlation coefficient into the axes and prints it.

    NOTE(review): `sns` is not imported in the visible import cell - presumably
    seaborn; confirm it is imported before this function is called.

    Returns the matplotlib figure.
    """
    particles = df['Particles']
    measure   = df[value]

    fig, ax = plt.subplots(figsize = (12,5))

    # data-distribution, with the x-values binned by rounding:
    sns.boxplot(x = round(measure), y = particles, color = 'grey', linewidth = 3)
    ax.set(xlabel = "Mitochondrial "+ value + " in [um]", ylabel = "# MRGs")
    ax.tick_params(left = False, bottom = False)

    # linear regression on the un-rounded values:
    line   = np.poly1d(np.polyfit(measure, particles, deg=1))
    x_grid = np.linspace(0, 13, 10)
    ax.plot(x_grid, line(x_grid), '--', color = 'r')

    # correlation coefficient (off-diagonal element of the 2x2 matrix):
    coefficient = np.corrcoef(measure, particles)[0,1]
    ax.text(0,6.5, "corrCoeff: " + str(round(coefficient, 3)), fontsize=12)
    print('Correlation Coefficient between mitochondrial '+value+' and # MRGs = ', coefficient)

    return fig